[llvm] [SPIR-V] Fix generation of invalid SPIR-V in cases of bitcasts between pointers and null pointers are used in the input LLVM IR (PR #118298)

Vyacheslav Levytskyy via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 2 06:42:00 PST 2024


https://github.com/VyacheslavLevytskyy updated https://github.com/llvm/llvm-project/pull/118298

>From cf257d482b1f332628d4cda83d8e1498534d7f3a Mon Sep 17 00:00:00 2001
From: "Levytskyy, Vyacheslav" <vyacheslav.levytskyy at intel.com>
Date: Mon, 2 Dec 2024 06:22:31 -0800
Subject: [PATCH 1/2] fix intertwined load/store/function call LLVM IR input
 with bitcasts inserted between instruction uses

---
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp |   63 +-
 .../OpPhi_ArgumentsPlaceholders.ll            |   11 +-
 .../validate/sycl-tangle-group-algorithms.ll  | 4673 +++++++++++++++++
 3 files changed, 4724 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index e6f136cc81b4b4..82d354a3e3a228 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -475,7 +475,7 @@ void SPIRVEmitIntrinsics::propagateElemType(
   DenseMap<Function *, CallInst *> Ptrcasts;
   SmallVector<User *> Users(Op->users());
   for (auto *U : Users) {
-    if (!isa<Instruction>(U) || isa<BitCastInst>(U) || isSpvIntrinsic(U))
+    if (!isa<Instruction>(U) || isSpvIntrinsic(U))
       continue;
     if (!VisitedSubst.insert(std::make_pair(U, Op)).second)
       continue;
@@ -506,7 +506,7 @@ void SPIRVEmitIntrinsics::propagateElemTypeRec(
     return;
   SmallVector<User *> Users(Op->users());
   for (auto *U : Users) {
-    if (!isa<Instruction>(U) || isa<BitCastInst>(U) || isSpvIntrinsic(U))
+    if (!isa<Instruction>(U) || isSpvIntrinsic(U))
       continue;
     if (!VisitedSubst.insert(std::make_pair(U, Op)).second)
       continue;
@@ -958,6 +958,14 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(
       return;
     Uncomplete = isTodoType(I);
     Ops.push_back(std::make_pair(Ref->getPointerOperand(), 0));
+  } else if (auto *Ref = dyn_cast<BitCastInst>(I)) {
+    if (!isPointerTy(I->getType()))
+      return;
+    KnownElemTy = GR->findDeducedElementType(I);
+    if (!KnownElemTy)
+      return;
+    Uncomplete = isTodoType(I);
+    Ops.push_back(std::make_pair(Ref->getOperand(0), 0));
   } else if (auto *Ref = dyn_cast<GetElementPtrInst>(I)) {
     if (GR->findDeducedElementType(Ref->getPointerOperand()))
       return;
@@ -1030,7 +1038,6 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(
             }
           }
         }
-        TypeValidated.insert(I);
         // Non-recursive update of types in the function uncomplete returns.
         // This may happen just once per a function, the latch is a pair of
         // findDeducedElementType(F) / addDeducedElementType(F, ...).
@@ -1043,6 +1050,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(
       } else if (UncompleteRets) {
         UncompleteRets->insert(I);
       }
+      TypeValidated.insert(I);
       return;
     }
     Uncomplete = isTodoType(CurrF);
@@ -1369,10 +1377,6 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
     Instruction *I, Value *Pointer, Type *ExpectedElementType,
     unsigned OperandToReplace, IRBuilder<> &B) {
   TypeValidated.insert(I);
-  // If Pointer is the result of nop BitCastInst (ptr -> ptr), use the source
-  // pointer instead. The BitCastInst should be later removed when visited.
-  while (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer))
-    Pointer = BC->getOperand(0);
 
   // Do not emit spv_ptrcast if Pointer's element type is ExpectedElementType
   Type *PointerElemTy = deduceElementTypeHelper(Pointer, false);
@@ -1759,8 +1763,7 @@ bool SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
                                                    IRBuilder<> &B,
                                                    bool UnknownElemTypeI8) {
   reportFatalOnTokenType(I);
-  if (!isPointerTy(I->getType()) || !requireAssignType(I) ||
-      isa<BitCastInst>(I))
+  if (!isPointerTy(I->getType()) || !requireAssignType(I))
     return false;
 
   setInsertPointAfterDef(B, I);
@@ -1861,8 +1864,9 @@ void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I,
 void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
                                                  IRBuilder<> &B) {
   auto *II = dyn_cast<IntrinsicInst>(I);
-  if (II && II->getIntrinsicID() == Intrinsic::spv_const_composite &&
-      TrackConstants) {
+  bool IsConstComposite =
+      II && II->getIntrinsicID() == Intrinsic::spv_const_composite;
+  if (IsConstComposite && TrackConstants) {
     setInsertPointAfterDef(B, I);
     auto t = AggrConsts.find(I);
     assert(t != AggrConsts.end());
@@ -1886,12 +1890,27 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
               : B.SetInsertPoint(I);
         BPrepared = true;
       }
+      Type *OpTy = Op->getType();
       Value *OpTyVal = Op;
-      if (Op->getType()->isTargetExtTy())
-        OpTyVal = PoisonValue::get(Op->getType());
-      auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant,
-                                    {Op->getType(), OpTyVal->getType()}, Op,
-                                    OpTyVal, {}, B);
+      if (OpTy->isTargetExtTy())
+        OpTyVal = PoisonValue::get(OpTy);
+      CallInst *NewOp =
+          buildIntrWithMD(Intrinsic::spv_track_constant,
+                          {OpTy, OpTyVal->getType()}, Op, OpTyVal, {}, B);
+      Type *OpElemTy = nullptr;
+      if (!IsConstComposite && isPointerTy(OpTy) &&
+          (OpElemTy = GR->findDeducedElementType(Op)) != nullptr &&
+          OpElemTy != IntegerType::getInt8Ty(I->getContext())) {
+        buildAssignPtr(B, IntegerType::getInt8Ty(I->getContext()), NewOp);
+        SmallVector<Type *, 2> Types = {OpTy, OpTy};
+        SmallVector<Value *, 2> Args = {
+            NewOp, buildMD(PoisonValue::get(OpElemTy)),
+            B.getInt32(getPointerAddressSpace(OpTy))};
+        CallInst *PtrCasted =
+            B.CreateIntrinsic(Intrinsic::spv_ptrcast, {Types}, Args);
+        buildAssignPtr(B, OpElemTy, PtrCasted);
+        NewOp = PtrCasted;
+      }
       I->setOperand(OpNo, NewOp);
     }
   }
@@ -2022,8 +2041,16 @@ void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) {
     if (!isUntypedPointerTy(Arg->getType()))
       continue;
     Type *ElemTy = GR->findDeducedElementType(Arg);
-    if (!ElemTy && (ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr)
-      buildAssignPtr(B, ElemTy, Arg);
+    if (!ElemTy && (ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr) {
+      if (CallInst *AssignCI = GR->findAssignPtrTypeInstr(Arg)) {
+        DenseSet<std::pair<Value *, Value *>> VisitedSubst;
+        updateAssignType(AssignCI, Arg, PoisonValue::get(ElemTy));
+        propagateElemType(Arg, IntegerType::getInt8Ty(F->getContext()),
+                          VisitedSubst);
+      } else {
+        buildAssignPtr(B, ElemTy, Arg);
+      }
+    }
   }
 }
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll
index c98fef3631e04b..ee5596ed38b1b7 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll
@@ -12,7 +12,8 @@
 ;;     }
 ;; }
 
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; XFAIL: *
 
 %struct.Node = type { %struct.Node.0 addrspace(1)* }
@@ -25,8 +26,8 @@ entry:
 for.cond:                                         ; preds = %for.inc, %entry
   %pNode.0 = phi %struct.Node addrspace(1)* [ %pNodes, %entry ], [ %1, %for.inc ]
   %j.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-; CHECK-SPIRV:      %[[#]] = OpPhi %[[#]] %[[#]] %[[#]] %[[#BitcastResultId:]] %[[#]]
-; CHECK-SPIRV-NEXT: OpPhi
+; CHECK:      %[[#]] = OpPhi %[[#]] %[[#]] %[[#]] %[[#BitcastResultId:]] %[[#]]
+; CHECK-NEXT: OpPhi
 
   %cmp = icmp slt i32 %j.0, 10
   br i1 %cmp, label %for.body, label %for.end
@@ -36,8 +37,8 @@ for.body:                                         ; preds = %for.cond
 
   %0 = load %struct.Node.0 addrspace(1)*, %struct.Node.0 addrspace(1)* addrspace(1)* %pNext, align 4
   %1 = bitcast %struct.Node.0 addrspace(1)* %0 to %struct.Node addrspace(1)*
-; CHECK-SPIRV: %[[#LoadResultId:]] = OpLoad %[[#]]
-; CHECK-SPIRV: %[[#BitcastResultId]] = OpBitcast %[[#]] %[[#LoadResultId]]
+; CHECK: %[[#LoadResultId:]] = OpLoad %[[#]]
+; CHECK: %[[#BitcastResultId]] = OpBitcast %[[#]] %[[#LoadResultId]]
 
   br label %for.inc
 
diff --git a/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll b/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll
new file mode 100644
index 00000000000000..b6b919f36d92c6
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll
@@ -0,0 +1,4673 @@
+; This is an excerpt from the SYCL end-to-end test suite, cleaned out from
+; unrelevant details, that reproduced cases of invalid SPIR-V generation due
+; to wrong types, deduced from the input LLVM IR. Namely, this test case covers
+; cases of type mismatch when null pointer constant is used in different
+; contexts and so with different pointee types, and intertwined
+; load/store/function call LLVM IR input with bitcasts inserted between
+; instruction uses.
+
+; The only pass criterion is that spirv-val considers output valid.
+
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+%"nd_item" = type { i8 }
+%struct.AssertHappened = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 }
+%"range" = type { %"detail::array" }
+%"detail::array" = type { [1 x i64] }
+%class.anon = type { %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor" }
+%"accessor" = type { %"detail::AccessorImplDevice", %union.anon }
+%"detail::AccessorImplDevice" = type { %"range", %"range", %"range" }
+%union.anon = type { ptr addrspace(1) }
+%class.anon.6 = type { ptr addrspace(4), ptr addrspace(4), ptr addrspace(4), ptr addrspace(4) }
+%"group" = type { %"range", %"range", %"range", %"range" }
+%"item" = type { %"detail::AccessorImplDevice" }
+%"item.22" = type { %"sd_ItemBase.23" }
+%"sd_ItemBase.23" = type { %"range", %"range" }
+%"tangle_group" = type { %"ss_sub_group_mask" }
+%"ss_sub_group_mask" = type { i64, i64 }
+%class.anon.8 = type { %"accessor", %"accessor", [8 x i8], %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor" }
+%"vec.16" = type { %"struct.std::array.20" }
+%"struct.std::array.20" = type { [4 x i32] }
+%class.anon.15 = type { ptr addrspace(4), ptr addrspace(4), ptr addrspace(4) }
+%class.anon.7 = type { ptr addrspace(4), ptr addrspace(4) }
+
+ at .str = private unnamed_addr addrspace(1) constant [21 x i8] c"bits_num <= max_bits\00", align 1
+ at .str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"subgroupmask.hpp\00", align 1
+ at __PRETTY_FUNCTION1 = private unnamed_addr addrspace(1) constant [32 x i8] c"subgroup_mask(BitsType, size_t)\00", align 1
+ at .str.2 = private unnamed_addr addrspace(1) constant [15 x i8] c"bn <= max_bits\00", align 1
+ at __PRETTY_FUNCTION2 = private unnamed_addr addrspace(1) constant [52 x i8] c"BitsType subgroup_mask::valuable_bits(size_t) const\00", align 1
+ at __spirv_BuiltInSubgroupMaxSize = external dso_local addrspace(1) constant i32, align 4
+ at __spirv_BuiltInSubgroupLocalInvocationId = external dso_local addrspace(1) constant i32, align 4
+ at _ZSt6ignore = linkonce_odr dso_local addrspace(1) constant %"nd_item" undef, align 1
+ at __spirv_BuiltInNumWorkgroups = external dso_local addrspace(1) constant <3 x i64>, align 32
+ at __spirv_BuiltInGlobalOffset = external dso_local addrspace(1) constant <3 x i64>, align 32
+ at __spirv_BuiltInGlobalInvocationId = external dso_local addrspace(1) constant <3 x i64>, align 32
+ at __spirv_BuiltInGlobalSize = external dso_local addrspace(1) constant <3 x i64>, align 32
+ at __spirv_BuiltInLocalInvocationId = external dso_local addrspace(1) constant <3 x i64>, align 32
+ at SPIR_AssertHappenedMem = linkonce_odr dso_local addrspace(1) global %struct.AssertHappened zeroinitializer
+ at __spirv_BuiltInWorkgroupId = external dso_local addrspace(1) constant <3 x i64>, align 32
+ at __spirv_BuiltInWorkgroupSize = external dso_local addrspace(1) constant <3 x i64>, align 32
+
+
+define weak_odr dso_local spir_kernel void @TestKernel(ptr addrspace(1) %_arg_TmpAcc, ptr byval(%"range") %_arg_TmpAcc1, ptr byval(%"range") %_arg_TmpAcc2, ptr byval(%"range") %_arg_TmpAcc3, ptr addrspace(1) align 1 %_arg_BarrierAcc, ptr byval(%"range") %_arg_BarrierAcc4, ptr byval(%"range") %_arg_BarrierAcc5, ptr byval(%"range") %_arg_BarrierAcc6, ptr addrspace(1) align 1 %_arg_BroadcastAcc, ptr byval(%"range") %_arg_BroadcastAcc7, ptr byval(%"range") %_arg_BroadcastAcc8, ptr byval(%"range") %_arg_BroadcastAcc9, ptr addrspace(1) align 1 %_arg_AnyAcc, ptr byval(%"range") %_arg_AnyAcc10, ptr byval(%"range") %_arg_AnyAcc11, ptr byval(%"range") %_arg_AnyAcc12, ptr addrspace(1) align 1 %_arg_AllAcc, ptr byval(%"range") %_arg_AllAcc13, ptr byval(%"range") %_arg_AllAcc14, ptr byval(%"range") %_arg_AllAcc15, ptr addrspace(1) align 1 %_arg_NoneAcc, ptr byval(%"range") %_arg_NoneAcc16, ptr byval(%"range") %_arg_NoneAcc17, ptr byval(%"range") %_arg_NoneAcc18, ptr addrspace(1) align 1 %_arg_ReduceAcc, ptr byval(%"range") %_arg_ReduceAcc19, ptr byval(%"range") %_arg_ReduceAcc20, ptr byval(%"range") %_arg_ReduceAcc21, ptr addrspace(1) align 1 %_arg_ExScanAcc, ptr byval(%"range") %_arg_ExScanAcc22, ptr byval(%"range") %_arg_ExScanAcc23, ptr byval(%"range") %_arg_ExScanAcc24, ptr addrspace(1) align 1 %_arg_IncScanAcc, ptr byval(%"range") %_arg_IncScanAcc25, ptr byval(%"range") %_arg_IncScanAcc26, ptr byval(%"range") %_arg_IncScanAcc27, ptr addrspace(1) align 1 %_arg_ShiftLeftAcc, ptr byval(%"range") %_arg_ShiftLeftAcc28, ptr byval(%"range") %_arg_ShiftLeftAcc29, ptr byval(%"range") %_arg_ShiftLeftAcc30, ptr addrspace(1) align 1 %_arg_ShiftRightAcc, ptr byval(%"range") %_arg_ShiftRightAcc31, ptr byval(%"range") %_arg_ShiftRightAcc32, ptr byval(%"range") %_arg_ShiftRightAcc33, ptr addrspace(1) align 1 %_arg_SelectAcc, ptr byval(%"range") %_arg_SelectAcc34, ptr byval(%"range") %_arg_SelectAcc35, ptr byval(%"range") %_arg_SelectAcc36, ptr addrspace(1) align 1 %_arg_PermuteXorAcc, ptr byval(%"range") %_arg_PermuteXorAcc37, ptr byval(%"range") %_arg_PermuteXorAcc38, ptr byval(%"range") %_arg_PermuteXorAcc39) {
+entry:
+  %_arg_TmpAcc.addr = alloca ptr addrspace(1)
+  %_arg_BarrierAcc.addr = alloca ptr addrspace(1)
+  %_arg_BroadcastAcc.addr = alloca ptr addrspace(1)
+  %_arg_AnyAcc.addr = alloca ptr addrspace(1)
+  %_arg_AllAcc.addr = alloca ptr addrspace(1)
+  %_arg_NoneAcc.addr = alloca ptr addrspace(1)
+  %_arg_ReduceAcc.addr = alloca ptr addrspace(1)
+  %_arg_ExScanAcc.addr = alloca ptr addrspace(1)
+  %_arg_IncScanAcc.addr = alloca ptr addrspace(1)
+  %_arg_ShiftLeftAcc.addr = alloca ptr addrspace(1)
+  %_arg_ShiftRightAcc.addr = alloca ptr addrspace(1)
+  %_arg_SelectAcc.addr = alloca ptr addrspace(1)
+  %_arg_PermuteXorAcc.addr = alloca ptr addrspace(1)
+  %Kernel = alloca %class.anon
+  %agg.tmp = alloca %"range"
+  %agg.tmp41 = alloca %"range"
+  %agg.tmp42 = alloca %"range"
+  %agg.tmp44 = alloca %"range"
+  %agg.tmp45 = alloca %"range"
+  %agg.tmp46 = alloca %"range"
+  %agg.tmp48 = alloca %"range"
+  %agg.tmp49 = alloca %"range"
+  %agg.tmp50 = alloca %"range"
+  %agg.tmp52 = alloca %"range"
+  %agg.tmp53 = alloca %"range"
+  %agg.tmp54 = alloca %"range"
+  %agg.tmp56 = alloca %"range"
+  %agg.tmp57 = alloca %"range"
+  %agg.tmp58 = alloca %"range"
+  %agg.tmp60 = alloca %"range"
+  %agg.tmp61 = alloca %"range"
+  %agg.tmp62 = alloca %"range"
+  %agg.tmp64 = alloca %"range"
+  %agg.tmp65 = alloca %"range"
+  %agg.tmp66 = alloca %"range"
+  %agg.tmp68 = alloca %"range"
+  %agg.tmp69 = alloca %"range"
+  %agg.tmp70 = alloca %"range"
+  %agg.tmp72 = alloca %"range"
+  %agg.tmp73 = alloca %"range"
+  %agg.tmp74 = alloca %"range"
+  %agg.tmp76 = alloca %"range"
+  %agg.tmp77 = alloca %"range"
+  %agg.tmp78 = alloca %"range"
+  %agg.tmp80 = alloca %"range"
+  %agg.tmp81 = alloca %"range"
+  %agg.tmp82 = alloca %"range"
+  %agg.tmp84 = alloca %"range"
+  %agg.tmp85 = alloca %"range"
+  %agg.tmp86 = alloca %"range"
+  %agg.tmp88 = alloca %"range"
+  %agg.tmp89 = alloca %"range"
+  %agg.tmp90 = alloca %"range"
+  %agg.tmp91 = alloca %"nd_item", align 1
+  %Kernel.ascast = addrspacecast ptr %Kernel to ptr addrspace(4)
+  %agg.tmp91.ascast = addrspacecast ptr %agg.tmp91 to ptr addrspace(4)
+  store ptr addrspace(1) %_arg_TmpAcc, ptr %_arg_TmpAcc.addr
+  store ptr addrspace(1) %_arg_BarrierAcc, ptr %_arg_BarrierAcc.addr
+  store ptr addrspace(1) %_arg_BroadcastAcc, ptr %_arg_BroadcastAcc.addr
+  store ptr addrspace(1) %_arg_AnyAcc, ptr %_arg_AnyAcc.addr
+  store ptr addrspace(1) %_arg_AllAcc, ptr %_arg_AllAcc.addr
+  store ptr addrspace(1) %_arg_NoneAcc, ptr %_arg_NoneAcc.addr
+  store ptr addrspace(1) %_arg_ReduceAcc, ptr %_arg_ReduceAcc.addr
+  store ptr addrspace(1) %_arg_ExScanAcc, ptr %_arg_ExScanAcc.addr
+  store ptr addrspace(1) %_arg_IncScanAcc, ptr %_arg_IncScanAcc.addr
+  store ptr addrspace(1) %_arg_ShiftLeftAcc, ptr %_arg_ShiftLeftAcc.addr
+  store ptr addrspace(1) %_arg_ShiftRightAcc, ptr %_arg_ShiftRightAcc.addr
+  store ptr addrspace(1) %_arg_SelectAcc, ptr %_arg_SelectAcc.addr
+  store ptr addrspace(1) %_arg_PermuteXorAcc, ptr %_arg_PermuteXorAcc.addr
+  %TmpAcc1 = bitcast ptr addrspace(4) %Kernel.ascast to ptr addrspace(4)
+  call spir_func void @Foo1(ptr addrspace(4) %TmpAcc1) 
+  %BarrierAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 1
+  call spir_func void @Foo2(ptr addrspace(4) %BarrierAcc) 
+  %BroadcastAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 2
+  call spir_func void @Foo2(ptr addrspace(4) %BroadcastAcc) 
+  %AnyAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 3
+  call spir_func void @Foo2(ptr addrspace(4) %AnyAcc) 
+  %AllAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 4
+  call spir_func void @Foo2(ptr addrspace(4) %AllAcc) 
+  %NoneAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 5
+  call spir_func void @Foo2(ptr addrspace(4) %NoneAcc) 
+  %ReduceAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 6
+  call spir_func void @Foo2(ptr addrspace(4) %ReduceAcc) 
+  %ExScanAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 7
+  call spir_func void @Foo2(ptr addrspace(4) %ExScanAcc) 
+  %IncScanAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 8
+  call spir_func void @Foo2(ptr addrspace(4) %IncScanAcc) 
+  %ShiftLeftAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 9
+  call spir_func void @Foo2(ptr addrspace(4) %ShiftLeftAcc) 
+  %ShiftRightAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 10
+  call spir_func void @Foo2(ptr addrspace(4) %ShiftRightAcc) 
+  %SelectAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 11
+  call spir_func void @Foo2(ptr addrspace(4) %SelectAcc) 
+  %PermuteXorAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 12
+  call spir_func void @Foo2(ptr addrspace(4) %PermuteXorAcc) 
+  %TmpAcc402 = bitcast ptr addrspace(4) %Kernel.ascast to ptr addrspace(4)
+  %0 = load ptr addrspace(1), ptr %_arg_TmpAcc.addr
+  call spir_func void @Foo3(ptr addrspace(4) %TmpAcc402, ptr addrspace(1) %0, ptr byval(%"range") %agg.tmp, ptr byval(%"range") %agg.tmp41, ptr byval(%"range") %agg.tmp42) 
+  %BarrierAcc43 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 1
+  %1 = load ptr addrspace(1), ptr %_arg_BarrierAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %BarrierAcc43, ptr addrspace(1) %1, ptr byval(%"range") %agg.tmp44, ptr byval(%"range") %agg.tmp45, ptr byval(%"range") %agg.tmp46) 
+  %BroadcastAcc47 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 2
+  %2 = load ptr addrspace(1), ptr %_arg_BroadcastAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %BroadcastAcc47, ptr addrspace(1) %2, ptr byval(%"range") %agg.tmp48, ptr byval(%"range") %agg.tmp49, ptr byval(%"range") %agg.tmp50) 
+  %AnyAcc51 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 3
+  %3 = load ptr addrspace(1), ptr %_arg_AnyAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %AnyAcc51, ptr addrspace(1) %3, ptr byval(%"range") %agg.tmp52, ptr byval(%"range") %agg.tmp53, ptr byval(%"range") %agg.tmp54) 
+  %AllAcc55 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 4
+  %4 = load ptr addrspace(1), ptr %_arg_AllAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %AllAcc55, ptr addrspace(1) %4, ptr byval(%"range") %agg.tmp56, ptr byval(%"range") %agg.tmp57, ptr byval(%"range") %agg.tmp58) 
+  %NoneAcc59 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 5
+  %5 = load ptr addrspace(1), ptr %_arg_NoneAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %NoneAcc59, ptr addrspace(1) %5, ptr byval(%"range") %agg.tmp60, ptr byval(%"range") %agg.tmp61, ptr byval(%"range") %agg.tmp62) 
+  %ReduceAcc63 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 6
+  %6 = load ptr addrspace(1), ptr %_arg_ReduceAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %ReduceAcc63, ptr addrspace(1) %6, ptr byval(%"range") %agg.tmp64, ptr byval(%"range") %agg.tmp65, ptr byval(%"range") %agg.tmp66) 
+  %ExScanAcc67 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 7
+  %7 = load ptr addrspace(1), ptr %_arg_ExScanAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %ExScanAcc67, ptr addrspace(1) %7, ptr byval(%"range") %agg.tmp68, ptr byval(%"range") %agg.tmp69, ptr byval(%"range") %agg.tmp70) 
+  %IncScanAcc71 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 8
+  %8 = load ptr addrspace(1), ptr %_arg_IncScanAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %IncScanAcc71, ptr addrspace(1) %8, ptr byval(%"range") %agg.tmp72, ptr byval(%"range") %agg.tmp73, ptr byval(%"range") %agg.tmp74) 
+  %ShiftLeftAcc75 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 9
+  %9 = load ptr addrspace(1), ptr %_arg_ShiftLeftAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %ShiftLeftAcc75, ptr addrspace(1) %9, ptr byval(%"range") %agg.tmp76, ptr byval(%"range") %agg.tmp77, ptr byval(%"range") %agg.tmp78) 
+  %ShiftRightAcc79 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 10
+  %10 = load ptr addrspace(1), ptr %_arg_ShiftRightAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %ShiftRightAcc79, ptr addrspace(1) %10, ptr byval(%"range") %agg.tmp80, ptr byval(%"range") %agg.tmp81, ptr byval(%"range") %agg.tmp82) 
+  %SelectAcc83 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 11
+  %11 = load ptr addrspace(1), ptr %_arg_SelectAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %SelectAcc83, ptr addrspace(1) %11, ptr byval(%"range") %agg.tmp84, ptr byval(%"range") %agg.tmp85, ptr byval(%"range") %agg.tmp86) 
+  %PermuteXorAcc87 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 12
+  %12 = load ptr addrspace(1), ptr %_arg_PermuteXorAcc.addr
+  call spir_func void @Foo4(ptr addrspace(4) %PermuteXorAcc87, ptr addrspace(1) %12, ptr byval(%"range") %agg.tmp88, ptr byval(%"range") %agg.tmp89, ptr byval(%"range") %agg.tmp90) 
+  %call = call spir_func ptr addrspace(4) @Foo5() 
+  call spir_func void @Foo6(ptr addrspace(4) dead_on_unwind writable sret(%"nd_item") align 1 %agg.tmp91.ascast, ptr addrspace(4) %call) 
+  call spir_func void @Foo22(ptr addrspace(4) %Kernel.ascast, ptr byval(%"nd_item") align 1 %agg.tmp91) 
+  ret void
+}
+
+define internal spir_func void @Foo1(ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"range"
+  %agg.tmp2 = alloca %"range"
+  %agg.tmp3 = alloca %"range"
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  %agg.tmp2.ascast = addrspacecast ptr %agg.tmp2 to ptr addrspace(4)
+  %agg.tmp3.ascast = addrspacecast ptr %agg.tmp3 to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  call void @llvm.memset.p0.i64(ptr %agg.tmp, i8 0, i64 8, i1 false)
+  call spir_func void @Foo11(ptr addrspace(4) %agg.tmp.ascast) 
+  call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp2.ascast) 
+  call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp3.ascast) 
+  call spir_func void @Foo10(ptr addrspace(4) %impl1, ptr byval(%"range") %agg.tmp, ptr byval(%"range") %agg.tmp2, ptr byval(%"range") %agg.tmp3) 
+  ret void
+}
+
+
+define internal spir_func void @Foo2(ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"range"
+  %agg.tmp2 = alloca %"range"
+  %agg.tmp3 = alloca %"range"
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  %agg.tmp2.ascast = addrspacecast ptr %agg.tmp2 to ptr addrspace(4)
+  %agg.tmp3.ascast = addrspacecast ptr %agg.tmp3 to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  call void @llvm.memset.p0.i64(ptr %agg.tmp, i8 0, i64 8, i1 false)
+  call spir_func void @Foo11(ptr addrspace(4) %agg.tmp.ascast) 
+  call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp2.ascast) 
+  call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp3.ascast) 
+  call spir_func void @Foo10(ptr addrspace(4) %impl1, ptr byval(%"range") %agg.tmp, ptr byval(%"range") %agg.tmp2, ptr byval(%"range") %agg.tmp3) 
+  ret void
+}
+
+
+
+
+define internal spir_func void @Foo3(ptr addrspace(4) %this, ptr addrspace(1) %Ptr, ptr byval(%"range") %AccessRange, ptr byval(%"range") %MemRange, ptr byval(%"range") %Offset) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %Ptr.addr = alloca ptr addrspace(1)
+  %ref.tmp = alloca %class.anon.6
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(1) %Ptr, ptr %Ptr.addr
+  %AccessRange.ascast = addrspacecast ptr %AccessRange to ptr addrspace(4)
+  %MemRange.ascast = addrspacecast ptr %MemRange to ptr addrspace(4)
+  %Offset.ascast = addrspacecast ptr %Offset to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load ptr addrspace(1), ptr %Ptr.addr
+  %1 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  store ptr addrspace(1) %0, ptr addrspace(4) %1
+  %2 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %2
+  %Offset2 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %Offset.ascast, ptr %Offset2
+  %AccessRange3 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 2
+  store ptr addrspace(4) %AccessRange.ascast, ptr %AccessRange3
+  %MemRange4 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 3
+  store ptr addrspace(4) %MemRange.ascast, ptr %MemRange4
+  call spir_func void @Foo13(ptr addrspace(4) %ref.tmp.ascast) 
+  %call = call spir_func i64 @Foo21(ptr addrspace(4) %this1) 
+  %3 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  %4 = load ptr addrspace(1), ptr addrspace(4) %3
+  %add.ptr = getelementptr inbounds nuw i64, ptr addrspace(1) %4, i64 %call
+  store ptr addrspace(1) %add.ptr, ptr addrspace(4) %3
+  ret void
+}
+
+
+define internal spir_func void @Foo4(ptr addrspace(4) %this, ptr addrspace(1) %Ptr, ptr byval(%"range") %AccessRange, ptr byval(%"range") %MemRange, ptr byval(%"range") %Offset) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %Ptr.addr = alloca ptr addrspace(1)
+  %ref.tmp = alloca %class.anon.6
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(1) %Ptr, ptr %Ptr.addr
+  %AccessRange.ascast = addrspacecast ptr %AccessRange to ptr addrspace(4)
+  %MemRange.ascast = addrspacecast ptr %MemRange to ptr addrspace(4)
+  %Offset.ascast = addrspacecast ptr %Offset to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load ptr addrspace(1), ptr %Ptr.addr
+  %1 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  store ptr addrspace(1) %0, ptr addrspace(4) %1
+  %2 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %2
+  %Offset2 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %Offset.ascast, ptr %Offset2
+  %AccessRange3 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 2
+  store ptr addrspace(4) %AccessRange.ascast, ptr %AccessRange3
+  %MemRange4 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 3
+  store ptr addrspace(4) %MemRange.ascast, ptr %MemRange4
+  call spir_func void @Foo30(ptr addrspace(4) %ref.tmp.ascast) 
+  %call = call spir_func i64 @Foo32(ptr addrspace(4) %this1) 
+  %3 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  %4 = load ptr addrspace(1), ptr addrspace(4) %3
+  %add.ptr = getelementptr inbounds nuw i8, ptr addrspace(1) %4, i64 %call
+  store ptr addrspace(1) %add.ptr, ptr addrspace(4) %3
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo5() {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  ret ptr addrspace(4) null
+}
+
+
+define internal spir_func void @Foo6(ptr addrspace(4) dead_on_unwind noalias writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) %0) {
+entry:
+  %.addr = alloca ptr addrspace(4)
+  %GlobalSize = alloca %"range"
+  %LocalSize = alloca %"range"
+  %GroupRange = alloca %"range"
+  %GroupId = alloca %"range"
+  %GlobalId = alloca %"range"
+  %LocalId = alloca %"range"
+  %GlobalOffset = alloca %"range"
+  %Group = alloca %"group"
+  %GlobalItem = alloca %"item"
+  %LocalItem = alloca %"item.22"
+  %cleanup.dest.slot = alloca i32, align 4
+  %GlobalSize.ascast = addrspacecast ptr %GlobalSize to ptr addrspace(4)
+  %LocalSize.ascast = addrspacecast ptr %LocalSize to ptr addrspace(4)
+  %GroupRange.ascast = addrspacecast ptr %GroupRange to ptr addrspace(4)
+  %GroupId.ascast = addrspacecast ptr %GroupId to ptr addrspace(4)
+  %GlobalId.ascast = addrspacecast ptr %GlobalId to ptr addrspace(4)
+  %LocalId.ascast = addrspacecast ptr %LocalId to ptr addrspace(4)
+  %GlobalOffset.ascast = addrspacecast ptr %GlobalOffset to ptr addrspace(4)
+  %Group.ascast = addrspacecast ptr %Group to ptr addrspace(4)
+  %GlobalItem.ascast = addrspacecast ptr %GlobalItem to ptr addrspace(4)
+  %LocalItem.ascast = addrspacecast ptr %LocalItem to ptr addrspace(4)
+  store ptr addrspace(4) %0, ptr %.addr
+  call spir_func void @Foo7(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GlobalSize.ascast) 
+  call spir_func void @Init1(ptr addrspace(4) dead_on_unwind writable sret(%"range") %LocalSize.ascast) 
+  call spir_func void @Init2(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GroupRange.ascast) 
+  call spir_func void @Init3(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GroupId.ascast) 
+  call spir_func void @Init6(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GlobalId.ascast) 
+  call spir_func void @Init4(ptr addrspace(4) dead_on_unwind writable sret(%"range") %LocalId.ascast) 
+  call spir_func void @Init5(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GlobalOffset.ascast) 
+  call spir_func void @Foo23(ptr addrspace(4) dead_on_unwind writable sret(%"group") %Group.ascast, ptr addrspace(4) %GlobalSize.ascast, ptr addrspace(4) %LocalSize.ascast, ptr addrspace(4) %GroupRange.ascast, ptr addrspace(4) %GroupId.ascast) 
+  call spir_func void @Foo24(ptr addrspace(4) dead_on_unwind writable sret(%"item") %GlobalItem.ascast, ptr addrspace(4) %GlobalSize.ascast, ptr addrspace(4) %GlobalId.ascast, ptr addrspace(4) %GlobalOffset.ascast) 
+  call spir_func void @Foo25(ptr addrspace(4) dead_on_unwind writable sret(%"item.22") %LocalItem.ascast, ptr addrspace(4) %LocalSize.ascast, ptr addrspace(4) %LocalId.ascast) 
+  call spir_func void @Foo26(ptr addrspace(4) dead_on_unwind writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) %GlobalItem.ascast, ptr addrspace(4) %LocalItem.ascast, ptr addrspace(4) %Group.ascast) 
+  ret void
+}
+
+
+define internal spir_func void @Foo22(ptr addrspace(4) %this, ptr byval(%"nd_item") align 1 %item) {
+entry:
+  %this.addr.i76 = alloca ptr addrspace(4)
+  %WI.addr.i = alloca i64
+  %TangleLeader.addr.i = alloca i64
+  %TangleSize.addr.i = alloca i64
+  %agg.tmp.i = alloca %"range"
+  %agg.tmp2.i = alloca %"tangle_group"
+  %Visible.i = alloca i64
+  %Other.i = alloca i64
+  %agg.tmp5.i = alloca %"range"
+  %agg.tmp8.i = alloca %"range"
+  %OriginalLID.i = alloca i32, align 4
+  %LID.i = alloca i32, align 4
+  %BroadcastResult.i = alloca i32, align 4
+  %agg.tmp12.i = alloca %"tangle_group"
+  %agg.tmp15.i = alloca %"range"
+  %AnyResult.i = alloca i8, align 1
+  %agg.tmp18.i = alloca %"tangle_group"
+  %agg.tmp24.i = alloca %"range"
+  %AllResult.i = alloca i8, align 1
+  %agg.tmp27.i = alloca %"tangle_group"
+  %agg.tmp35.i = alloca %"range"
+  %NoneResult.i = alloca i8, align 1
+  %agg.tmp38.i = alloca %"tangle_group"
+  %agg.tmp46.i = alloca %"range"
+  %ReduceResult.i = alloca i32, align 4
+  %agg.tmp49.i = alloca %"tangle_group"
+  %agg.tmp50.i = alloca %"nd_item", align 1
+  %agg.tmp54.i = alloca %"range"
+  %ExScanResult.i = alloca i32, align 4
+  %agg.tmp57.i = alloca %"tangle_group"
+  %agg.tmp58.i = alloca %"nd_item", align 1
+  %agg.tmp61.i = alloca %"range"
+  %IncScanResult.i = alloca i32, align 4
+  %agg.tmp64.i = alloca %"tangle_group"
+  %agg.tmp65.i = alloca %"nd_item", align 1
+  %agg.tmp69.i = alloca %"range"
+  %ShiftLeftResult.i = alloca i32, align 4
+  %agg.tmp72.i = alloca %"tangle_group"
+  %agg.tmp79.i = alloca %"range"
+  %ShiftRightResult.i = alloca i32, align 4
+  %agg.tmp82.i = alloca %"tangle_group"
+  %agg.tmp88.i = alloca %"range"
+  %SelectResult.i = alloca i32, align 4
+  %agg.tmp91.i = alloca %"tangle_group"
+  %agg.tmp92.i = alloca %"range"
+  %ref.tmp.i = alloca %"range"
+  %ref.tmp93.i = alloca %"range"
+  %ref.tmp94.i = alloca i32, align 4
+  %agg.tmp100.i = alloca %"range"
+  %PermuteXorResult.i = alloca i32, align 4
+  %agg.tmp103.i = alloca %"tangle_group"
+  %agg.tmp106.i = alloca %"range"
+  %agg.tmp18.ascast.ascast75 = alloca %"nd_item"
+  %agg.tmp17.ascast.ascast74 = alloca %"tangle_group"
+  %retval.i66 = alloca i64
+  %this.addr.i67 = alloca ptr addrspace(4)
+  %Result.i68 = alloca i64
+  %retval.i58 = alloca i64
+  %this.addr.i59 = alloca ptr addrspace(4)
+  %Result.i60 = alloca i64
+  %retval.i50 = alloca i64
+  %this.addr.i51 = alloca ptr addrspace(4)
+  %Result.i52 = alloca i64
+  %retval.i42 = alloca i64
+  %this.addr.i43 = alloca ptr addrspace(4)
+  %Result.i44 = alloca i64
+  %retval.i = alloca i64
+  %this.addr.i = alloca ptr addrspace(4)
+  %Result.i = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %WI = alloca %"range"
+  %SG = alloca %"nd_item", align 1
+  %BranchBody = alloca %class.anon.8
+  %ref.tmp = alloca %"range"
+  %ref.tmp15 = alloca i32, align 4
+  %Tangle = alloca %"tangle_group"
+  %agg.tmp = alloca %"nd_item", align 1
+  %TangleLeader = alloca i64
+  %TangleSize = alloca i64
+  %IsMember = alloca %"nd_item", align 1
+  %agg.tmp17 = alloca %"tangle_group"
+  %agg.tmp18 = alloca %"nd_item", align 1
+  %ref.tmp19 = alloca %"range"
+  %ref.tmp20 = alloca i32, align 4
+  %Tangle24 = alloca %"tangle_group"
+  %agg.tmp25 = alloca %"nd_item", align 1
+  %TangleLeader26 = alloca i64
+  %TangleSize27 = alloca i64
+  %IsMember28 = alloca %"nd_item", align 1
+  %agg.tmp30 = alloca %"tangle_group"
+  %agg.tmp31 = alloca %"nd_item", align 1
+  %Tangle33 = alloca %"tangle_group"
+  %agg.tmp34 = alloca %"nd_item", align 1
+  %TangleLeader35 = alloca i64
+  %TangleSize36 = alloca i64
+  %IsMember37 = alloca %"nd_item", align 1
+  %agg.tmp39 = alloca %"tangle_group"
+  %agg.tmp40 = alloca %"nd_item", align 1
+  %WI.ascast = addrspacecast ptr %WI to ptr addrspace(4)
+  %SG.ascast = addrspacecast ptr %SG to ptr addrspace(4)
+  %BranchBody.ascast = addrspacecast ptr %BranchBody to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %ref.tmp15.ascast = addrspacecast ptr %ref.tmp15 to ptr addrspace(4)
+  %Tangle.ascast = addrspacecast ptr %Tangle to ptr addrspace(4)
+  %IsMember.ascast = addrspacecast ptr %IsMember to ptr addrspace(4)
+  %ref.tmp19.ascast = addrspacecast ptr %ref.tmp19 to ptr addrspace(4)
+  %ref.tmp20.ascast = addrspacecast ptr %ref.tmp20 to ptr addrspace(4)
+  %Tangle24.ascast = addrspacecast ptr %Tangle24 to ptr addrspace(4)
+  %IsMember28.ascast = addrspacecast ptr %IsMember28 to ptr addrspace(4)
+  %Tangle33.ascast = addrspacecast ptr %Tangle33 to ptr addrspace(4)
+  %IsMember37.ascast = addrspacecast ptr %IsMember37 to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %item.ascast = addrspacecast ptr %item to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo40(ptr addrspace(4) dead_on_unwind writable sret(%"range") %WI.ascast, ptr addrspace(4) align 1 %item.ascast) 
+  call spir_func void @Foo41(ptr addrspace(4) dead_on_unwind writable sret(%"nd_item") align 1 %SG.ascast, ptr addrspace(4) align 1 %item.ascast) 
+  %TmpAcc1 = bitcast ptr %BranchBody to ptr
+  %TmpAcc22 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %BarrierAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 1
+  %BarrierAcc3 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 1
+  %0 = getelementptr inbounds i8, ptr addrspace(4) %BranchBody.ascast, i64 64
+  %BroadcastAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 3
+  %BroadcastAcc4 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 2
+  %AnyAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 4
+  %AnyAcc5 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 3
+  %AllAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 5
+  %AllAcc6 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 4
+  %NoneAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 6
+  %NoneAcc7 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 5
+  %ReduceAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 7
+  %ReduceAcc8 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 6
+  %ExScanAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 8
+  %ExScanAcc9 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 7
+  %IncScanAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 9
+  %IncScanAcc10 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 8
+  %ShiftLeftAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 10
+  %ShiftLeftAcc11 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 9
+  %ShiftRightAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 11
+  %ShiftRightAcc12 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 10
+  %SelectAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 12
+  %SelectAcc13 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 11
+  %PermuteXorAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 13
+  %PermuteXorAcc14 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 12
+  store i32 4, ptr %ref.tmp15, align 4
+  call spir_func void @Foo42(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %WI.ascast, ptr addrspace(4) align 4 %ref.tmp15.ascast) 
+  %retval.ascast.i69 = addrspacecast ptr %retval.i66 to ptr addrspace(4)
+  store ptr addrspace(4) %ref.tmp.ascast, ptr %this.addr.i67
+  %this1.i72 = load ptr addrspace(4), ptr %this.addr.i67
+  %1 = load i64, ptr addrspace(4) %this1.i72
+  store i64 %1, ptr %Result.i68
+  %2 = load i64, ptr %Result.i68
+  %tobool = icmp ne i64 %2, 0
+  br i1 %tobool, label %if.then, label %if.else
+
+if.else:                                          ; preds = %entry
+  store i32 24, ptr %ref.tmp20, align 4
+  call spir_func void @Foo42(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp19.ascast, ptr addrspace(4) %WI.ascast, ptr addrspace(4) align 4 %ref.tmp20.ascast) 
+  %retval.ascast.i53 = addrspacecast ptr %retval.i50 to ptr addrspace(4)
+  store ptr addrspace(4) %ref.tmp19.ascast, ptr %this.addr.i51
+  %this1.i56 = load ptr addrspace(4), ptr %this.addr.i51
+  %3 = load i64, ptr addrspace(4) %this1.i56
+  store i64 %3, ptr %Result.i52
+  %4 = load i64, ptr %Result.i52
+  %tobool22 = icmp ne i64 %4, 0
+  br i1 %tobool22, label %if.then23, label %if.else32
+
+if.else32:                                        ; preds = %if.else
+  call spir_func void @Foo43(ptr addrspace(4) dead_on_unwind writable sret(%"tangle_group") %Tangle33.ascast, ptr byval(%"nd_item") align 1 %agg.tmp34) 
+  store i64 24, ptr %TangleLeader35
+  store i64 8, ptr %TangleSize36
+  %retval.ascast.i = addrspacecast ptr %retval.i to ptr addrspace(4)
+  store ptr addrspace(4) %WI.ascast, ptr %this.addr.i
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %5 = load i64, ptr addrspace(4) %this1.i
+  store i64 %5, ptr %Result.i
+  %6 = load i64, ptr %Result.i
+  %7 = load i64, ptr %TangleLeader35
+  %8 = load i64, ptr %TangleSize36
+  call spir_func void @Foo69(ptr addrspace(4) %BranchBody.ascast, i64 %6, ptr byval(%"tangle_group") %agg.tmp39, i64 %7, i64 %8, ptr byval(%"nd_item") align 1 %agg.tmp40) 
+  br label %if.end41
+
+if.then23:                                        ; preds = %if.else
+  call spir_func void @Foo43(ptr addrspace(4) dead_on_unwind writable sret(%"tangle_group") %Tangle24.ascast, ptr byval(%"nd_item") align 1 %agg.tmp25) 
+  store i64 4, ptr %TangleLeader26
+  store i64 20, ptr %TangleSize27
+  %retval.ascast.i45 = addrspacecast ptr %retval.i42 to ptr addrspace(4)
+  store ptr addrspace(4) %WI.ascast, ptr %this.addr.i43
+  %this1.i48 = load ptr addrspace(4), ptr %this.addr.i43
+  %9 = load i64, ptr addrspace(4) %this1.i48
+  store i64 %9, ptr %Result.i44
+  %10 = load i64, ptr %Result.i44
+  %11 = load i64, ptr %TangleLeader26
+  %12 = load i64, ptr %TangleSize27
+  call spir_func void @Foo68(ptr addrspace(4) %BranchBody.ascast, i64 %10, ptr byval(%"tangle_group") %agg.tmp30, i64 %11, i64 %12, ptr byval(%"nd_item") align 1 %agg.tmp31) 
+  br label %if.end41
+
+if.then:                                          ; preds = %entry
+  call spir_func void @Foo43(ptr addrspace(4) dead_on_unwind writable sret(%"tangle_group") %Tangle.ascast, ptr byval(%"nd_item") align 1 %agg.tmp) 
+  store i64 0, ptr %TangleLeader
+  store i64 4, ptr %TangleSize
+  %retval.ascast.i61 = addrspacecast ptr %retval.i58 to ptr addrspace(4)
+  store ptr addrspace(4) %WI.ascast, ptr %this.addr.i59
+  %this1.i64 = load ptr addrspace(4), ptr %this.addr.i59
+  %13 = load i64, ptr addrspace(4) %this1.i64
+  store i64 %13, ptr %Result.i60
+  %14 = load i64, ptr %Result.i60
+  %15 = load i64, ptr %TangleLeader
+  %16 = load i64, ptr %TangleSize
+  %TangleSize.addr.ascast.i = addrspacecast ptr %TangleSize.addr.i to ptr addrspace(4)
+  %agg.tmp.ascast.i = addrspacecast ptr %agg.tmp.i to ptr addrspace(4)
+  %agg.tmp5.ascast.i = addrspacecast ptr %agg.tmp5.i to ptr addrspace(4)
+  %agg.tmp8.ascast.i = addrspacecast ptr %agg.tmp8.i to ptr addrspace(4)
+  %agg.tmp15.ascast.i = addrspacecast ptr %agg.tmp15.i to ptr addrspace(4)
+  %agg.tmp24.ascast.i = addrspacecast ptr %agg.tmp24.i to ptr addrspace(4)
+  %agg.tmp35.ascast.i = addrspacecast ptr %agg.tmp35.i to ptr addrspace(4)
+  %agg.tmp46.ascast.i = addrspacecast ptr %agg.tmp46.i to ptr addrspace(4)
+  %agg.tmp50.ascast.i = addrspacecast ptr %agg.tmp50.i to ptr addrspace(4)
+  %agg.tmp54.ascast.i = addrspacecast ptr %agg.tmp54.i to ptr addrspace(4)
+  %agg.tmp58.ascast.i = addrspacecast ptr %agg.tmp58.i to ptr addrspace(4)
+  %agg.tmp61.ascast.i = addrspacecast ptr %agg.tmp61.i to ptr addrspace(4)
+  %agg.tmp65.ascast.i = addrspacecast ptr %agg.tmp65.i to ptr addrspace(4)
+  %agg.tmp69.ascast.i = addrspacecast ptr %agg.tmp69.i to ptr addrspace(4)
+  %agg.tmp79.ascast.i = addrspacecast ptr %agg.tmp79.i to ptr addrspace(4)
+  %agg.tmp88.ascast.i = addrspacecast ptr %agg.tmp88.i to ptr addrspace(4)
+  %agg.tmp92.ascast.i = addrspacecast ptr %agg.tmp92.i to ptr addrspace(4)
+  %ref.tmp.ascast.i = addrspacecast ptr %ref.tmp.i to ptr addrspace(4)
+  %ref.tmp93.ascast.i = addrspacecast ptr %ref.tmp93.i to ptr addrspace(4)
+  %ref.tmp94.ascast.i = addrspacecast ptr %ref.tmp94.i to ptr addrspace(4)
+  %agg.tmp100.ascast.i = addrspacecast ptr %agg.tmp100.i to ptr addrspace(4)
+  %agg.tmp106.ascast.i = addrspacecast ptr %agg.tmp106.i to ptr addrspace(4)
+  store ptr addrspace(4) %BranchBody.ascast, ptr %this.addr.i76
+  store i64 %14, ptr %WI.addr.i
+  %Tangle.ascast.i = addrspacecast ptr %agg.tmp17.ascast.ascast74 to ptr addrspace(4)
+  store i64 %15, ptr %TangleLeader.addr.i
+  store i64 %16, ptr %TangleSize.addr.i
+  %IsMember.ascast.i = addrspacecast ptr %agg.tmp18.ascast.ascast75 to ptr addrspace(4)
+  %this1.i78 = load ptr addrspace(4), ptr %this.addr.i76
+  %17 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast.i, i64 %17) 
+  %call.i = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %this1.i78, ptr byval(%"range") %agg.tmp.i) 
+  store i64 1, ptr addrspace(4) %call.i
+  call spir_func void @Foo75(ptr byval(%"tangle_group") %agg.tmp2.i, i32 1) 
+  store i64 0, ptr %Visible.i
+  store i64 0, ptr %Other.i
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %if.end.i, %if.then
+  %18 = load i64, ptr %Other.i
+  %cmp.i79 = icmp ult i64 %18, 32
+  br i1 %cmp.i79, label %for.body.i, label %for.cond.cleanup.i
+
+for.cond.cleanup.i:                               ; preds = %for.cond.i
+  %19 = load i64, ptr %Visible.i
+  %20 = load i64, ptr %TangleSize.addr.i
+  %cmp7.i = icmp eq i64 %19, %20
+  %BarrierAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 1
+  %21 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp8.ascast.i, i64 %21) 
+  %call9.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BarrierAcc.i, ptr byval(%"range") %agg.tmp8.i) 
+  %storedv.i = zext i1 %cmp7.i to i8
+  store i8 %storedv.i, ptr addrspace(4) %call9.i, align 1
+  %22 = getelementptr inbounds i8, ptr addrspace(4) %this1.i78, i64 64
+  %call10.i = call spir_func i32 @Foo76(ptr addrspace(4) align 1 %22) 
+  store i32 %call10.i, ptr %OriginalLID.i, align 4
+  %call11.i = call spir_func i32 @Foo90(ptr addrspace(4) %Tangle.ascast.i) 
+  store i32 %call11.i, ptr %LID.i, align 4
+  %23 = load i32, ptr %OriginalLID.i, align 4
+  %call13.i = call spir_func i32 @Foo91(ptr byval(%"tangle_group") %agg.tmp12.i, i32 %23, i32 0) 
+  store i32 %call13.i, ptr %BroadcastResult.i, align 4
+  %24 = load i32, ptr %BroadcastResult.i, align 4
+  %conv.i = zext i32 %24 to i64
+  %25 = load i64, ptr %TangleLeader.addr.i
+  %cmp14.i = icmp eq i64 %conv.i, %25
+  %BroadcastAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 3
+  %26 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp15.ascast.i, i64 %26) 
+  %call16.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BroadcastAcc.i, ptr byval(%"range") %agg.tmp15.i) 
+  %storedv17.i = zext i1 %cmp14.i to i8
+  store i8 %storedv17.i, ptr addrspace(4) %call16.i, align 1
+  %27 = load i32, ptr %LID.i, align 4
+  %cmp19.i = icmp eq i32 %27, 0
+  %call20.i = call spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %agg.tmp18.i, i1 zeroext %cmp19.i) 
+  %storedv21.i = zext i1 %call20.i to i8
+  store i8 %storedv21.i, ptr %AnyResult.i, align 1
+  %28 = load i8, ptr %AnyResult.i, align 1  
+  %loadedv.i = trunc i8 %28 to i1
+  %conv22.i = zext i1 %loadedv.i to i32
+  %AnyAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 4
+  %29 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp24.ascast.i, i64 %29) 
+  %call25.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AnyAcc.i, ptr byval(%"range") %agg.tmp24.i) 
+  %storedv26.i = zext i1 %loadedv.i to i8
+  store i8 %storedv26.i, ptr addrspace(4) %call25.i, align 1
+  %30 = load i32, ptr %LID.i, align 4
+  %conv28.i = zext i32 %30 to i64
+  %31 = load i64, ptr %TangleSize.addr.i
+  %cmp29.i = icmp ult i64 %conv28.i, %31
+  %call30.i = call spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %agg.tmp27.i, i1 zeroext %cmp29.i) 
+  %storedv31.i = zext i1 %call30.i to i8
+  store i8 %storedv31.i, ptr %AllResult.i, align 1
+  %32 = load i8, ptr %AllResult.i, align 1  
+  %loadedv32.i = trunc i8 %32 to i1
+  %conv33.i = zext i1 %loadedv32.i to i32
+  %AllAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 5
+  %33 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp35.ascast.i, i64 %33) 
+  %call36.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AllAcc.i, ptr byval(%"range") %agg.tmp35.i) 
+  %storedv37.i = zext i1 %loadedv32.i to i8
+  store i8 %storedv37.i, ptr addrspace(4) %call36.i, align 1
+  %34 = load i32, ptr %LID.i, align 4
+  %conv39.i = zext i32 %34 to i64
+  %35 = load i64, ptr %TangleSize.addr.i
+  %cmp40.i = icmp uge i64 %conv39.i, %35
+  %call41.i = call spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %agg.tmp38.i, i1 zeroext %cmp40.i) 
+  %storedv42.i = zext i1 %call41.i to i8
+  store i8 %storedv42.i, ptr %NoneResult.i, align 1
+  %36 = load i8, ptr %NoneResult.i, align 1  
+  %loadedv43.i = trunc i8 %36 to i1
+  %conv44.i = zext i1 %loadedv43.i to i32
+  %NoneAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 6
+  %37 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp46.ascast.i, i64 %37) 
+  %call47.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %NoneAcc.i, ptr byval(%"range") %agg.tmp46.i) 
+  %storedv48.i = zext i1 %loadedv43.i to i8
+  store i8 %storedv48.i, ptr addrspace(4) %call47.i, align 1
+  %call51.i = call spir_func i32 @Foo64(ptr byval(%"tangle_group") %agg.tmp49.i, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp50.i) 
+  store i32 %call51.i, ptr %ReduceResult.i, align 4
+  %38 = load i32, ptr %ReduceResult.i, align 4
+  %conv52.i = zext i32 %38 to i64
+  %39 = load i64, ptr %TangleSize.addr.i
+  %cmp53.i = icmp eq i64 %conv52.i, %39
+  %ReduceAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 7
+  %40 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp54.ascast.i, i64 %40) 
+  %call55.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ReduceAcc.i, ptr byval(%"range") %agg.tmp54.i) 
+  %storedv56.i = zext i1 %cmp53.i to i8
+  store i8 %storedv56.i, ptr addrspace(4) %call55.i, align 1
+  %call59.i = call spir_func i32 @Foo63(ptr byval(%"tangle_group") %agg.tmp57.i, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp58.i) 
+  store i32 %call59.i, ptr %ExScanResult.i, align 4
+  %41 = load i32, ptr %ExScanResult.i, align 4
+  %42 = load i32, ptr %LID.i, align 4
+  %cmp60.i = icmp eq i32 %41, %42
+  %ExScanAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 8
+  %43 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp61.ascast.i, i64 %43) 
+  %call62.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ExScanAcc.i, ptr byval(%"range") %agg.tmp61.i) 
+  %storedv63.i = zext i1 %cmp60.i to i8
+  store i8 %storedv63.i, ptr addrspace(4) %call62.i, align 1
+  %call66.i = call spir_func i32 @Foo62(ptr byval(%"tangle_group") %agg.tmp64.i, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp65.i) 
+  store i32 %call66.i, ptr %IncScanResult.i, align 4
+  %44 = load i32, ptr %IncScanResult.i, align 4
+  %45 = load i32, ptr %LID.i, align 4
+  %add67.i = add i32 %45, 1
+  %cmp68.i = icmp eq i32 %44, %add67.i
+  %IncScanAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 9
+  %46 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp69.ascast.i, i64 %46) 
+  %call70.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %IncScanAcc.i, ptr byval(%"range") %agg.tmp69.i) 
+  %storedv71.i = zext i1 %cmp68.i to i8
+  store i8 %storedv71.i, ptr addrspace(4) %call70.i, align 1
+  %47 = load i32, ptr %LID.i, align 4
+  %call73.i = call spir_func i32 @Foo73(ptr byval(%"tangle_group") %agg.tmp72.i, i32 %47, i32 2) 
+  store i32 %call73.i, ptr %ShiftLeftResult.i, align 4
+  %48 = load i32, ptr %LID.i, align 4
+  %add74.i = add i32 %48, 2
+  %conv75.i = zext i32 %add74.i to i64
+  %49 = load i64, ptr %TangleSize.addr.i
+  %cmp76.i = icmp uge i64 %conv75.i, %49
+  br i1 %cmp76.i, label %lor.end.i, label %lor.rhs.i
+
+lor.rhs.i:                                        ; preds = %for.cond.cleanup.i
+  %50 = load i32, ptr %ShiftLeftResult.i, align 4
+  %51 = load i32, ptr %LID.i, align 4
+  %add77.i = add i32 %51, 2
+  %cmp78.i = icmp eq i32 %50, %add77.i
+  br label %lor.end.i
+
+lor.end.i:                                        ; preds = %lor.rhs.i, %for.cond.cleanup.i
+  %52 = phi i1 [ true, %for.cond.cleanup.i ], [ %cmp78.i, %lor.rhs.i ]
+  %ShiftLeftAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 10
+  %53 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp79.ascast.i, i64 %53) 
+  %call80.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftLeftAcc.i, ptr byval(%"range") %agg.tmp79.i) 
+  %storedv81.i = zext i1 %52 to i8
+  store i8 %storedv81.i, ptr addrspace(4) %call80.i, align 1
+  %54 = load i32, ptr %LID.i, align 4
+  %call83.i = call spir_func i32 @Foo53(ptr byval(%"tangle_group") %agg.tmp82.i, i32 %54, i32 2) 
+  store i32 %call83.i, ptr %ShiftRightResult.i, align 4
+  %55 = load i32, ptr %LID.i, align 4
+  %cmp84.i = icmp ult i32 %55, 2
+  br i1 %cmp84.i, label %l1.exit, label %lor.rhs85.i
+
+lor.rhs85.i:                                      ; preds = %lor.end.i
+  %56 = load i32, ptr %ShiftRightResult.i, align 4
+  %57 = load i32, ptr %LID.i, align 4
+  %sub.i = sub i32 %57, 2
+  %cmp86.i = icmp eq i32 %56, %sub.i
+  br label %l1.exit
+
+l1.exit: ; preds = %lor.rhs85.i, %lor.end.i
+  %58 = phi i1 [ true, %lor.end.i ], [ %cmp86.i, %lor.rhs85.i ]
+  %ShiftRightAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 11
+  %59 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp88.ascast.i, i64 %59) 
+  %call89.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftRightAcc.i, ptr byval(%"range") %agg.tmp88.i) 
+  %storedv90.i = zext i1 %58 to i8
+  store i8 %storedv90.i, ptr addrspace(4) %call89.i, align 1
+  %60 = load i32, ptr %LID.i, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp93.ascast.i, ptr addrspace(4) %Tangle.ascast.i) 
+  store i32 2, ptr %ref.tmp94.i, align 4
+  call spir_func void @Foo55(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast.i, ptr addrspace(4) %ref.tmp93.ascast.i, ptr addrspace(4) align 4 %ref.tmp94.ascast.i) 
+  call spir_func void @Foo56(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp92.ascast.i, ptr addrspace(4) %ref.tmp.ascast.i, ptr addrspace(4) %TangleSize.addr.ascast.i) 
+  %call95.i = call spir_func i32 @Foo57(ptr byval(%"tangle_group") %agg.tmp91.i, i32 %60, ptr byval(%"range") %agg.tmp92.i) 
+  store i32 %call95.i, ptr %SelectResult.i, align 4
+  %61 = load i32, ptr %SelectResult.i, align 4
+  %conv96.i = zext i32 %61 to i64
+  %62 = load i32, ptr %LID.i, align 4
+  %add97.i = add i32 %62, 2
+  %conv98.i = zext i32 %add97.i to i64
+  %63 = load i64, ptr %TangleSize.addr.i
+  %rem.i = urem i64 %conv98.i, %63
+  %cmp99.i = icmp eq i64 %conv96.i, %rem.i
+  %SelectAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 12
+  %64 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp100.ascast.i, i64 %64) 
+  %call101.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %SelectAcc.i, ptr byval(%"range") %agg.tmp100.i) 
+  %storedv102.i = zext i1 %cmp99.i to i8
+  store i8 %storedv102.i, ptr addrspace(4) %call101.i, align 1
+  %65 = load i32, ptr %LID.i, align 4
+  %call104.i = call spir_func i32 @Foo58(ptr byval(%"tangle_group") %agg.tmp103.i, i32 %65, i32 2) 
+  store i32 %call104.i, ptr %PermuteXorResult.i, align 4
+  %66 = load i32, ptr %PermuteXorResult.i, align 4
+  %67 = load i32, ptr %LID.i, align 4
+  %xor.i = xor i32 %67, 2
+  %cmp105.i = icmp eq i32 %66, %xor.i
+  %PermuteXorAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 13
+  %68 = load i64, ptr %WI.addr.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp106.ascast.i, i64 %68) 
+  %call107.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %PermuteXorAcc.i, ptr byval(%"range") %agg.tmp106.i) 
+  %storedv108.i = zext i1 %cmp105.i to i8
+  store i8 %storedv108.i, ptr addrspace(4) %call107.i, align 1
+  br label %if.end41
+
+if.end41:                                         ; preds = %if.then23, %if.else32, %l1.exit
+  ret void
+
+for.body.i:                                       ; preds = %for.cond.i
+  %69 = load i64, ptr %Other.i
+  %call3.i = call spir_func zeroext i1 @Foo71(ptr addrspace(4) align 1 %IsMember.ascast.i, i64 %69) 
+  br i1 %call3.i, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %for.body.i
+  %70 = load i64, ptr %Other.i
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp5.ascast.i, i64 %70) 
+  %call6.i = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %this1.i78, ptr byval(%"range") %agg.tmp5.i) 
+  %71 = load i64, ptr addrspace(4) %call6.i
+  %72 = load i64, ptr %Visible.i
+  %add.i = add i64 %72, %71
+  store i64 %add.i, ptr %Visible.i
+  br label %if.end.i
+
+if.end.i:                                         ; preds = %if.then.i, %for.body.i
+  %73 = load i64, ptr %Other.i
+  %inc.i = add i64 %73, 1
+  store i64 %inc.i, ptr %Other.i
+  br label %for.cond.i
+}
+
+define internal spir_func void @Foo40(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) align 1 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Init6(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+define internal spir_func void @Foo41(ptr addrspace(4) dead_on_unwind noalias writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) align 1 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  ret void
+}
+
+
+
+
+define internal spir_func void @Foo42(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) align 4 %rhs) {
+entry:
+  %lhs.addr = alloca ptr addrspace(4)
+  %rhs.addr = alloca ptr addrspace(4)
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  store ptr addrspace(4) %lhs, ptr %lhs.addr
+  store ptr addrspace(4) %rhs, ptr %rhs.addr
+  call spir_func void @Foo11(ptr addrspace(4) %agg.result) 
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr addrspace(4), ptr %lhs.addr
+  %common_array1 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %2 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  %3 = load i64, ptr addrspace(4) %arrayidx
+  %4 = load ptr addrspace(4), ptr %rhs.addr
+  %5 = load i32, ptr addrspace(4) %4, align 4
+  %conv = sext i32 %5 to i64
+  %cmp1 = icmp ult i64 %3, %conv
+  %conv2 = zext i1 %cmp1 to i64
+  %common_array32 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4)
+  %6 = load i32, ptr %i, align 4
+  %idxprom4 = sext i32 %6 to i64
+  %arrayidx5 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array32, i64 0, i64 %idxprom4
+  store i64 %conv2, ptr addrspace(4) %arrayidx5
+  %7 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+}
+
+declare void @llvm.assume(i1) 
+
+
+define internal spir_func void @Foo43(ptr addrspace(4) dead_on_unwind noalias writable sret(%"tangle_group") %agg.result, ptr byval(%"nd_item") align 1 %group) {
+entry:
+  %mask = alloca %"ss_sub_group_mask"
+  %agg.tmp = alloca %"nd_item", align 1
+  %agg.tmp1 = alloca %"ss_sub_group_mask"
+  %cleanup.dest.slot = alloca i32, align 4
+  %mask.ascast = addrspacecast ptr %mask to ptr addrspace(4)
+  %group.ascast = addrspacecast ptr %group to ptr addrspace(4)
+  call spir_func void @Foo44(ptr addrspace(4) dead_on_unwind writable sret(%"ss_sub_group_mask") %mask.ascast, ptr byval(%"nd_item") align 1 %agg.tmp, i1 zeroext true) 
+  call spir_func void @Foo45(ptr addrspace(4) %agg.result, ptr byval(%"ss_sub_group_mask") %agg.tmp1) 
+  ret void
+}
+
+
+define internal spir_func void @Foo46(ptr addrspace(4) %this, i64 %dim0) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %dim0.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %dim0, ptr %dim0.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %dim0.addr
+  call spir_func void @Foo60(ptr addrspace(4) %this1, i64 %0) 
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %this, ptr byval(%"range") %Index) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %LinearIndex = alloca i64
+  %agg.tmp = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i64 @Foo93(ptr addrspace(4) %this1, ptr byval(%"range") %agg.tmp) 
+  store i64 %call, ptr %LinearIndex
+  %call2 = call spir_func ptr addrspace(1) @Foo94(ptr addrspace(4) %this1) 
+  %0 = load i64, ptr %LinearIndex
+  %arrayidx = getelementptr inbounds nuw i64, ptr addrspace(1) %call2, i64 %0
+  %arrayidx.ascast = addrspacecast ptr addrspace(1) %arrayidx to ptr addrspace(4)
+  ret ptr addrspace(4) %arrayidx.ascast
+}
+
+
+define internal spir_func void @Foo75(ptr byval(%"tangle_group") %G, i32 %FenceScope) {
+entry:
+  %FenceScope.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  store i32 %FenceScope, ptr %FenceScope.addr, align 4
+  %0 = load i32, ptr %FenceScope.addr, align 4
+  call spir_func void @Foo95(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i32 5) 
+  ret void
+}
+
+
+define internal spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %this, ptr byval(%"range") %Index) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %LinearIndex = alloca i64
+  %agg.tmp = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i64 @Foo77(ptr addrspace(4) %this1, ptr byval(%"range") %agg.tmp) 
+  store i64 %call, ptr %LinearIndex
+  %call2 = call spir_func ptr addrspace(1) @Foo78(ptr addrspace(4) %this1) 
+  %0 = load i64, ptr %LinearIndex
+  %arrayidx = getelementptr inbounds nuw i8, ptr addrspace(1) %call2, i64 %0
+  %arrayidx.ascast = addrspacecast ptr addrspace(1) %arrayidx to ptr addrspace(4)
+  ret ptr addrspace(4) %arrayidx.ascast
+}
+
+
+define internal spir_func i32 @Foo76(ptr addrspace(4) align 1 %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo96(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) align 1 %this1) 
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0) 
+  %0 = load i64, ptr addrspace(4) %call
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+
+define internal spir_func i32 @Foo90(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %this1) 
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0) 
+  %0 = load i64, ptr addrspace(4) %call
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+
+define internal spir_func i32 @Foo91(ptr byval(%"tangle_group") %g, i32 %x, i32 %linear_local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %linear_local_id.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %agg.tmp2 = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %agg.tmp1.ascast = addrspacecast ptr %agg.tmp1 to ptr addrspace(4)
+  %agg.tmp2.ascast = addrspacecast ptr %agg.tmp2 to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %linear_local_id, ptr %linear_local_id.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  call spir_func void @Foo97(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp2.ascast, ptr addrspace(4) %g.ascast) 
+  %1 = load i32, ptr %linear_local_id.addr, align 4
+  %conv = zext i32 %1 to i64
+  call spir_func void @Foo98(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp1.ascast, ptr byval(%"range") %agg.tmp2, i64 %conv) 
+  %call = call spir_func i32 @Bar69(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) 
+  ret i32 %call
+}
+
+
+define internal spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %g, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %0 = load i8, ptr %pred.addr, align 1  
+  %loadedv = trunc i8 %0 to i1
+  %call = call spir_func zeroext i1 @Bar10(ptr byval(%"tangle_group") %agg.tmp, i1 zeroext %loadedv) 
+  ret i1 %call
+}
+
+
+define internal spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %g, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %0 = load i8, ptr %pred.addr, align 1  
+  %loadedv = trunc i8 %0 to i1
+  %call = call spir_func zeroext i1 @Foo66(ptr byval(%"tangle_group") %agg.tmp, i1 zeroext %loadedv) 
+  ret i1 %call
+}
+
+
+define internal spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %g, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %0 = load i8, ptr %pred.addr, align 1  
+  %loadedv = trunc i8 %0 to i1
+  %lnot = xor i1 %loadedv, true
+  %call = call spir_func zeroext i1 @Foo66(ptr byval(%"tangle_group") %agg.tmp, i1 zeroext %lnot) 
+  ret i1 %call
+}
+
+
+define internal spir_func i32 @Foo64(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"nd_item") align 1 %binary_op) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %binary_op.ascast = addrspacecast ptr %binary_op to ptr addrspace(4)
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar11(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %0, ptr byval(%"nd_item") align 1 %agg.tmp2) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Foo63(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"nd_item") align 1 %binary_op) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %res = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %binary_op.ascast = addrspacecast ptr %binary_op to ptr addrspace(4)
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar12(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %0, ptr byval(%"nd_item") align 1 %agg.tmp2) 
+  store i32 %call, ptr %res, align 4
+  %1 = load i32, ptr %res, align 4
+  ret i32 %1
+}
+
+
+define internal spir_func i32 @Foo62(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"nd_item") align 1 %binary_op) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %binary_op.ascast = addrspacecast ptr %binary_op to ptr addrspace(4)
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Foo61(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %0, ptr byval(%"nd_item") align 1 %agg.tmp2) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Foo73(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %delta.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %delta, ptr %delta.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %delta.addr, align 4
+  %call = call spir_func i32 @Foo72(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i32 %1) 
+  ret i32 %call
+}
+
+
+define internal spir_func zeroext i1 @Foo71(ptr addrspace(4) align 1 %this, i64 %Other) {
+entry:
+  %retval = alloca i1, align 1
+  %this.addr = alloca ptr addrspace(4)
+  %Other.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %Other, ptr %Other.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %Other.addr
+  %cmp = icmp ult i64 %0, 4
+  ret i1 %cmp
+}
+
+
+define internal spir_func i32 @Foo53(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %delta.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %delta, ptr %delta.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %delta.addr, align 4
+  %call = call spir_func i32 @Foo52(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i32 %1) 
+  ret i32 %call
+}
+
+
+define internal spir_func void @Foo51(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"ss_sub_group_mask"
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Mask1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %call = call spir_func i32 @Foo47(ptr byval(%"ss_sub_group_mask") %agg.tmp) 
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %conv) 
+  ret void
+}
+
+
+define internal spir_func void @Foo55(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) align 4 %rhs) {
+entry:
+  %lhs.addr = alloca ptr addrspace(4)
+  %rhs.addr = alloca ptr addrspace(4)
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  store ptr addrspace(4) %lhs, ptr %lhs.addr
+  store ptr addrspace(4) %rhs, ptr %rhs.addr
+  call spir_func void @Foo11(ptr addrspace(4) %agg.result) 
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr addrspace(4), ptr %lhs.addr
+  %common_array2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %2 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array2, i64 0, i64 %idxprom
+  %3 = load i64, ptr addrspace(4) %arrayidx
+  %4 = load ptr addrspace(4), ptr %rhs.addr
+  %5 = load i32, ptr addrspace(4) %4, align 4
+  %conv = sext i32 %5 to i64
+  %add = add i64 %3, %conv
+  %common_array13 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4)
+  %6 = load i32, ptr %i, align 4
+  %idxprom2 = sext i32 %6 to i64
+  %arrayidx3 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array13, i64 0, i64 %idxprom2
+  store i64 %add, ptr addrspace(4) %arrayidx3
+  %7 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+}
+
+
+define internal spir_func void @Foo56(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) %rhs) {
+entry:
+  %lhs.addr = alloca ptr addrspace(4)
+  %rhs.addr = alloca ptr addrspace(4)
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  store ptr addrspace(4) %lhs, ptr %lhs.addr
+  store ptr addrspace(4) %rhs, ptr %rhs.addr
+  call spir_func void @Foo11(ptr addrspace(4) %agg.result) 
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr addrspace(4), ptr %lhs.addr
+  %common_array2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %2 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array2, i64 0, i64 %idxprom
+  %3 = load i64, ptr addrspace(4) %arrayidx
+  %4 = load ptr addrspace(4), ptr %rhs.addr
+  %5 = load i64, ptr addrspace(4) %4
+  %rem = urem i64 %3, %5
+  %common_array13 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4)
+  %6 = load i32, ptr %i, align 4
+  %idxprom2 = sext i32 %6 to i64
+  %arrayidx3 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array13, i64 0, i64 %idxprom2
+  store i64 %rem, ptr addrspace(4) %arrayidx3
+  %7 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+}
+
+
+define internal spir_func i32 @Foo57(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Foo59(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Foo58(ptr byval(%"tangle_group") %g, i32 %x, i32 %mask) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %mask.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %agg.tmp1.ascast = addrspacecast ptr %agg.tmp1 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %mask, ptr %mask.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %mask.addr, align 4
+  %conv = zext i32 %1 to i64
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp1.ascast, i64 %conv) 
+  %call = call spir_func i32 @Bar13(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) 
+  ret i32 %call
+}
+
+
+define internal spir_func void @Foo68(ptr addrspace(4) %this, i64 %WI, ptr byval(%"tangle_group") %Tangle, i64 %TangleLeader, i64 %TangleSize, ptr byval(%"nd_item") align 1 %IsMember) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %WI.addr = alloca i64
+  %TangleLeader.addr = alloca i64
+  %TangleSize.addr = alloca i64
+  %agg.tmp = alloca %"range"
+  %agg.tmp2 = alloca %"tangle_group"
+  %Visible = alloca i64
+  %Other = alloca i64
+  %cleanup.dest.slot = alloca i32, align 4
+  %agg.tmp5 = alloca %"range"
+  %agg.tmp8 = alloca %"range"
+  %OriginalLID = alloca i32, align 4
+  %LID = alloca i32, align 4
+  %BroadcastResult = alloca i32, align 4
+  %agg.tmp12 = alloca %"tangle_group"
+  %agg.tmp15 = alloca %"range"
+  %AnyResult = alloca i8, align 1
+  %agg.tmp18 = alloca %"tangle_group"
+  %agg.tmp24 = alloca %"range"
+  %AllResult = alloca i8, align 1
+  %agg.tmp27 = alloca %"tangle_group"
+  %agg.tmp35 = alloca %"range"
+  %NoneResult = alloca i8, align 1
+  %agg.tmp38 = alloca %"tangle_group"
+  %agg.tmp46 = alloca %"range"
+  %ReduceResult = alloca i32, align 4
+  %agg.tmp49 = alloca %"tangle_group"
+  %agg.tmp50 = alloca %"nd_item", align 1
+  %agg.tmp54 = alloca %"range"
+  %ExScanResult = alloca i32, align 4
+  %agg.tmp57 = alloca %"tangle_group"
+  %agg.tmp58 = alloca %"nd_item", align 1
+  %agg.tmp61 = alloca %"range"
+  %IncScanResult = alloca i32, align 4
+  %agg.tmp64 = alloca %"tangle_group"
+  %agg.tmp65 = alloca %"nd_item", align 1
+  %agg.tmp69 = alloca %"range"
+  %ShiftLeftResult = alloca i32, align 4
+  %agg.tmp72 = alloca %"tangle_group"
+  %agg.tmp79 = alloca %"range"
+  %ShiftRightResult = alloca i32, align 4
+  %agg.tmp82 = alloca %"tangle_group"
+  %agg.tmp88 = alloca %"range"
+  %SelectResult = alloca i32, align 4
+  %agg.tmp91 = alloca %"tangle_group"
+  %agg.tmp92 = alloca %"range"
+  %ref.tmp = alloca %"range"
+  %ref.tmp93 = alloca %"range"
+  %ref.tmp94 = alloca i32, align 4
+  %agg.tmp100 = alloca %"range"
+  %PermuteXorResult = alloca i32, align 4
+  %agg.tmp103 = alloca %"tangle_group"
+  %agg.tmp106 = alloca %"range"
+  %TangleSize.addr.ascast = addrspacecast ptr %TangleSize.addr to ptr addrspace(4)
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  %agg.tmp5.ascast = addrspacecast ptr %agg.tmp5 to ptr addrspace(4)
+  %agg.tmp8.ascast = addrspacecast ptr %agg.tmp8 to ptr addrspace(4)
+  %agg.tmp15.ascast = addrspacecast ptr %agg.tmp15 to ptr addrspace(4)
+  %agg.tmp24.ascast = addrspacecast ptr %agg.tmp24 to ptr addrspace(4)
+  %agg.tmp35.ascast = addrspacecast ptr %agg.tmp35 to ptr addrspace(4)
+  %agg.tmp46.ascast = addrspacecast ptr %agg.tmp46 to ptr addrspace(4)
+  %agg.tmp54.ascast = addrspacecast ptr %agg.tmp54 to ptr addrspace(4)
+  %agg.tmp61.ascast = addrspacecast ptr %agg.tmp61 to ptr addrspace(4)
+  %agg.tmp69.ascast = addrspacecast ptr %agg.tmp69 to ptr addrspace(4)
+  %agg.tmp79.ascast = addrspacecast ptr %agg.tmp79 to ptr addrspace(4)
+  %agg.tmp88.ascast = addrspacecast ptr %agg.tmp88 to ptr addrspace(4)
+  %agg.tmp92.ascast = addrspacecast ptr %agg.tmp92 to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %ref.tmp93.ascast = addrspacecast ptr %ref.tmp93 to ptr addrspace(4)
+  %ref.tmp94.ascast = addrspacecast ptr %ref.tmp94 to ptr addrspace(4)
+  %agg.tmp100.ascast = addrspacecast ptr %agg.tmp100 to ptr addrspace(4)
+  %agg.tmp106.ascast = addrspacecast ptr %agg.tmp106 to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %WI, ptr %WI.addr
+  %Tangle.ascast = addrspacecast ptr %Tangle to ptr addrspace(4)
+  store i64 %TangleLeader, ptr %TangleLeader.addr
+  store i64 %TangleSize, ptr %TangleSize.addr
+  %IsMember.ascast = addrspacecast ptr %IsMember to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %TmpAcc1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 %0) 
+  %call = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc1, ptr byval(%"range") %agg.tmp) 
+  store i64 1, ptr addrspace(4) %call
+  call spir_func void @Foo75(ptr byval(%"tangle_group") %agg.tmp2, i32 1) 
+  store i64 0, ptr %Visible
+  store i64 0, ptr %Other
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end, %entry
+  %1 = load i64, ptr %Other
+  %cmp = icmp ult i64 %1, 32
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %2 = load i64, ptr %Visible
+  %3 = load i64, ptr %TangleSize.addr
+  %cmp7 = icmp eq i64 %2, %3
+  %BarrierAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 1
+  %4 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp8.ascast, i64 %4) 
+  %call9 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BarrierAcc, ptr byval(%"range") %agg.tmp8) 
+  %storedv = zext i1 %cmp7 to i8
+  store i8 %storedv, ptr addrspace(4) %call9, align 1
+  %5 = getelementptr inbounds i8, ptr addrspace(4) %this1, i64 64
+  %call10 = call spir_func i32 @Foo76(ptr addrspace(4) align 1 %5) 
+  store i32 %call10, ptr %OriginalLID, align 4
+  %call11 = call spir_func i32 @Foo90(ptr addrspace(4) %Tangle.ascast) 
+  store i32 %call11, ptr %LID, align 4
+  %6 = load i32, ptr %OriginalLID, align 4
+  %call13 = call spir_func i32 @Foo91(ptr byval(%"tangle_group") %agg.tmp12, i32 %6, i32 0) 
+  store i32 %call13, ptr %BroadcastResult, align 4
+  %7 = load i32, ptr %BroadcastResult, align 4
+  %conv = zext i32 %7 to i64
+  %8 = load i64, ptr %TangleLeader.addr
+  %cmp14 = icmp eq i64 %conv, %8
+  %BroadcastAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 3
+  %9 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp15.ascast, i64 %9) 
+  %call16 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BroadcastAcc, ptr byval(%"range") %agg.tmp15) 
+  %storedv17 = zext i1 %cmp14 to i8
+  store i8 %storedv17, ptr addrspace(4) %call16, align 1
+  %10 = load i32, ptr %LID, align 4
+  %cmp19 = icmp eq i32 %10, 0
+  %call20 = call spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %agg.tmp18, i1 zeroext %cmp19) 
+  %storedv21 = zext i1 %call20 to i8
+  store i8 %storedv21, ptr %AnyResult, align 1
+  %11 = load i8, ptr %AnyResult, align 1  
+  %loadedv = trunc i8 %11 to i1
+  %conv22 = zext i1 %loadedv to i32
+  %cmp23 = icmp eq i32 %conv22, 1
+  %AnyAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 4
+  %12 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp24.ascast, i64 %12) 
+  %call25 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AnyAcc, ptr byval(%"range") %agg.tmp24) 
+  %storedv26 = zext i1 %cmp23 to i8
+  store i8 %storedv26, ptr addrspace(4) %call25, align 1
+  %13 = load i32, ptr %LID, align 4
+  %conv28 = zext i32 %13 to i64
+  %14 = load i64, ptr %TangleSize.addr
+  %cmp29 = icmp ult i64 %conv28, %14
+  %call30 = call spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %agg.tmp27, i1 zeroext %cmp29) 
+  %storedv31 = zext i1 %call30 to i8
+  store i8 %storedv31, ptr %AllResult, align 1
+  %15 = load i8, ptr %AllResult, align 1  
+  %loadedv32 = trunc i8 %15 to i1
+  %conv33 = zext i1 %loadedv32 to i32
+  %cmp34 = icmp eq i32 %conv33, 1
+  %AllAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 5
+  %16 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp35.ascast, i64 %16) 
+  %call36 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AllAcc, ptr byval(%"range") %agg.tmp35) 
+  %storedv37 = zext i1 %cmp34 to i8
+  store i8 %storedv37, ptr addrspace(4) %call36, align 1
+  %17 = load i32, ptr %LID, align 4
+  %conv39 = zext i32 %17 to i64
+  %18 = load i64, ptr %TangleSize.addr
+  %cmp40 = icmp uge i64 %conv39, %18
+  %call41 = call spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %agg.tmp38, i1 zeroext %cmp40) 
+  %storedv42 = zext i1 %call41 to i8
+  store i8 %storedv42, ptr %NoneResult, align 1
+  %19 = load i8, ptr %NoneResult, align 1  
+  %loadedv43 = trunc i8 %19 to i1
+  %conv44 = zext i1 %loadedv43 to i32
+  %cmp45 = icmp eq i32 %conv44, 1
+  %NoneAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 6
+  %20 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp46.ascast, i64 %20) 
+  %call47 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %NoneAcc, ptr byval(%"range") %agg.tmp46) 
+  %storedv48 = zext i1 %cmp45 to i8
+  store i8 %storedv48, ptr addrspace(4) %call47, align 1
+  %call51 = call spir_func i32 @Foo64(ptr byval(%"tangle_group") %agg.tmp49, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp50) 
+  store i32 %call51, ptr %ReduceResult, align 4
+  %21 = load i32, ptr %ReduceResult, align 4
+  %conv52 = zext i32 %21 to i64
+  %22 = load i64, ptr %TangleSize.addr
+  %cmp53 = icmp eq i64 %conv52, %22
+  %ReduceAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 7
+  %23 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp54.ascast, i64 %23) 
+  %call55 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ReduceAcc, ptr byval(%"range") %agg.tmp54) 
+  %storedv56 = zext i1 %cmp53 to i8
+  store i8 %storedv56, ptr addrspace(4) %call55, align 1
+  %call59 = call spir_func i32 @Foo63(ptr byval(%"tangle_group") %agg.tmp57, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp58) 
+  store i32 %call59, ptr %ExScanResult, align 4
+  %24 = load i32, ptr %ExScanResult, align 4
+  %25 = load i32, ptr %LID, align 4
+  %cmp60 = icmp eq i32 %24, %25
+  %ExScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 8
+  %26 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp61.ascast, i64 %26) 
+  %call62 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ExScanAcc, ptr byval(%"range") %agg.tmp61) 
+  %storedv63 = zext i1 %cmp60 to i8
+  store i8 %storedv63, ptr addrspace(4) %call62, align 1
+  %call66 = call spir_func i32 @Foo62(ptr byval(%"tangle_group") %agg.tmp64, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp65) 
+  store i32 %call66, ptr %IncScanResult, align 4
+  %27 = load i32, ptr %IncScanResult, align 4
+  %28 = load i32, ptr %LID, align 4
+  %add67 = add i32 %28, 1
+  %cmp68 = icmp eq i32 %27, %add67
+  %IncScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 9
+  %29 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp69.ascast, i64 %29) 
+  %call70 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %IncScanAcc, ptr byval(%"range") %agg.tmp69) 
+  %storedv71 = zext i1 %cmp68 to i8
+  store i8 %storedv71, ptr addrspace(4) %call70, align 1
+  %30 = load i32, ptr %LID, align 4
+  %call73 = call spir_func i32 @Foo73(ptr byval(%"tangle_group") %agg.tmp72, i32 %30, i32 2) 
+  store i32 %call73, ptr %ShiftLeftResult, align 4
+  %31 = load i32, ptr %LID, align 4
+  %add74 = add i32 %31, 2
+  %conv75 = zext i32 %add74 to i64
+  %32 = load i64, ptr %TangleSize.addr
+  %cmp76 = icmp uge i64 %conv75, %32
+  br i1 %cmp76, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %for.cond.cleanup
+  %33 = load i32, ptr %ShiftLeftResult, align 4
+  %34 = load i32, ptr %LID, align 4
+  %add77 = add i32 %34, 2
+  %cmp78 = icmp eq i32 %33, %add77
+  br label %lor.end
+
+lor.end:                                          ; preds = %lor.rhs, %for.cond.cleanup
+  %35 = phi i1 [ true, %for.cond.cleanup ], [ %cmp78, %lor.rhs ]
+  %ShiftLeftAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 10
+  %36 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp79.ascast, i64 %36) 
+  %call80 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftLeftAcc, ptr byval(%"range") %agg.tmp79) 
+  %storedv81 = zext i1 %35 to i8
+  store i8 %storedv81, ptr addrspace(4) %call80, align 1
+  %37 = load i32, ptr %LID, align 4
+  %call83 = call spir_func i32 @Foo53(ptr byval(%"tangle_group") %agg.tmp82, i32 %37, i32 2) 
+  store i32 %call83, ptr %ShiftRightResult, align 4
+  %38 = load i32, ptr %LID, align 4
+  %cmp84 = icmp ult i32 %38, 2
+  br i1 %cmp84, label %lor.end87, label %lor.rhs85
+
+lor.rhs85:                                        ; preds = %lor.end
+  %39 = load i32, ptr %ShiftRightResult, align 4
+  %40 = load i32, ptr %LID, align 4
+  %sub = sub i32 %40, 2
+  %cmp86 = icmp eq i32 %39, %sub
+  br label %lor.end87
+
+lor.end87:                                        ; preds = %lor.rhs85, %lor.end
+  %41 = phi i1 [ true, %lor.end ], [ %cmp86, %lor.rhs85 ]
+  %ShiftRightAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 11
+  %42 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp88.ascast, i64 %42) 
+  %call89 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftRightAcc, ptr byval(%"range") %agg.tmp88) 
+  %storedv90 = zext i1 %41 to i8
+  store i8 %storedv90, ptr addrspace(4) %call89, align 1
+  %43 = load i32, ptr %LID, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp93.ascast, ptr addrspace(4) %Tangle.ascast) 
+  store i32 2, ptr %ref.tmp94, align 4
+  call spir_func void @Foo55(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %ref.tmp93.ascast, ptr addrspace(4) align 4 %ref.tmp94.ascast) 
+  call spir_func void @Foo56(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp92.ascast, ptr addrspace(4) %ref.tmp.ascast, ptr addrspace(4) %TangleSize.addr.ascast) 
+  %call95 = call spir_func i32 @Foo57(ptr byval(%"tangle_group") %agg.tmp91, i32 %43, ptr byval(%"range") %agg.tmp92) 
+  store i32 %call95, ptr %SelectResult, align 4
+  %44 = load i32, ptr %SelectResult, align 4
+  %conv96 = zext i32 %44 to i64
+  %45 = load i32, ptr %LID, align 4
+  %add97 = add i32 %45, 2
+  %conv98 = zext i32 %add97 to i64
+  %46 = load i64, ptr %TangleSize.addr
+  %rem = urem i64 %conv98, %46
+  %cmp99 = icmp eq i64 %conv96, %rem
+  %SelectAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 12
+  %47 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp100.ascast, i64 %47) 
+  %call101 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %SelectAcc, ptr byval(%"range") %agg.tmp100) 
+  %storedv102 = zext i1 %cmp99 to i8
+  store i8 %storedv102, ptr addrspace(4) %call101, align 1
+  %48 = load i32, ptr %LID, align 4
+  %call104 = call spir_func i32 @Foo58(ptr byval(%"tangle_group") %agg.tmp103, i32 %48, i32 2) 
+  store i32 %call104, ptr %PermuteXorResult, align 4
+  %49 = load i32, ptr %PermuteXorResult, align 4
+  %50 = load i32, ptr %LID, align 4
+  %xor = xor i32 %50, 2
+  %cmp105 = icmp eq i32 %49, %xor
+  %PermuteXorAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 13
+  %51 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp106.ascast, i64 %51) 
+  %call107 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %PermuteXorAcc, ptr byval(%"range") %agg.tmp106) 
+  %storedv108 = zext i1 %cmp105 to i8
+  store i8 %storedv108, ptr addrspace(4) %call107, align 1
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %52 = load i64, ptr %Other
+  %call3 = call spir_func zeroext i1 @Foo74(ptr addrspace(4) align 1 %IsMember.ascast, i64 %52) 
+  br i1 %call3, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %TmpAcc42 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %53 = load i64, ptr %Other
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp5.ascast, i64 %53) 
+  %call6 = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc42, ptr byval(%"range") %agg.tmp5) 
+  %54 = load i64, ptr addrspace(4) %call6
+  %55 = load i64, ptr %Visible
+  %add = add i64 %55, %54
+  store i64 %add, ptr %Visible
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %56 = load i64, ptr %Other
+  %inc = add i64 %56, 1
+  store i64 %inc, ptr %Other
+  br label %for.cond
+}
+
+
+define internal spir_func void @Foo69(ptr addrspace(4) %this, i64 %WI, ptr byval(%"tangle_group") %Tangle, i64 %TangleLeader, i64 %TangleSize, ptr byval(%"nd_item") align 1 %IsMember) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %WI.addr = alloca i64
+  %TangleLeader.addr = alloca i64
+  %TangleSize.addr = alloca i64
+  %agg.tmp = alloca %"range"
+  %agg.tmp2 = alloca %"tangle_group"
+  %Visible = alloca i64
+  %Other = alloca i64
+  %cleanup.dest.slot = alloca i32, align 4
+  %agg.tmp5 = alloca %"range"
+  %agg.tmp8 = alloca %"range"
+  %OriginalLID = alloca i32, align 4
+  %LID = alloca i32, align 4
+  %BroadcastResult = alloca i32, align 4
+  %agg.tmp12 = alloca %"tangle_group"
+  %agg.tmp15 = alloca %"range"
+  %AnyResult = alloca i8, align 1
+  %agg.tmp18 = alloca %"tangle_group"
+  %agg.tmp24 = alloca %"range"
+  %AllResult = alloca i8, align 1
+  %agg.tmp27 = alloca %"tangle_group"
+  %agg.tmp35 = alloca %"range"
+  %NoneResult = alloca i8, align 1
+  %agg.tmp38 = alloca %"tangle_group"
+  %agg.tmp46 = alloca %"range"
+  %ReduceResult = alloca i32, align 4
+  %agg.tmp49 = alloca %"tangle_group"
+  %agg.tmp50 = alloca %"nd_item", align 1
+  %agg.tmp54 = alloca %"range"
+  %ExScanResult = alloca i32, align 4
+  %agg.tmp57 = alloca %"tangle_group"
+  %agg.tmp58 = alloca %"nd_item", align 1
+  %agg.tmp61 = alloca %"range"
+  %IncScanResult = alloca i32, align 4
+  %agg.tmp64 = alloca %"tangle_group"
+  %agg.tmp65 = alloca %"nd_item", align 1
+  %agg.tmp69 = alloca %"range"
+  %ShiftLeftResult = alloca i32, align 4
+  %agg.tmp72 = alloca %"tangle_group"
+  %agg.tmp79 = alloca %"range"
+  %ShiftRightResult = alloca i32, align 4
+  %agg.tmp82 = alloca %"tangle_group"
+  %agg.tmp88 = alloca %"range"
+  %SelectResult = alloca i32, align 4
+  %agg.tmp91 = alloca %"tangle_group"
+  %agg.tmp92 = alloca %"range"
+  %ref.tmp = alloca %"range"
+  %ref.tmp93 = alloca %"range"
+  %ref.tmp94 = alloca i32, align 4
+  %agg.tmp100 = alloca %"range"
+  %PermuteXorResult = alloca i32, align 4
+  %agg.tmp103 = alloca %"tangle_group"
+  %agg.tmp106 = alloca %"range"
+  %TangleSize.addr.ascast = addrspacecast ptr %TangleSize.addr to ptr addrspace(4)
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  %agg.tmp5.ascast = addrspacecast ptr %agg.tmp5 to ptr addrspace(4)
+  %agg.tmp8.ascast = addrspacecast ptr %agg.tmp8 to ptr addrspace(4)
+  %agg.tmp15.ascast = addrspacecast ptr %agg.tmp15 to ptr addrspace(4)
+  %agg.tmp24.ascast = addrspacecast ptr %agg.tmp24 to ptr addrspace(4)
+  %agg.tmp35.ascast = addrspacecast ptr %agg.tmp35 to ptr addrspace(4)
+  %agg.tmp46.ascast = addrspacecast ptr %agg.tmp46 to ptr addrspace(4)
+  %agg.tmp54.ascast = addrspacecast ptr %agg.tmp54 to ptr addrspace(4)
+  %agg.tmp61.ascast = addrspacecast ptr %agg.tmp61 to ptr addrspace(4)
+  %agg.tmp69.ascast = addrspacecast ptr %agg.tmp69 to ptr addrspace(4)
+  %agg.tmp79.ascast = addrspacecast ptr %agg.tmp79 to ptr addrspace(4)
+  %agg.tmp88.ascast = addrspacecast ptr %agg.tmp88 to ptr addrspace(4)
+  %agg.tmp92.ascast = addrspacecast ptr %agg.tmp92 to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %ref.tmp93.ascast = addrspacecast ptr %ref.tmp93 to ptr addrspace(4)
+  %ref.tmp94.ascast = addrspacecast ptr %ref.tmp94 to ptr addrspace(4)
+  %agg.tmp100.ascast = addrspacecast ptr %agg.tmp100 to ptr addrspace(4)
+  %agg.tmp106.ascast = addrspacecast ptr %agg.tmp106 to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %WI, ptr %WI.addr
+  %Tangle.ascast = addrspacecast ptr %Tangle to ptr addrspace(4)
+  store i64 %TangleLeader, ptr %TangleLeader.addr
+  store i64 %TangleSize, ptr %TangleSize.addr
+  %IsMember.ascast = addrspacecast ptr %IsMember to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %TmpAcc1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 %0) 
+  %call = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc1, ptr byval(%"range") %agg.tmp) 
+  store i64 1, ptr addrspace(4) %call
+  call spir_func void @Foo75(ptr byval(%"tangle_group") %agg.tmp2, i32 1) 
+  store i64 0, ptr %Visible
+  store i64 0, ptr %Other
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end, %entry
+  %1 = load i64, ptr %Other
+  %cmp = icmp ult i64 %1, 32
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %2 = load i64, ptr %Visible
+  %3 = load i64, ptr %TangleSize.addr
+  %cmp7 = icmp eq i64 %2, %3
+  %BarrierAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 1
+  %4 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp8.ascast, i64 %4) 
+  %call9 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BarrierAcc, ptr byval(%"range") %agg.tmp8) 
+  %storedv = zext i1 %cmp7 to i8
+  store i8 %storedv, ptr addrspace(4) %call9, align 1
+  %5 = getelementptr inbounds i8, ptr addrspace(4) %this1, i64 64
+  %call10 = call spir_func i32 @Foo76(ptr addrspace(4) align 1 %5) 
+  store i32 %call10, ptr %OriginalLID, align 4
+  %call11 = call spir_func i32 @Foo90(ptr addrspace(4) %Tangle.ascast) 
+  store i32 %call11, ptr %LID, align 4
+  %6 = load i32, ptr %OriginalLID, align 4
+  %call13 = call spir_func i32 @Foo91(ptr byval(%"tangle_group") %agg.tmp12, i32 %6, i32 0) 
+  store i32 %call13, ptr %BroadcastResult, align 4
+  %7 = load i32, ptr %BroadcastResult, align 4
+  %conv = zext i32 %7 to i64
+  %8 = load i64, ptr %TangleLeader.addr
+  %cmp14 = icmp eq i64 %conv, %8
+  %BroadcastAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 3
+  %9 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp15.ascast, i64 %9) 
+  %call16 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BroadcastAcc, ptr byval(%"range") %agg.tmp15) 
+  %storedv17 = zext i1 %cmp14 to i8
+  store i8 %storedv17, ptr addrspace(4) %call16, align 1
+  %10 = load i32, ptr %LID, align 4
+  %cmp19 = icmp eq i32 %10, 0
+  %call20 = call spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %agg.tmp18, i1 zeroext %cmp19) 
+  %storedv21 = zext i1 %call20 to i8
+  store i8 %storedv21, ptr %AnyResult, align 1
+  %11 = load i8, ptr %AnyResult, align 1  
+  %loadedv = trunc i8 %11 to i1
+  %conv22 = zext i1 %loadedv to i32
+  %cmp23 = icmp eq i32 %conv22, 1
+  %AnyAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 4
+  %12 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp24.ascast, i64 %12) 
+  %call25 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AnyAcc, ptr byval(%"range") %agg.tmp24) 
+  %storedv26 = zext i1 %cmp23 to i8
+  store i8 %storedv26, ptr addrspace(4) %call25, align 1
+  %13 = load i32, ptr %LID, align 4
+  %conv28 = zext i32 %13 to i64
+  %14 = load i64, ptr %TangleSize.addr
+  %cmp29 = icmp ult i64 %conv28, %14
+  %call30 = call spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %agg.tmp27, i1 zeroext %cmp29) 
+  %storedv31 = zext i1 %call30 to i8
+  store i8 %storedv31, ptr %AllResult, align 1
+  %15 = load i8, ptr %AllResult, align 1  
+  %loadedv32 = trunc i8 %15 to i1
+  %conv33 = zext i1 %loadedv32 to i32
+  %cmp34 = icmp eq i32 %conv33, 1
+  %AllAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 5
+  %16 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp35.ascast, i64 %16) 
+  %call36 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AllAcc, ptr byval(%"range") %agg.tmp35) 
+  %storedv37 = zext i1 %cmp34 to i8
+  store i8 %storedv37, ptr addrspace(4) %call36, align 1
+  %17 = load i32, ptr %LID, align 4
+  %conv39 = zext i32 %17 to i64
+  %18 = load i64, ptr %TangleSize.addr
+  %cmp40 = icmp uge i64 %conv39, %18
+  %call41 = call spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %agg.tmp38, i1 zeroext %cmp40) 
+  %storedv42 = zext i1 %call41 to i8
+  store i8 %storedv42, ptr %NoneResult, align 1
+  %19 = load i8, ptr %NoneResult, align 1  
+  %loadedv43 = trunc i8 %19 to i1
+  %conv44 = zext i1 %loadedv43 to i32
+  %cmp45 = icmp eq i32 %conv44, 1
+  %NoneAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 6
+  %20 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp46.ascast, i64 %20) 
+  %call47 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %NoneAcc, ptr byval(%"range") %agg.tmp46) 
+  %storedv48 = zext i1 %cmp45 to i8
+  store i8 %storedv48, ptr addrspace(4) %call47, align 1
+  %call51 = call spir_func i32 @Foo64(ptr byval(%"tangle_group") %agg.tmp49, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp50) 
+  store i32 %call51, ptr %ReduceResult, align 4
+  %21 = load i32, ptr %ReduceResult, align 4
+  %conv52 = zext i32 %21 to i64
+  %22 = load i64, ptr %TangleSize.addr
+  %cmp53 = icmp eq i64 %conv52, %22
+  %ReduceAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 7
+  %23 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp54.ascast, i64 %23) 
+  %call55 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ReduceAcc, ptr byval(%"range") %agg.tmp54) 
+  %storedv56 = zext i1 %cmp53 to i8
+  store i8 %storedv56, ptr addrspace(4) %call55, align 1
+  %call59 = call spir_func i32 @Foo63(ptr byval(%"tangle_group") %agg.tmp57, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp58) 
+  store i32 %call59, ptr %ExScanResult, align 4
+  %24 = load i32, ptr %ExScanResult, align 4
+  %25 = load i32, ptr %LID, align 4
+  %cmp60 = icmp eq i32 %24, %25
+  %ExScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 8
+  %26 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp61.ascast, i64 %26) 
+  %call62 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ExScanAcc, ptr byval(%"range") %agg.tmp61) 
+  %storedv63 = zext i1 %cmp60 to i8
+  store i8 %storedv63, ptr addrspace(4) %call62, align 1
+  %call66 = call spir_func i32 @Foo62(ptr byval(%"tangle_group") %agg.tmp64, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp65) 
+  store i32 %call66, ptr %IncScanResult, align 4
+  %27 = load i32, ptr %IncScanResult, align 4
+  %28 = load i32, ptr %LID, align 4
+  %add67 = add i32 %28, 1
+  %cmp68 = icmp eq i32 %27, %add67
+  %IncScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 9
+  %29 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp69.ascast, i64 %29) 
+  %call70 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %IncScanAcc, ptr byval(%"range") %agg.tmp69) 
+  %storedv71 = zext i1 %cmp68 to i8
+  store i8 %storedv71, ptr addrspace(4) %call70, align 1
+  %30 = load i32, ptr %LID, align 4
+  %call73 = call spir_func i32 @Foo73(ptr byval(%"tangle_group") %agg.tmp72, i32 %30, i32 2) 
+  store i32 %call73, ptr %ShiftLeftResult, align 4
+  %31 = load i32, ptr %LID, align 4
+  %add74 = add i32 %31, 2
+  %conv75 = zext i32 %add74 to i64
+  %32 = load i64, ptr %TangleSize.addr
+  %cmp76 = icmp uge i64 %conv75, %32
+  br i1 %cmp76, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %for.cond.cleanup
+  %33 = load i32, ptr %ShiftLeftResult, align 4
+  %34 = load i32, ptr %LID, align 4
+  %add77 = add i32 %34, 2
+  %cmp78 = icmp eq i32 %33, %add77
+  br label %lor.end
+
+lor.end:                                          ; preds = %lor.rhs, %for.cond.cleanup
+  %35 = phi i1 [ true, %for.cond.cleanup ], [ %cmp78, %lor.rhs ]
+  %ShiftLeftAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 10
+  %36 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp79.ascast, i64 %36) 
+  %call80 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftLeftAcc, ptr byval(%"range") %agg.tmp79) 
+  %storedv81 = zext i1 %35 to i8
+  store i8 %storedv81, ptr addrspace(4) %call80, align 1
+  %37 = load i32, ptr %LID, align 4
+  %call83 = call spir_func i32 @Foo53(ptr byval(%"tangle_group") %agg.tmp82, i32 %37, i32 2) 
+  store i32 %call83, ptr %ShiftRightResult, align 4
+  %38 = load i32, ptr %LID, align 4
+  %cmp84 = icmp ult i32 %38, 2
+  br i1 %cmp84, label %lor.end87, label %lor.rhs85
+
+lor.rhs85:                                        ; preds = %lor.end
+  %39 = load i32, ptr %ShiftRightResult, align 4
+  %40 = load i32, ptr %LID, align 4
+  %sub = sub i32 %40, 2
+  %cmp86 = icmp eq i32 %39, %sub
+  br label %lor.end87
+
+lor.end87:                                        ; preds = %lor.rhs85, %lor.end
+  %41 = phi i1 [ true, %lor.end ], [ %cmp86, %lor.rhs85 ]
+  %ShiftRightAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 11
+  %42 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp88.ascast, i64 %42) 
+  %call89 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftRightAcc, ptr byval(%"range") %agg.tmp88) 
+  %storedv90 = zext i1 %41 to i8
+  store i8 %storedv90, ptr addrspace(4) %call89, align 1
+  %43 = load i32, ptr %LID, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp93.ascast, ptr addrspace(4) %Tangle.ascast) 
+  store i32 2, ptr %ref.tmp94, align 4
+  call spir_func void @Foo55(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %ref.tmp93.ascast, ptr addrspace(4) align 4 %ref.tmp94.ascast) 
+  call spir_func void @Foo56(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp92.ascast, ptr addrspace(4) %ref.tmp.ascast, ptr addrspace(4) %TangleSize.addr.ascast) 
+  %call95 = call spir_func i32 @Foo57(ptr byval(%"tangle_group") %agg.tmp91, i32 %43, ptr byval(%"range") %agg.tmp92) 
+  store i32 %call95, ptr %SelectResult, align 4
+  %44 = load i32, ptr %SelectResult, align 4
+  %conv96 = zext i32 %44 to i64
+  %45 = load i32, ptr %LID, align 4
+  %add97 = add i32 %45, 2
+  %conv98 = zext i32 %add97 to i64
+  %46 = load i64, ptr %TangleSize.addr
+  %rem = urem i64 %conv98, %46
+  %cmp99 = icmp eq i64 %conv96, %rem
+  %SelectAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 12
+  %47 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp100.ascast, i64 %47) 
+  %call101 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %SelectAcc, ptr byval(%"range") %agg.tmp100) 
+  %storedv102 = zext i1 %cmp99 to i8
+  store i8 %storedv102, ptr addrspace(4) %call101, align 1
+  %48 = load i32, ptr %LID, align 4
+  %call104 = call spir_func i32 @Foo58(ptr byval(%"tangle_group") %agg.tmp103, i32 %48, i32 2) 
+  store i32 %call104, ptr %PermuteXorResult, align 4
+  %49 = load i32, ptr %PermuteXorResult, align 4
+  %50 = load i32, ptr %LID, align 4
+  %xor = xor i32 %50, 2
+  %cmp105 = icmp eq i32 %49, %xor
+  %PermuteXorAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 13
+  %51 = load i64, ptr %WI.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp106.ascast, i64 %51) 
+  %call107 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %PermuteXorAcc, ptr byval(%"range") %agg.tmp106) 
+  %storedv108 = zext i1 %cmp105 to i8
+  store i8 %storedv108, ptr addrspace(4) %call107, align 1
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %52 = load i64, ptr %Other
+  %call3 = call spir_func zeroext i1 @Bar14(ptr addrspace(4) align 1 %IsMember.ascast, i64 %52) 
+  br i1 %call3, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %TmpAcc42 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %53 = load i64, ptr %Other
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp5.ascast, i64 %53) 
+  %call6 = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc42, ptr byval(%"range") %agg.tmp5) 
+  %54 = load i64, ptr addrspace(4) %call6
+  %55 = load i64, ptr %Visible
+  %add = add i64 %55, %54
+  store i64 %add, ptr %Visible
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %56 = load i64, ptr %Other
+  %inc = add i64 %56, 1
+  store i64 %inc, ptr %Other
+  br label %for.cond
+}
+
+
+define internal spir_func zeroext i1 @Bar14(ptr addrspace(4) align 1 %this, i64 %Other) {
+entry:
+  %retval = alloca i1, align 1
+  %this.addr = alloca ptr addrspace(4)
+  %Other.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %Other, ptr %Other.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %Other.addr
+  %cmp = icmp uge i64 %0, 24
+  br i1 %cmp, label %land.rhs, label %land.end
+
+land.rhs:                                         ; preds = %entry
+  %1 = load i64, ptr %Other.addr
+  %cmp2 = icmp ult i64 %1, 32
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %2 = phi i1 [ false, %entry ], [ %cmp2, %land.rhs ]
+  ret i1 %2
+}
+
+
+define internal spir_func zeroext i1 @Foo74(ptr addrspace(4) align 1 %this, i64 %Other) {
+entry:
+  %retval = alloca i1, align 1
+  %this.addr = alloca ptr addrspace(4)
+  %Other.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %Other, ptr %Other.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %Other.addr
+  %cmp = icmp uge i64 %0, 4
+  br i1 %cmp, label %land.rhs, label %land.end
+
+land.rhs:                                         ; preds = %entry
+  %1 = load i64, ptr %Other.addr
+  %cmp2 = icmp ult i64 %1, 24
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %2 = phi i1 [ false, %entry ], [ %cmp2, %land.rhs ]
+  ret i1 %2
+}
+
+
+define internal spir_func i32 @Bar13(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %mask) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %TargetLocalId = alloca %"range"
+  %ref.tmp = alloca %"range"
+  %TargetId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %TargetLocalId.ascast = addrspacecast ptr %TargetLocalId to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %mask.ascast = addrspacecast ptr %mask to ptr addrspace(4)
+  %0 = addrspacecast ptr addrspace(1) @_ZSt6ignore to ptr addrspace(4)
+  %call = call spir_func align 1 ptr addrspace(4) @Bar15(ptr addrspace(4) align 1 %0, ptr addrspace(4) %g.ascast) 
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %g.ascast) 
+  call spir_func void @Bar16(ptr addrspace(4) dead_on_unwind writable sret(%"range") %TargetLocalId.ascast, ptr addrspace(4) %ref.tmp.ascast, ptr addrspace(4) %mask.ascast) 
+  %call2 = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp1) 
+  store i32 %call2, ptr %TargetId, align 4
+  %call3 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast) 
+  %1 = load i32, ptr %TargetId, align 4
+  %call4 = call spir_func i32 @Foo50(i32 3, i32 %call3, i32 %1) 
+  ret i32 %call4
+}
+
+
+define internal spir_func align 1 ptr addrspace(4) @Bar15(ptr addrspace(4) align 1 %this, ptr addrspace(4) %0) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %0, ptr %.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  ret ptr addrspace(4) %this1
+}
+
+
+define internal spir_func void @Bar16(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) %rhs) {
+entry:
+  %lhs.addr = alloca ptr addrspace(4)
+  %rhs.addr = alloca ptr addrspace(4)
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  store ptr addrspace(4) %lhs, ptr %lhs.addr
+  store ptr addrspace(4) %rhs, ptr %rhs.addr
+  call spir_func void @Foo11(ptr addrspace(4) %agg.result) 
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr addrspace(4), ptr %lhs.addr
+  %common_array2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %2 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array2, i64 0, i64 %idxprom
+  %3 = load i64, ptr addrspace(4) %arrayidx
+  %4 = load ptr addrspace(4), ptr %rhs.addr
+  %common_array13 = bitcast ptr addrspace(4) %4 to ptr addrspace(4)
+  %5 = load i32, ptr %i, align 4
+  %idxprom2 = sext i32 %5 to i64
+  %arrayidx3 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array13, i64 0, i64 %idxprom2
+  %6 = load i64, ptr addrspace(4) %arrayidx3
+  %xor = xor i64 %3, %6
+  %common_array44 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4)
+  %7 = load i32, ptr %i, align 4
+  %idxprom5 = sext i32 %7 to i64
+  %arrayidx6 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array44, i64 0, i64 %idxprom5
+  store i64 %xor, ptr addrspace(4) %arrayidx6
+  %8 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %8, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+}
+
+
+define internal spir_func i32 @Foo48(ptr byval(%"tangle_group") %g, ptr byval(%"range") %local_id) {
+entry:
+  %retval.i = alloca i64
+  %this.addr.i = alloca ptr addrspace(4)
+  %Result.i = alloca i64
+  %retval = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %local_id.ascast = addrspacecast ptr %local_id to ptr addrspace(4)
+  %retval.ascast.i = addrspacecast ptr %retval.i to ptr addrspace(4)
+  store ptr addrspace(4) %local_id.ascast, ptr %this.addr.i
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %0 = load i64, ptr addrspace(4) %this1.i
+  store i64 %0, ptr %Result.i
+  %1 = load i64, ptr %Result.i
+  %conv = trunc i64 %1 to i32
+  %call1 = call spir_func i32 @Bar17(ptr byval(%"tangle_group") %agg.tmp, i32 %conv) 
+  ret i32 %call1
+}
+
+
+define internal spir_func i32 @Foo49(ptr addrspace(4) align 4 %x) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %x, ptr %x.addr
+  %0 = load ptr addrspace(4), ptr %x.addr
+  %1 = load i32, ptr addrspace(4) %0, align 4
+  ret i32 %1
+}
+
+declare dso_local spir_func i32 @Foo50(i32, i32, i32) 
+
+
+define internal spir_func i32 @Bar17(ptr byval(%"tangle_group") %Group, i32 %Id) {
+entry:
+  %retval = alloca i32, align 4
+  %Id.addr = alloca i32, align 4
+  %MemberMask = alloca %"vec.16", align 16
+  %agg.tmp = alloca %"ss_sub_group_mask"
+  %agg.tmp1 = alloca %"tangle_group"
+  %Count = alloca i32, align 4
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %b = alloca i32, align 4
+  %MemberMask.ascast = addrspacecast ptr %MemberMask to ptr addrspace(4)
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  store i32 %Id, ptr %Id.addr, align 4
+  call spir_func void @Bar18(ptr addrspace(4) dead_on_unwind writable sret(%"ss_sub_group_mask") %agg.tmp.ascast, ptr byval(%"tangle_group") %agg.tmp1) 
+  call spir_func void @Bar19(ptr addrspace(4) dead_on_unwind writable sret(%"vec.16") align 16 %MemberMask.ascast, ptr byval(%"ss_sub_group_mask") %agg.tmp) 
+  store i32 0, ptr %Count, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.end, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  store i32 2, ptr %cleanup.dest.slot, align 4
+  br label %cleanup12
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, ptr %b, align 4
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %if.end8, %for.body
+  %1 = load i32, ptr %b, align 4
+  %cmp3 = icmp slt i32 %1, 32
+  br i1 %cmp3, label %for.body5, label %for.cond.cleanup4
+
+for.cond.cleanup4:                                ; preds = %for.cond2
+  store i32 5, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+for.body5:                                        ; preds = %for.cond2
+  %2 = load i32, ptr %i, align 4
+  %call = call spir_func align 4 ptr addrspace(4) @Bar20(ptr addrspace(4) align 16 %MemberMask.ascast, i32 %2) 
+  %3 = load i32, ptr addrspace(4) %call, align 4
+  %4 = load i32, ptr %b, align 4
+  %shl = shl i32 1, %4
+  %and = and i32 %3, %shl
+  %tobool = icmp ne i32 %and, 0
+  br i1 %tobool, label %if.then, label %if.end8
+
+if.then:                                          ; preds = %for.body5
+  %5 = load i32, ptr %Count, align 4
+  %6 = load i32, ptr %Id.addr, align 4
+  %cmp6 = icmp eq i32 %5, %6
+  br i1 %cmp6, label %if.then7, label %if.end
+
+if.end:                                           ; preds = %if.then
+  %7 = load i32, ptr %Count, align 4
+  %inc = add i32 %7, 1
+  store i32 %inc, ptr %Count, align 4
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.end, %for.body5
+  %8 = load i32, ptr %b, align 4
+  %inc9 = add nsw i32 %8, 1
+  store i32 %inc9, ptr %b, align 4
+  br label %for.cond2
+
+if.then7:                                         ; preds = %if.then
+  %9 = load i32, ptr %i, align 4
+  %mul = mul nsw i32 %9, 32
+  %10 = load i32, ptr %b, align 4
+  %add = add nsw i32 %mul, %10
+  store i32 %add, ptr %retval, align 4
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.then7, %for.cond.cleanup4
+  %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4
+  %cond = icmp eq i32 %cleanup.dest, 5
+  br i1 %cond, label %for.end, label %cleanup12
+
+for.end:                                          ; preds = %cleanup
+  %11 = load i32, ptr %i, align 4
+  %inc11 = add nsw i32 %11, 1
+  store i32 %inc11, ptr %i, align 4
+  br label %for.cond
+
+cleanup12:                                        ; preds = %cleanup, %for.cond.cleanup
+  %cleanup.dest13 = load i32, ptr %cleanup.dest.slot, align 4
+  %cond1 = icmp eq i32 %cleanup.dest13, 2
+  br i1 %cond1, label %for.end14, label %cleanup15
+
+for.end14:                                        ; preds = %cleanup12
+  %12 = load i32, ptr %Count, align 4
+  store i32 %12, ptr %retval, align 4
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup15
+
+cleanup15:                                        ; preds = %cleanup12, %for.end14
+  %13 = load i32, ptr %retval, align 4
+  ret i32 %13
+}
+
+
+define internal spir_func void @Bar18(ptr addrspace(4) dead_on_unwind noalias writable sret(%"ss_sub_group_mask") %agg.result, ptr byval(%"tangle_group") %Group) {
+entry:
+  %Mask1 = bitcast ptr %Group to ptr
+  ret void
+}
+
+
+define internal spir_func void @Bar19(ptr addrspace(4) dead_on_unwind noalias writable sret(%"vec.16") align 16 %agg.result, ptr byval(%"ss_sub_group_mask") %Mask) {
+entry:
+  %TmpMArray = alloca %"struct.std::array.20", align 4
+  %agg.tmp = alloca %"range"
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %cleanup.dest.slot2 = alloca i32, align 4
+  %TmpMArray.ascast = addrspacecast ptr %TmpMArray to ptr addrspace(4)
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  %Mask.ascast = addrspacecast ptr %Mask to ptr addrspace(4)
+  call spir_func void @Bar50(ptr addrspace(4) align 4 %TmpMArray.ascast) 
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 0) 
+  call spir_func void @Bar51(ptr addrspace(4) %Mask.ascast, ptr addrspace(4) align 4 %TmpMArray.ascast, ptr byval(%"range") %agg.tmp) 
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.cleanup
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, ptr %i, align 4
+  %conv = sext i32 %1 to i64
+  %call = call spir_func align 4 ptr addrspace(4) @Bar57(ptr addrspace(4) align 4 %TmpMArray.ascast, i64 %conv) 
+  %2 = load i32, ptr addrspace(4) %call, align 4
+  %3 = load i32, ptr %i, align 4
+  %call1 = call spir_func align 4 ptr addrspace(4) @Bar20(ptr addrspace(4) align 16 %agg.result, i32 %3) 
+  store i32 %2, ptr addrspace(4) %call1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %4 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %4, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @Bar20(ptr addrspace(4) align 16 %this, i32 %i) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %i.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %i, ptr %i.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %m_Data1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i32, ptr %i.addr, align 4
+  %conv = sext i32 %0 to i64
+  %call = call spir_func align 4 ptr addrspace(4) @_ZNSt5arrayIjLm4EEixEm(ptr addrspace(4) align 4 %m_Data1, i64 %conv) 
+  ret ptr addrspace(4) %call
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @_ZNSt5arrayIjLm4EEixEm(ptr addrspace(4) align 4 %this, i64 %__n) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %__n.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %__n, ptr %__n.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %_M_elems1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %__n.addr
+  %call = call spir_func align 4 ptr addrspace(4) @_ZNSt14__array_traitsIjLm4EE6_S_refERA4_Kjm(ptr addrspace(4) align 4 %_M_elems1, i64 %0) 
+  ret ptr addrspace(4) %call
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @_ZNSt14__array_traitsIjLm4EE6_S_refERA4_Kjm(ptr addrspace(4) align 4 %__t, i64 %__n) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %__t.addr = alloca ptr addrspace(4)
+  %__n.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %__t, ptr %__t.addr
+  store i64 %__n, ptr %__n.addr
+  %0 = load ptr addrspace(4), ptr %__t.addr
+  %1 = load i64, ptr %__n.addr
+  %arrayidx = getelementptr inbounds nuw [4 x i32], ptr addrspace(4) %0, i64 0, i64 %1
+  ret ptr addrspace(4) %arrayidx
+}
+
+
+define internal spir_func void @Bar50(ptr addrspace(4) align 4 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = inttoptr i64 16 to ptr addrspace(4)
+  br label %arrayinit.body
+
+arrayinit.body:                                   ; preds = %arrayinit.body, %entry
+  %lsr.iv = phi i64 [ %lsr.iv.next, %arrayinit.body ], [ 0, %entry ]
+  %scevgep = getelementptr i8, ptr addrspace(4) %this1, i64 %lsr.iv
+  store i32 0, ptr addrspace(4) %scevgep, align 4
+  %lsr.iv.next = add nuw nsw i64 %lsr.iv, 4
+  %lsr.iv.next1 = inttoptr i64 %lsr.iv.next to ptr addrspace(4)
+  %arrayinit.done = icmp eq ptr addrspace(4) %lsr.iv.next1, %0
+  br i1 %arrayinit.done, label %arrayinit.end2, label %arrayinit.body
+
+arrayinit.end2:                                   ; preds = %arrayinit.body
+  ret void
+}
+
+
+define internal spir_func void @Bar51(ptr addrspace(4) %this, ptr addrspace(4) align 4 %bits, ptr byval(%"range") %pos) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %bits.addr = alloca ptr addrspace(4)
+  %cur_pos = alloca i64
+  %__range4 = alloca ptr addrspace(4)
+  %__begin0 = alloca ptr addrspace(4)
+  %__end0 = alloca ptr addrspace(4)
+  %cleanup.dest.slot = alloca i32, align 4
+  %elem = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"range"
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %bits, ptr %bits.addr
+  %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0) 
+  store i64 %call, ptr %cur_pos
+  %0 = load ptr addrspace(4), ptr %bits.addr
+  store ptr addrspace(4) %0, ptr %__range4
+  %1 = load ptr addrspace(4), ptr %__range4
+  %call2 = call spir_func ptr addrspace(4) @Bar53(ptr addrspace(4) align 4 %1) 
+  store ptr addrspace(4) %call2, ptr %__begin0
+  %2 = load ptr addrspace(4), ptr %__range4
+  %call3 = call spir_func ptr addrspace(4) @Bar54(ptr addrspace(4) align 4 %2) 
+  store ptr addrspace(4) %call3, ptr %__end0
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %3 = load ptr addrspace(4), ptr %__begin0
+  %4 = load ptr addrspace(4), ptr %__end0
+  %cmp = icmp ne ptr addrspace(4) %3, %4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.cleanup
+  ret void
+
+for.body:                                         ; preds = %for.cond
+  %5 = load ptr addrspace(4), ptr %__begin0
+  store ptr addrspace(4) %5, ptr %elem
+  %6 = load i64, ptr %cur_pos
+  %call4 = call spir_func i32 @Bar55(ptr addrspace(4) %this1) 
+  %conv = zext i32 %call4 to i64
+  %cmp5 = icmp ult i64 %6, %conv
+  br i1 %cmp5, label %if.then, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %7 = load ptr addrspace(4), ptr %elem
+  store i32 0, ptr addrspace(4) %7, align 4
+  br label %if.end
+
+if.then:                                          ; preds = %for.body
+  %8 = load ptr addrspace(4), ptr %elem
+  %9 = load i64, ptr %cur_pos
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 %9) 
+  call spir_func void @Bar56(ptr addrspace(4) %this1, ptr addrspace(4) align 4 %8, ptr byval(%"range") %agg.tmp) 
+  %10 = load i64, ptr %cur_pos
+  %add = add i64 %10, 32
+  store i64 %add, ptr %cur_pos
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %11 = load ptr addrspace(4), ptr %__begin0
+  %incdec.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %11, i32 1
+  store ptr addrspace(4) %incdec.ptr, ptr %__begin0
+  br label %for.cond
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @Bar57(ptr addrspace(4) align 4 %this, i64 %index) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %index.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %index, ptr %index.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %index.addr
+  %arrayidx = getelementptr inbounds nuw [4 x i32], ptr addrspace(4) %MData1, i64 0, i64 %0
+  ret ptr addrspace(4) %arrayidx
+}
+
+
+define internal spir_func i64 @Bar52(ptr addrspace(4) %this, i32 %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4)
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  %2 = load i64, ptr addrspace(4) %arrayidx
+  ret i64 %2
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar53(ptr addrspace(4) align 4 %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+
+
+  %this.addr1 = bitcast ptr %this.addr to ptr
+  %this.addr2 = bitcast ptr %this.addr1 to ptr
+  %this1 = load ptr addrspace(4), ptr %this.addr2
+
+
+
+;  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4)
+  ret ptr addrspace(4) %arraydecay2
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar54(ptr addrspace(4) align 4 %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+
+
+  %this.addr1 = bitcast ptr %this.addr to ptr
+  %this.addr2 = bitcast ptr %this.addr1 to ptr
+  %this1 = load ptr addrspace(4), ptr %this.addr2
+
+;  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4)
+  %add.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %arraydecay2, i64 4
+  ret ptr addrspace(4) %add.ptr
+}
+
+
+define internal spir_func i32 @Bar55(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %0 = load i64, ptr addrspace(4) %bits_num
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+
+define internal spir_func void @Bar56(ptr addrspace(4) %this, ptr addrspace(4) align 4 %bits, ptr byval(%"range") %pos) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %bits.addr = alloca ptr addrspace(4)
+  %Res = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %bits, ptr %bits.addr
+  %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr addrspace(4) %Bits1
+  store i64 %0, ptr %Res
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load i64, ptr addrspace(4) %bits_num
+  %call = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 %1) 
+  %2 = load i64, ptr %Res
+  %and = and i64 %2, %call
+  store i64 %and, ptr %Res
+  %call2 = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0) 
+  %call3 = call spir_func i32 @Bar55(ptr addrspace(4) %this1) 
+  %conv = zext i32 %call3 to i64
+  %cmp = icmp ult i64 %call2, %conv
+  br i1 %cmp, label %if.then, label %if.else
+
+if.else:                                          ; preds = %entry
+  %3 = load ptr addrspace(4), ptr %bits.addr
+  store i32 0, ptr addrspace(4) %3, align 4
+  br label %if.end11
+
+if.then:                                          ; preds = %entry
+  %call4 = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0) 
+  %cmp5 = icmp ugt i64 %call4, 0
+  br i1 %cmp5, label %if.then6, label %if.end
+
+if.then6:                                         ; preds = %if.then
+  %call7 = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0) 
+  %4 = load i64, ptr %Res
+  %shr = lshr i64 %4, %call7
+  store i64 %shr, ptr %Res
+  br label %if.end
+
+if.end:                                           ; preds = %if.then6, %if.then
+  %call8 = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 32) 
+  %5 = load i64, ptr %Res
+  %and9 = and i64 %5, %call8
+  store i64 %and9, ptr %Res
+  %6 = load i64, ptr %Res
+  %conv10 = trunc i64 %6 to i32
+  %7 = load ptr addrspace(4), ptr %bits.addr
+  store i32 %conv10, ptr addrspace(4) %7, align 4
+  br label %if.end11
+
+if.end11:                                         ; preds = %if.else, %if.end
+  ret void
+}
+
+
+define internal spir_func i64 @Bar58(ptr addrspace(4) %this, i64 %bn) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %bn.addr = alloca i64
+  %one = alloca i64
+  %cleanup.dest.slot = alloca i32, align 4
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %bn, ptr %bn.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %bn.addr
+  %cmp = icmp ule i64 %0, 64
+  %1 = addrspacecast ptr addrspace(1) @.str.2 to ptr addrspace(4)
+  %2 = addrspacecast ptr addrspace(1) @.str.1 to ptr addrspace(4)
+  %3 = addrspacecast ptr addrspace(1) @__PRETTY_FUNCTION2 to ptr addrspace(4)
+  br i1 %cmp, label %cond.end, label %cond.false
+
+cond.false:                                       ; preds = %entry
+  call spir_func void @__assert_fail(ptr addrspace(4) %1, ptr addrspace(4) %2, i32 327, ptr addrspace(4) %3) 
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.false
+  store i64 1, ptr %one
+  %4 = load i64, ptr %bn.addr
+  %cmp2 = icmp eq i64 %4, 64
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.end:                                           ; preds = %cond.end
+  %5 = load i64, ptr %one
+  %6 = load i64, ptr %bn.addr
+  %shl = shl i64 %5, %6
+  %7 = load i64, ptr %one
+  %sub3 = sub i64 %shl, %7
+  store i64 %sub3, ptr %retval
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+if.then:                                          ; preds = %cond.end
+  %8 = load i64, ptr %one
+  %sub = sub i64 0, %8
+  store i64 %sub, ptr %retval
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.end, %if.then
+  %9 = load i64, ptr %retval
+  ret i64 %9
+}
+
+
+
+
+define internal spir_func void @Foo11(ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo60(ptr addrspace(4) %this1, i64 0) 
+  ret void
+}
+
+
+define internal spir_func void @Foo60(ptr addrspace(4) %this, i64 %dim0) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %dim0.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %dim0, ptr %dim0.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %dim0.addr
+  store i64 %0, ptr addrspace(4) %common_array1
+  ret void
+}
+
+
+define internal spir_func i32 @Foo59(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %LocalId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %call = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp1) 
+  store i32 %call, ptr %LocalId, align 4
+  %0 = addrspacecast ptr addrspace(1) @_ZSt6ignore to ptr addrspace(4)
+  %call2 = call spir_func align 1 ptr addrspace(4) @Bar15(ptr addrspace(4) align 1 %0, ptr addrspace(4) %g.ascast) 
+  %call3 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast) 
+  %1 = load i32, ptr %LocalId, align 4
+  %call4 = call spir_func i32 @Foo50(i32 3, i32 %call3, i32 %1) 
+  ret i32 %call4
+}
+
+
+define internal spir_func i32 @Foo47(ptr byval(%"ss_sub_group_mask") %Mask) {
+entry:
+  %retval = alloca i32, align 4
+  %MemberMask = alloca %"vec.16", align 16
+  %agg.tmp = alloca %"ss_sub_group_mask"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %MemberMask.ascast = addrspacecast ptr %MemberMask to ptr addrspace(4)
+  call spir_func void @Bar19(ptr addrspace(4) dead_on_unwind writable sret(%"vec.16") align 16 %MemberMask.ascast, ptr byval(%"ss_sub_group_mask") %agg.tmp) 
+  %call = call spir_func <4 x i32> @Bar59(ptr addrspace(4) align 16 %MemberMask.ascast) 
+  %call1 = call spir_func i32 @_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(i32 3, i32 2, <4 x i32> %call) 
+  ret i32 %call1
+}
+
+
+define internal spir_func <4 x i32> @Bar59(ptr addrspace(4) align 16 %x) {
+entry:
+  %retval = alloca <4 x i32>, align 16
+  %x.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %x, ptr %x.addr
+  %0 = load ptr addrspace(4), ptr %x.addr
+  %call = call spir_func <4 x i32> @Bar60(ptr addrspace(4) align 16 %0) 
+  ret <4 x i32> %call
+}
+
+declare dso_local spir_func i32 @_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(i32, i32, <4 x i32>) 
+
+
+define internal spir_func <4 x i32> @Bar60(ptr addrspace(4) align 16 %from) {
+entry:
+  %retval = alloca <4 x i32>, align 16
+  %from.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %from, ptr %from.addr
+  %0 = load ptr addrspace(4), ptr %from.addr
+  %1 = load <4 x i32>, ptr addrspace(4) %0, align 16
+  ret <4 x i32> %1
+}
+
+
+define internal spir_func i32 @Foo52(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %delta.addr = alloca i32, align 4
+  %TargetLocalId = alloca %"range"
+  %TargetId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp3 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %TargetLocalId.ascast = addrspacecast ptr %TargetLocalId to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %delta, ptr %delta.addr, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %TargetLocalId.ascast, ptr addrspace(4) %g.ascast) 
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0) 
+  %0 = load i64, ptr addrspace(4) %call
+  %1 = load i32, ptr %delta.addr, align 4
+  %conv = zext i32 %1 to i64
+  %cmp = icmp uge i64 %0, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = load i32, ptr %delta.addr, align 4
+  %conv1 = zext i32 %2 to i64
+  %call2 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0) 
+  %3 = load i64, ptr addrspace(4) %call2
+  %sub = sub i64 %3, %conv1
+  store i64 %sub, ptr addrspace(4) %call2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %call4 = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp3) 
+  store i32 %call4, ptr %TargetId, align 4
+  %call5 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast) 
+  %4 = load i32, ptr %TargetId, align 4
+  %call6 = call spir_func i32 @Foo50(i32 3, i32 %call5, i32 %4) 
+  ret i32 %call6
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %this, i32 %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4)
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  ret ptr addrspace(4) %arrayidx
+}
+
+
+define internal spir_func i32 @Foo72(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %delta.addr = alloca i32, align 4
+  %TargetLocalId = alloca %"range"
+  %TargetId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp6 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %TargetLocalId.ascast = addrspacecast ptr %TargetLocalId to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %delta, ptr %delta.addr, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %TargetLocalId.ascast, ptr addrspace(4) %g.ascast) 
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0) 
+  %0 = load i64, ptr addrspace(4) %call
+  %1 = load i32, ptr %delta.addr, align 4
+  %conv = zext i32 %1 to i64
+  %add = add i64 %0, %conv
+  %call1 = call spir_func i32 @Bar61(ptr addrspace(4) %g.ascast) 
+  %conv2 = zext i32 %call1 to i64
+  %cmp = icmp ult i64 %add, %conv2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = load i32, ptr %delta.addr, align 4
+  %conv3 = zext i32 %2 to i64
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0) 
+  %3 = load i64, ptr addrspace(4) %call4
+  %add5 = add i64 %3, %conv3
+  store i64 %add5, ptr addrspace(4) %call4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %call7 = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp6) 
+  store i32 %call7, ptr %TargetId, align 4
+  %call8 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast) 
+  %4 = load i32, ptr %TargetId, align 4
+  %call9 = call spir_func i32 @Foo50(i32 3, i32 %call8, i32 %4) 
+  ret i32 %call9
+}
+
+
+define internal spir_func i32 @Bar61(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo97(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %this1) 
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0) 
+  %0 = load i64, ptr addrspace(4) %call
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+
+define internal spir_func void @Foo97(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Mask1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %call = call spir_func i32 @Bar62(ptr addrspace(4) %Mask1) 
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %conv) 
+  ret void
+}
+
+
+define internal spir_func i32 @Bar62(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %count = alloca i32, align 4
+  %word = alloca i64
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i32 0, ptr %count, align 4
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr addrspace(4) %Bits1
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load i64, ptr addrspace(4) %bits_num
+  %call = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 %1) 
+  %and = and i64 %0, %call
+  store i64 %and, ptr %word
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %2 = load i64, ptr %word
+  %tobool = icmp ne i64 %2, 0
+  br i1 %tobool, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.cond
+  %3 = load i32, ptr %count, align 4
+  ret i32 %3
+
+while.body:                                       ; preds = %while.cond
+  %4 = load i64, ptr %word
+  %sub = sub i64 %4, 1
+  %5 = load i64, ptr %word
+  %and2 = and i64 %5, %sub
+  store i64 %and2, ptr %word
+  %6 = load i32, ptr %count, align 4
+  %inc = add i32 %6, 1
+  store i32 %inc, ptr %count, align 4
+  br label %while.cond
+}
+
+
+define internal spir_func void @Foo9(ptr addrspace(4) %this, i64 %dim0) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %dim0.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %dim0, ptr %dim0.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %dim0.addr
+  call spir_func void @Foo60(ptr addrspace(4) %this1, i64 %0) 
+  ret void
+}
+
+
+define internal spir_func i32 @Foo61(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar63(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %4, ptr byval(%"nd_item") align 1 %agg.tmp2) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar63(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar64(ptr byval(%"tangle_group") %agg.tmp, i32 %4) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar64(ptr byval(%"tangle_group") %0, i32 %x)   {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %Arg = alloca i32, align 4
+  %Ret = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %2 = load i32, ptr %x.addr, align 4
+  store i32 %2, ptr %Arg, align 4
+  %3 = load i32, ptr %Arg, align 4
+  %call = call spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32 3, i32 1, i32 %3) 
+  store i32 %call, ptr %Ret, align 4
+  %4 = load i32, ptr %Ret, align 4
+  ret i32 %4
+}
+
+declare dso_local spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32, i32, i32) 
+
+
+define internal spir_func i32 @Bar12(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar65(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %4, ptr byval(%"nd_item") align 1 %agg.tmp2) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar65(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar66(ptr byval(%"tangle_group") %agg.tmp, i32 %4) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar66(ptr byval(%"tangle_group") %0, i32 %x)   {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %Arg = alloca i32, align 4
+  %Ret = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %2 = load i32, ptr %x.addr, align 4
+  store i32 %2, ptr %Arg, align 4
+  %3 = load i32, ptr %Arg, align 4
+  %call = call spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32 3, i32 2, i32 %3) 
+  store i32 %call, ptr %Ret, align 4
+  %4 = load i32, ptr %Ret, align 4
+  ret i32 %4
+}
+
+
+define internal spir_func i32 @Bar11(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar67(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %4, ptr byval(%"nd_item") align 1 %agg.tmp2) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar67(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar68(ptr byval(%"tangle_group") %agg.tmp, i32 %4) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar68(ptr byval(%"tangle_group") %0, i32 %x)   {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %Arg = alloca i32, align 4
+  %Ret = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %2 = load i32, ptr %x.addr, align 4
+  store i32 %2, ptr %Arg, align 4
+  %3 = load i32, ptr %Arg, align 4
+  %call = call spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32 3, i32 0, i32 %3) 
+  store i32 %call, ptr %Ret, align 4
+  %4 = load i32, ptr %Ret, align 4
+  ret i32 %4
+}
+
+
+define internal spir_func zeroext i1 @Foo66(ptr byval(%"tangle_group") %0, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %2 = load i8, ptr %pred.addr, align 1  
+  %loadedv = trunc i8 %2 to i1
+  %call = call spir_func zeroext i1 @Foo99(i32 3, i1 zeroext %loadedv) 
+  ret i1 %call
+}
+
+declare dso_local spir_func zeroext i1 @Foo99(i32, i1 zeroext) 
+
+
+define internal spir_func zeroext i1 @Bar10(ptr byval(%"tangle_group") %0, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %2 = load i8, ptr %pred.addr, align 1  
+  %loadedv = trunc i8 %2 to i1
+  %call = call spir_func zeroext i1 @_Z26__spirv_GroupNonUniformAnyN5__spv5Scope4FlagEb(i32 3, i1 zeroext %loadedv) 
+  ret i1 %call
+}
+
+declare dso_local spir_func zeroext i1 @_Z26__spirv_GroupNonUniformAnyN5__spv5Scope4FlagEb(i32, i1 zeroext) 
+
+
+define internal spir_func void @Foo98(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr byval(%"range") %0, i64 %linear_id)   {
+entry:
+  %linear_id.addr = alloca i64
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i64 %linear_id, ptr %linear_id.addr
+  %2 = load i64, ptr %linear_id.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %2) 
+  ret void
+}
+
+
+define internal spir_func i32 @Bar69(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar70(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) 
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar70(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %VecId = alloca %"range"
+  %OCLX = alloca i32, align 4
+  %WideOCLX = alloca i32, align 4
+  %OCLId = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %VecId.ascast = addrspacecast ptr %VecId to ptr addrspace(4)
+  %OCLX.ascast = addrspacecast ptr %OCLX to ptr addrspace(4)
+  %WideOCLX.ascast = addrspacecast ptr %WideOCLX to ptr addrspace(4)
+  %OCLId.ascast = addrspacecast ptr %OCLId to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %local_id.ascast = addrspacecast ptr %local_id to ptr addrspace(4)
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %local_id.ascast, i32 0) 
+  %1 = load i64, ptr addrspace(4) %call
+  %call1 = call spir_func i32 @Bar71(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i64 %1) 
+  ret i32 %call1
+}
+
+
+define internal spir_func i32 @Bar71(ptr byval(%"tangle_group") %g, i32 %x, i64 %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %local_id.addr = alloca i64
+  %LocalId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %GroupLocalId = alloca i32, align 4
+  %OCLX = alloca i32, align 4
+  %WideOCLX = alloca i32, align 4
+  %OCLId = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %GroupLocalId.ascast = addrspacecast ptr %GroupLocalId to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i64 %local_id, ptr %local_id.addr
+  %0 = load i64, ptr %local_id.addr
+  %conv = trunc i64 %0 to i32
+  %call = call spir_func i32 @Bar17(ptr byval(%"tangle_group") %agg.tmp, i32 %conv) 
+  store i32 %call, ptr %LocalId, align 4
+  %1 = load i32, ptr %LocalId, align 4
+  store i32 %1, ptr %GroupLocalId, align 4
+  %call1 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast) 
+  store i32 %call1, ptr %OCLX, align 4
+  %2 = load i32, ptr %OCLX, align 4
+  store i32 %2, ptr %WideOCLX, align 4
+  %call2 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %GroupLocalId.ascast) 
+  store i32 %call2, ptr %OCLId, align 4
+  %3 = load i32, ptr %WideOCLX, align 4
+  %4 = load i32, ptr %OCLId, align 4
+  %call3 = call spir_func i32 @_Z32__spirv_GroupNonUniformBroadcastIjjET_N5__spv5Scope4FlagES0_T0_(i32 3, i32 %3, i32 %4) 
+  ret i32 %call3
+}
+
+declare dso_local spir_func i32 @_Z32__spirv_GroupNonUniformBroadcastIjjET_N5__spv5Scope4FlagES0_T0_(i32, i32, i32) 
+
+
+define internal spir_func void @Foo96(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) align 1 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() 
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %conv) 
+  ret void
+}
+
+
+define internal spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv()   {
+entry:
+  %retval = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4
+  ret i32 %0
+}
+
+
+define internal spir_func i64 @Foo77(ptr addrspace(4) %this, ptr byval(%"range") %Id) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %Result = alloca i64
+  %ref.tmp = alloca %class.anon.15
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %Result.ascast = addrspacecast ptr %Result to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %Id.ascast = addrspacecast ptr %Id to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %Result
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %Result2 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %Result.ascast, ptr %Result2
+  %Id3 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 2
+  store ptr addrspace(4) %Id.ascast, ptr %Id3
+  call spir_func void @Foo79(ptr addrspace(4) %ref.tmp.ascast) 
+  %1 = load i64, ptr %Result
+  ret i64 %1
+}
+
+
+define internal spir_func ptr addrspace(1) @Foo78(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(1)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load ptr addrspace(1), ptr addrspace(4) %0
+  ret ptr addrspace(1) %1
+}
+
+
+define internal spir_func void @Foo79(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo80(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) 
+  ret void
+}
+
+
+define internal spir_func void @Foo80(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) 
+  call spir_func void @Foo81(ptr addrspace(4) %2, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %this) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  ret i64 0
+}
+
+
+define internal spir_func void @Foo81(ptr addrspace(4) %this, i64 %I)  align 2  {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %Result = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %Result
+  %3 = load i64, ptr addrspace(4) %2
+  %call = call spir_func ptr addrspace(4) @Bar72(ptr addrspace(4) %1) 
+  %4 = load i64, ptr %I.addr
+  %conv = trunc i64 %4 to i32
+  %call2 = call spir_func i64 @Foo37(ptr addrspace(4) %call, i32 %conv) 
+  %mul = mul i64 %3, %call2
+  %Id = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 2
+  %5 = load ptr addrspace(4), ptr addrspace(4) %Id
+  %6 = load i64, ptr %I.addr
+  %conv3 = trunc i64 %6 to i32
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %5, i32 %conv3) 
+  %7 = load i64, ptr addrspace(4) %call4
+  %add = add i64 %mul, %7
+  %Result5 = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %8 = load ptr addrspace(4), ptr addrspace(4) %Result5
+  store i64 %add, ptr addrspace(4) %8
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar72(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  ret ptr addrspace(4) %MemRange
+}
+
+
+define internal spir_func i64 @Foo37(ptr addrspace(4) %this, i32 %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4)
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  %2 = load i64, ptr addrspace(4) %arrayidx
+  ret i64 %2
+}
+
+
+define internal spir_func void @Foo95(ptr byval(%"tangle_group") %g, i32 %FenceScope, i32 %Order) {
+entry:
+  %FenceScope.addr = alloca i32, align 4
+  %Order.addr = alloca i32, align 4
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %FenceScope, ptr %FenceScope.addr, align 4
+  store i32 %Order, ptr %Order.addr, align 4
+  %0 = load i32, ptr %FenceScope.addr, align 4
+  %call = call spir_func i32 @Bar73(i32 %0) 
+  %1 = load i32, ptr %Order.addr, align 4
+  %call1 = call spir_func i32 @Bar74(i32 %1) 
+  %or = or i32 %call1, 128
+  %or2 = or i32 %or, 256
+  %or3 = or i32 %or2, 512
+  call spir_func void @_Z21__spirv_MemoryBarrierjj(i32 %call, i32 %or3) 
+  ret void
+}
+
+
+define internal spir_func i32 @Bar73(i32 %Scope){
+entry:
+  %retval = alloca i32, align 4
+  %Scope.addr = alloca i32, align 4
+  store i32 %Scope, ptr %Scope.addr, align 4
+  %0 = load i32, ptr %Scope.addr, align 4
+  switch i32 %0, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb4
+  ]
+
+sw.bb4:                                           ; preds = %entry
+  store i32 0, ptr %retval, align 4
+  br label %return
+
+sw.bb3:                                           ; preds = %entry
+  store i32 1, ptr %retval, align 4
+  br label %return
+
+sw.bb2:                                           ; preds = %entry
+  store i32 2, ptr %retval, align 4
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  store i32 3, ptr %retval, align 4
+  br label %return
+
+sw.bb:                                            ; preds = %entry
+  store i32 4, ptr %retval, align 4
+  br label %return
+
+return:                                           ; preds = %sw.bb4, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %1 = load i32, ptr %retval, align 4
+  ret i32 %1
+
+sw.epilog:                                        ; preds = %entry
+  unreachable
+}
+
+
+define internal spir_func i32 @Bar74(i32 %Order){
+entry:
+  %retval = alloca i32, align 4
+  %Order.addr = alloca i32, align 4
+  %SpvOrder = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %Order, ptr %Order.addr, align 4
+  store i32 0, ptr %SpvOrder, align 4
+  %0 = load i32, ptr %Order.addr, align 4
+  switch i32 %0, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 2, label %sw.bb1
+    i32 1, label %sw.bb1
+    i32 3, label %sw.bb2
+    i32 4, label %sw.bb3
+    i32 5, label %sw.bb4
+  ]
+
+sw.bb4:                                           ; preds = %entry
+  store i32 16, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %entry
+  store i32 8, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %entry
+  store i32 4, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %entry, %entry
+  store i32 2, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb:                                            ; preds = %entry
+  store i32 0, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.bb4, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb, %entry
+  %1 = load i32, ptr %SpvOrder, align 4
+  %or = or i32 %1, 128
+  %or5 = or i32 %or, 256
+  %or6 = or i32 %or5, 512
+  ret i32 %or6
+}
+
+declare dso_local spir_func void @_Z21__spirv_MemoryBarrierjj(i32, i32) 
+
+
+define internal spir_func i64 @Foo93(ptr addrspace(4) %this, ptr byval(%"range") %Id) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %Result = alloca i64
+  %ref.tmp = alloca %class.anon.15
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %Result.ascast = addrspacecast ptr %Result to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %Id.ascast = addrspacecast ptr %Id to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %Result
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %Result2 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %Result.ascast, ptr %Result2
+  %Id3 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 2
+  store ptr addrspace(4) %Id.ascast, ptr %Id3
+  call spir_func void @Bar75(ptr addrspace(4) %ref.tmp.ascast) 
+  %1 = load i64, ptr %Result
+  ret i64 %1
+}
+
+
+define internal spir_func ptr addrspace(1) @Foo94(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(1)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load ptr addrspace(1), ptr addrspace(4) %0
+  ret ptr addrspace(1) %1
+}
+
+
+define internal spir_func void @Bar75(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Bar76(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) 
+  ret void
+}
+
+
+define internal spir_func void @Bar76(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) 
+  call spir_func void @Bar767(ptr addrspace(4) %2, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func void @Bar767(ptr addrspace(4) %this, i64 %I)  align 2  {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %Result = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %Result
+  %3 = load i64, ptr addrspace(4) %2
+  %call = call spir_func ptr addrspace(4) @Bar78(ptr addrspace(4) %1) 
+  %4 = load i64, ptr %I.addr
+  %conv = trunc i64 %4 to i32
+  %call2 = call spir_func i64 @Foo37(ptr addrspace(4) %call, i32 %conv) 
+  %mul = mul i64 %3, %call2
+  %Id = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 2
+  %5 = load ptr addrspace(4), ptr addrspace(4) %Id
+  %6 = load i64, ptr %I.addr
+  %conv3 = trunc i64 %6 to i32
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %5, i32 %conv3) 
+  %7 = load i64, ptr addrspace(4) %call4
+  %add = add i64 %mul, %7
+  %Result5 = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %8 = load ptr addrspace(4), ptr addrspace(4) %Result5
+  store i64 %add, ptr addrspace(4) %8
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar78(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  ret ptr addrspace(4) %MemRange
+}
+
+
+define internal spir_func void @Foo44(ptr addrspace(4) dead_on_unwind noalias writable sret(%"ss_sub_group_mask") %agg.result, ptr byval(%"nd_item") align 1 %g, i1 zeroext %predicate) {
+entry:
+  %predicate.addr = alloca i8, align 1
+  %res = alloca <4 x i32>, align 16
+  %val = alloca i64
+  %ref.tmp = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  %storedv = zext i1 %predicate to i8
+  store i8 %storedv, ptr %predicate.addr, align 1
+  %0 = load i8, ptr %predicate.addr, align 1  
+  %loadedv = trunc i8 %0 to i1
+  %call = call spir_func <4 x i32> @_Z29__spirv_GroupNonUniformBallotjb(i32 3, i1 zeroext %loadedv) 
+  store <4 x i32> %call, ptr %res, align 16
+  %1 = load <4 x i32>, ptr %res, align 16
+  %vecext = extractelement <4 x i32> %1, i32 0
+  %conv = zext i32 %vecext to i64
+  store i64 %conv, ptr %val
+  %2 = load <4 x i32>, ptr %res, align 16
+  %vecext1 = extractelement <4 x i32> %2, i32 1
+  %conv2 = zext i32 %vecext1 to i64
+  %shl = shl i64 %conv2, 32
+  %3 = load i64, ptr %val
+  %or = or i64 %3, %shl
+  store i64 %or, ptr %val
+  %4 = load i64, ptr %val
+  call spir_func void @Bar79(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) align 1 %g.ascast) 
+  %call3 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0) 
+  %5 = load i64, ptr addrspace(4) %call3
+  call spir_func void @Bar80(ptr addrspace(4) dead_on_unwind writable sret(%"ss_sub_group_mask") %agg.result, i64 %4, i64 %5) 
+  ret void
+}
+
+
+define internal spir_func void @Foo45(ptr addrspace(4) %this, ptr byval(%"ss_sub_group_mask") %m) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Mask1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  ret void
+}
+
+declare dso_local spir_func <4 x i32> @_Z29__spirv_GroupNonUniformBallotjb(i32, i1 zeroext) 
+
+
+define internal spir_func void @Bar79(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) align 1 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i32 @_Z23__spirv_SubgroupMaxSizev() 
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %conv) 
+  ret void
+}
+
+
+define internal spir_func void @Bar80(ptr addrspace(4) dead_on_unwind noalias writable sret(%"ss_sub_group_mask") %agg.result, i64 %Bits, i64 %BitsNum) {
+entry:
+  %Bits.addr = alloca i64
+  %BitsNum.addr = alloca i64
+  store i64 %Bits, ptr %Bits.addr
+  store i64 %BitsNum, ptr %BitsNum.addr
+  %0 = load i64, ptr %Bits.addr
+  %1 = load i64, ptr %BitsNum.addr
+  call spir_func void @Bar81(ptr addrspace(4) %agg.result, i64 %0, i64 %1) 
+  ret void
+}
+
+
+define internal spir_func void @Bar81(ptr addrspace(4) %this, i64 %rhs, i64 %bn) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %rhs.addr = alloca i64
+  %bn.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %rhs, ptr %rhs.addr
+  store i64 %bn, ptr %bn.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %rhs.addr
+  %1 = load i64, ptr %bn.addr
+  %call = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 %1) 
+  %and = and i64 %0, %call
+  store i64 %and, ptr addrspace(4) %Bits1
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load i64, ptr %bn.addr
+  store i64 %2, ptr addrspace(4) %bits_num
+  %bits_num2 = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %3 = load i64, ptr addrspace(4) %bits_num2
+  %cmp = icmp ule i64 %3, 64
+  %4 = addrspacecast ptr addrspace(1) @.str to ptr addrspace(4)
+  %5 = addrspacecast ptr addrspace(1) @.str.1 to ptr addrspace(4)
+  %6 = addrspacecast ptr addrspace(1) @__PRETTY_FUNCTION1 to ptr addrspace(4)
+  br i1 %cmp, label %cond.end, label %cond.false
+
+cond.false:                                       ; preds = %entry
+  call spir_func void @__assert_fail(ptr addrspace(4) %4, ptr addrspace(4) %5, i32 324, ptr addrspace(4) %6) 
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.false
+  ret void
+}
+
+
+define internal spir_func i32 @_Z23__spirv_SubgroupMaxSizev()   {
+entry:
+  %retval = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupMaxSize, align 4
+  ret i32 %0
+}
+
+
+define internal spir_func void @Init6(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Inv1(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Inv1(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @Inv2() 
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @Inv2() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo7(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Foo8(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Init1(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Inv3(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Init2(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize1(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Init3(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize2(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Init4(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize3(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Init5(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize4(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) 
+  ret void
+}
+
+
+define internal spir_func void @Foo23(ptr addrspace(4) dead_on_unwind noalias writable sret(%"group") %agg.result, ptr addrspace(4) %Global, ptr addrspace(4) %Local, ptr addrspace(4) %Group, ptr addrspace(4) %Index) {
+entry:
+  %Global.addr = alloca ptr addrspace(4)
+  %Local.addr = alloca ptr addrspace(4)
+  %Group.addr = alloca ptr addrspace(4)
+  %Index.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"range"
+  store ptr addrspace(4) %Global, ptr %Global.addr
+  store ptr addrspace(4) %Local, ptr %Local.addr
+  store ptr addrspace(4) %Group, ptr %Group.addr
+  store ptr addrspace(4) %Index, ptr %Index.addr
+  %0 = load ptr addrspace(4), ptr %Global.addr
+  %1 = load ptr addrspace(4), ptr %Local.addr
+  %2 = load ptr addrspace(4), ptr %Group.addr
+  %3 = load ptr addrspace(4), ptr %Index.addr
+  call spir_func void @Bar82(ptr addrspace(4) %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr byval(%"range") %agg.tmp, ptr addrspace(4) %3) 
+  ret void
+}
+
+
+define internal spir_func void @Foo24(ptr addrspace(4) dead_on_unwind noalias writable sret(%"item") %agg.result, ptr addrspace(4) %Extent, ptr addrspace(4) %Index, ptr addrspace(4) %Offset) {
+entry:
+  %Extent.addr = alloca ptr addrspace(4)
+  %Index.addr = alloca ptr addrspace(4)
+  %Offset.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %Extent, ptr %Extent.addr
+  store ptr addrspace(4) %Index, ptr %Index.addr
+  store ptr addrspace(4) %Offset, ptr %Offset.addr
+  %0 = load ptr addrspace(4), ptr %Extent.addr
+  %1 = load ptr addrspace(4), ptr %Index.addr
+  %2 = load ptr addrspace(4), ptr %Offset.addr
+  call spir_func void @Foo29(ptr addrspace(4) %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr addrspace(4) %2) 
+  ret void
+}
+
+
+define internal spir_func void @Foo25(ptr addrspace(4) dead_on_unwind noalias writable sret(%"item.22") %agg.result, ptr addrspace(4) %Extent, ptr addrspace(4) %Index) {
+entry:
+  %Extent.addr = alloca ptr addrspace(4)
+  %Index.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %Extent, ptr %Extent.addr
+  store ptr addrspace(4) %Index, ptr %Index.addr
+  %0 = load ptr addrspace(4), ptr %Extent.addr
+  %1 = load ptr addrspace(4), ptr %Index.addr
+  call spir_func void @Foo27(ptr addrspace(4) %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1) 
+  ret void
+}
+
+
+define internal spir_func void @Foo26(ptr addrspace(4) dead_on_unwind noalias writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) %Global, ptr addrspace(4) %Local, ptr addrspace(4) %Group) {
+entry:
+  %Global.addr = alloca ptr addrspace(4)
+  %Local.addr = alloca ptr addrspace(4)
+  %Group.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %Global, ptr %Global.addr
+  store ptr addrspace(4) %Local, ptr %Local.addr
+  store ptr addrspace(4) %Group, ptr %Group.addr
+  %0 = load ptr addrspace(4), ptr %Global.addr
+  %1 = load ptr addrspace(4), ptr %Local.addr
+  %2 = load ptr addrspace(4), ptr %Group.addr
+  call spir_func void @Foo28(ptr addrspace(4) align 1 %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr addrspace(4) %2) 
+  ret void
+}
+
+
+define internal spir_func void @Foo28(ptr addrspace(4) align 1 %this, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr addrspace(4) %2) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %.addr = alloca ptr addrspace(4)
+  %.addr1 = alloca ptr addrspace(4)
+  %.addr2 = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %0, ptr %.addr
+  store ptr addrspace(4) %1, ptr %.addr1
+  store ptr addrspace(4) %2, ptr %.addr2
+  %this3 = load ptr addrspace(4), ptr %this.addr
+  ret void
+}
+
+
+define internal spir_func void @Foo27(ptr addrspace(4) %this, ptr addrspace(4) %extent, ptr addrspace(4) %index) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %extent.addr = alloca ptr addrspace(4)
+  %index.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %extent, ptr %extent.addr
+  store ptr addrspace(4) %index, ptr %index.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MImpl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MExtent2 = bitcast ptr addrspace(4) %MImpl1 to ptr addrspace(4)
+  %0 = load ptr addrspace(4), ptr %extent.addr
+  %MIndex = getelementptr inbounds nuw %"sd_ItemBase.23", ptr addrspace(4) %MImpl1, i32 0, i32 1
+  %1 = load ptr addrspace(4), ptr %index.addr
+  ret void
+}
+
+
+
+
+define internal spir_func void @Foo29(ptr addrspace(4) %this, ptr addrspace(4) %extent, ptr addrspace(4) %index, ptr addrspace(4) %offset) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %extent.addr = alloca ptr addrspace(4)
+  %index.addr = alloca ptr addrspace(4)
+  %offset.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %extent, ptr %extent.addr
+  store ptr addrspace(4) %index, ptr %index.addr
+  store ptr addrspace(4) %offset, ptr %offset.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MImpl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MExtent2 = bitcast ptr addrspace(4) %MImpl1 to ptr addrspace(4)
+  %0 = load ptr addrspace(4), ptr %extent.addr
+  %MIndex = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %MImpl1, i32 0, i32 1
+  %1 = load ptr addrspace(4), ptr %index.addr
+  %MOffset = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %MImpl1, i32 0, i32 2
+  %2 = load ptr addrspace(4), ptr %offset.addr
+  ret void
+}
+
+
+define internal spir_func void @Bar82(ptr addrspace(4) %this, ptr addrspace(4) %G, ptr addrspace(4) %L, ptr byval(%"range") %GroupRange, ptr addrspace(4) %I) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %G.addr = alloca ptr addrspace(4)
+  %L.addr = alloca ptr addrspace(4)
+  %I.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %G, ptr %G.addr
+  store ptr addrspace(4) %L, ptr %L.addr
+  store ptr addrspace(4) %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %globalRange1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load ptr addrspace(4), ptr %G.addr
+  %localRange = getelementptr inbounds nuw %"group", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load ptr addrspace(4), ptr %L.addr
+  %groupRange = getelementptr inbounds nuw %"group", ptr addrspace(4) %this1, i32 0, i32 2
+  %index = getelementptr inbounds nuw %"group", ptr addrspace(4) %this1, i32 0, i32 3
+  %2 = load ptr addrspace(4), ptr %I.addr
+  ret void
+}
+
+
+define internal spir_func void @InitSize4(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv15getGlobalOffsetILi0EEEmv() 
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv15getGlobalOffsetILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z22__spirv_GlobalOffset_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z22__spirv_GlobalOffset_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalOffset, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @InitSize3(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv20getLocalInvocationIdILi0EEEmv() 
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv20getLocalInvocationIdILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z27__spirv_LocalInvocationId_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @InitSize2(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv14getWorkgroupIdILi0EEEmv() 
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv14getWorkgroupIdILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z21__spirv_WorkgroupId_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z21__spirv_WorkgroupId_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @InitSize1(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv16getNumWorkgroupsILi0EEEmv() 
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv16getNumWorkgroupsILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z23__spirv_NumWorkgroups_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z23__spirv_NumWorkgroups_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInNumWorkgroups, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Inv3(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv16getWorkgroupSizeILi0EEEmv() 
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv16getWorkgroupSizeILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z23__spirv_WorkgroupSize_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z23__spirv_WorkgroupSize_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInWorkgroupSize, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo8(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv13getGlobalSizeILi0EEEmv() 
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv13getGlobalSizeILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z20__spirv_GlobalSize_xv() 
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z20__spirv_GlobalSize_xv()   {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalSize, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo30(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo33(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) 
+  ret void
+}
+
+
+define internal spir_func i64 @Foo32(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %TotalOffset = alloca i64
+  %ref.tmp = alloca %class.anon.7
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %TotalOffset.ascast = addrspacecast ptr %TotalOffset to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %TotalOffset
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %TotalOffset2 = getelementptr inbounds %class.anon.7, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %TotalOffset.ascast, ptr %TotalOffset2
+  call spir_func void @Foo34(ptr addrspace(4) %ref.tmp.ascast) 
+  %1 = load i64, ptr %TotalOffset
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo34(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo35(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) 
+  ret void
+}
+
+
+define internal spir_func void @Foo35(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) 
+  call spir_func void @Foo36(ptr addrspace(4) %2, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func void @Foo36(ptr addrspace(4) %this, i64 %I)  align 2  {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %TotalOffset = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset
+  %3 = load i64, ptr addrspace(4) %2
+  %impl1 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  %4 = load i64, ptr %I.addr
+  %conv = trunc i64 %4 to i32
+  %call = call spir_func i64 @Foo37(ptr addrspace(4) %MemRange, i32 %conv) 
+  %mul = mul i64 %3, %call
+  %TotalOffset2 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %5 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset2
+  store i64 %mul, ptr addrspace(4) %5
+  %impl32 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %Offset3 = bitcast ptr addrspace(4) %impl32 to ptr addrspace(4)
+  %6 = load i64, ptr %I.addr
+  %conv4 = trunc i64 %6 to i32
+  %call5 = call spir_func i64 @Foo37(ptr addrspace(4) %Offset3, i32 %conv4) 
+  %TotalOffset6 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %7 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset6
+  %8 = load i64, ptr addrspace(4) %7
+  %add = add i64 %8, %call5
+  store i64 %add, ptr addrspace(4) %7
+  ret void
+}
+
+
+define internal spir_func void @Foo33(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) 
+  call spir_func void @Foo38(ptr addrspace(4) %2, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func void @Foo38(ptr addrspace(4) %this, i64 %I)  align 2  {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %Offset = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %Offset
+  %3 = load i64, ptr %I.addr
+  %conv = trunc i64 %3 to i32
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %2, i32 %conv) 
+  %4 = load i64, ptr addrspace(4) %call
+  %call2 = call spir_func ptr addrspace(4) @Foo39(ptr addrspace(4) %1) 
+  %5 = load i64, ptr %I.addr
+  %conv3 = trunc i64 %5 to i32
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call2, i32 %conv3) 
+  store i64 %4, ptr addrspace(4) %call4
+  %AccessRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 2
+  %6 = load ptr addrspace(4), ptr addrspace(4) %AccessRange
+  %7 = load i64, ptr %I.addr
+  %conv5 = trunc i64 %7 to i32
+  %call6 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %6, i32 %conv5) 
+  %8 = load i64, ptr addrspace(4) %call6
+  %call7 = call spir_func ptr addrspace(4) @Foo40A(ptr addrspace(4) %1) 
+  %9 = load i64, ptr %I.addr
+  %conv8 = trunc i64 %9 to i32
+  %call9 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call7, i32 %conv8) 
+  store i64 %8, ptr addrspace(4) %call9
+  %MemRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 3
+  %10 = load ptr addrspace(4), ptr addrspace(4) %MemRange
+  %11 = load i64, ptr %I.addr
+  %conv10 = trunc i64 %11 to i32
+  %call11 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %10, i32 %conv10) 
+  %12 = load i64, ptr addrspace(4) %call11
+  %call12 = call spir_func ptr addrspace(4) @Foo41A(ptr addrspace(4) %1) 
+  %13 = load i64, ptr %I.addr
+  %conv13 = trunc i64 %13 to i32
+  %call14 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call12, i32 %conv13) 
+  store i64 %12, ptr addrspace(4) %call14
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo39(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %Offset2 = bitcast ptr addrspace(4) %impl1 to ptr addrspace(4)
+  ret ptr addrspace(4) %Offset2
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo40A(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %AccessRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 1
+  ret ptr addrspace(4) %AccessRange
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo41A(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  ret ptr addrspace(4) %MemRange
+}
+
+
+define internal spir_func void @Foo13(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo14(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) 
+  ret void
+}
+
+
+define internal spir_func i64 @Foo21(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %TotalOffset = alloca i64
+  %ref.tmp = alloca %class.anon.7
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %TotalOffset.ascast = addrspacecast ptr %TotalOffset to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %TotalOffset
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %TotalOffset2 = getelementptr inbounds %class.anon.7, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %TotalOffset.ascast, ptr %TotalOffset2
+  call spir_func void @Bar83(ptr addrspace(4) %ref.tmp.ascast) 
+  %1 = load i64, ptr %TotalOffset
+  ret i64 %1
+}
+
+
+define internal spir_func void @Bar83(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Bar84(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) 
+  ret void
+}
+
+
+define internal spir_func void @Bar84(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) 
+  call spir_func void @Bar85(ptr addrspace(4) %2, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func void @Bar85(ptr addrspace(4) %this, i64 %I)  align 2  {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %TotalOffset = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset
+  %3 = load i64, ptr addrspace(4) %2
+  %impl1 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  %4 = load i64, ptr %I.addr
+  %conv = trunc i64 %4 to i32
+  %call = call spir_func i64 @Foo37(ptr addrspace(4) %MemRange, i32 %conv) 
+  %mul = mul i64 %3, %call
+  %TotalOffset2 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %5 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset2
+  store i64 %mul, ptr addrspace(4) %5
+  %impl32 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+  %Offset3 = bitcast ptr addrspace(4) %impl32 to ptr addrspace(4)
+  %6 = load i64, ptr %I.addr
+  %conv4 = trunc i64 %6 to i32
+  %call5 = call spir_func i64 @Foo37(ptr addrspace(4) %Offset3, i32 %conv4) 
+  %TotalOffset6 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %7 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset6
+  %8 = load i64, ptr addrspace(4) %7
+  %add = add i64 %8, %call5
+  store i64 %add, ptr addrspace(4) %7
+  ret void
+}
+
+
+define internal spir_func void @Foo14(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) 
+  call spir_func void @Foo15(ptr addrspace(4) %2, i64 %call) 
+  ret void
+}
+
+
+define internal spir_func void @Foo15(ptr addrspace(4) %this, i64 %I)  align 2  {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %Offset = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %Offset
+  %3 = load i64, ptr %I.addr
+  %conv = trunc i64 %3 to i32
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %2, i32 %conv) 
+  %4 = load i64, ptr addrspace(4) %call
+  %call2 = call spir_func ptr addrspace(4) @Foo17(ptr addrspace(4) %1) 
+  %5 = load i64, ptr %I.addr
+  %conv3 = trunc i64 %5 to i32
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call2, i32 %conv3) 
+  store i64 %4, ptr addrspace(4) %call4
+  %AccessRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 2
+  %6 = load ptr addrspace(4), ptr addrspace(4) %AccessRange
+  %7 = load i64, ptr %I.addr
+  %conv5 = trunc i64 %7 to i32
+  %call6 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %6, i32 %conv5) 
+  %8 = load i64, ptr addrspace(4) %call6
+  %call7 = call spir_func ptr addrspace(4) @Foo18(ptr addrspace(4) %1) 
+  %9 = load i64, ptr %I.addr
+  %conv8 = trunc i64 %9 to i32
+  %call9 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call7, i32 %conv8) 
+  store i64 %8, ptr addrspace(4) %call9
+  %MemRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 3
+  %10 = load ptr addrspace(4), ptr addrspace(4) %MemRange
+  %11 = load i64, ptr %I.addr
+  %conv10 = trunc i64 %11 to i32
+  %call11 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %10, i32 %conv10) 
+  %12 = load i64, ptr addrspace(4) %call11
+  %call12 = call spir_func ptr addrspace(4) @Foo19(ptr addrspace(4) %1) 
+  %13 = load i64, ptr %I.addr
+  %conv13 = trunc i64 %13 to i32
+  %call14 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call12, i32 %conv13) 
+  store i64 %12, ptr addrspace(4) %call14
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo17(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %Offset2 = bitcast ptr addrspace(4) %impl1 to ptr addrspace(4)
+  ret ptr addrspace(4) %Offset2
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo18(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %AccessRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 1
+  ret ptr addrspace(4) %AccessRange
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo19(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  ret ptr addrspace(4) %MemRange
+}
+
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) 
+
+
+define internal spir_func void @Foo12(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 0) 
+  ret void
+}
+
+
+define internal spir_func void @Foo10(ptr addrspace(4) %this, ptr byval(%"range") %Offset, ptr byval(%"range") %AccessRange, ptr byval(%"range") %MemoryRange) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Offset21 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %AccessRange3 = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %this1, i32 0, i32 1
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %this1, i32 0, i32 2
+  ret void
+}
+
+
+define internal spir_func void @__assert_fail(ptr addrspace(4) %expr, ptr addrspace(4) %file, i32 %line, ptr addrspace(4) %func)   {
+entry:
+  %call = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv() 
+  %call1 = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_yv() 
+  %call2 = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_zv() 
+  %call3 = tail call spir_func i64 @_Z27__spirv_LocalInvocationId_xv() 
+  %call4 = tail call spir_func i64 @_Z27__spirv_LocalInvocationId_yv() 
+  %call5 = tail call spir_func i64 @_Z27__spirv_LocalInvocationId_zv() 
+  tail call spir_func void @__devicelib_assert_fail(ptr addrspace(4) %expr, ptr addrspace(4) %file, i32 %line, ptr addrspace(4) %func, i64 %call, i64 %call1, i64 %call2, i64 %call3, i64 %call4, i64 %call5) 
+  ret void
+}
+
+
+define internal spir_func i64 @_Z28__spirv_GlobalInvocationId_yv() local_unnamed_addr   {
+entry:
+  %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, i64 8
+  %1 = load i64, ptr addrspace(1) %0
+  ret i64 %1
+}
+
+
+define internal spir_func i64 @_Z28__spirv_GlobalInvocationId_zv() local_unnamed_addr   {
+entry:
+  %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, i64 16
+  %1 = load i64, ptr addrspace(1) %0, align 16
+  ret i64 %1
+}
+
+
+define internal spir_func i64 @_Z27__spirv_LocalInvocationId_yv() local_unnamed_addr   {
+entry:
+  %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, i64 8
+  %1 = load i64, ptr addrspace(1) %0
+  ret i64 %1
+}
+
+
+define internal spir_func i64 @_Z27__spirv_LocalInvocationId_zv() local_unnamed_addr   {
+entry:
+  %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, i64 16
+  %1 = load i64, ptr addrspace(1) %0, align 16
+  ret i64 %1
+}
+
+
+define internal spir_func void @__devicelib_assert_fail(ptr addrspace(4) %expr, ptr addrspace(4) %file, i32 %line, ptr addrspace(4) %func, i64 %gid0, i64 %gid1, i64 %gid2, i64 %lid0, i64 %lid1, i64 %lid2) local_unnamed_addr   {
+entry:
+  %call.i = tail call spir_func i32 @_Z29__spirv_AtomicCompareExchangePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagES5_ii(ptr addrspace(1) @SPIR_AssertHappenedMem, i32 1, i32 16, i32 16, i32 1, i32 0) 
+  %cmp = icmp eq i32 %call.i, 0
+  %0 = getelementptr inbounds nuw i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 4
+  %1 = getelementptr inbounds nuw i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 261
+  %2 = getelementptr inbounds nuw i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 518
+  br i1 %cmp, label %if.then, label %if.end82
+
+if.then:                                          ; preds = %entry
+  store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 648)
+  store i64 %gid0, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 656)
+  store i64 %gid1, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 664)
+  store i64 %gid2, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 672)
+  store i64 %lid0, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 680)
+  store i64 %lid1, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 688)
+  store i64 %lid2, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 696)
+  %tobool.not = icmp eq ptr addrspace(4) %expr, null
+  br i1 %tobool.not, label %if.end, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %if.then
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond.preheader, %for.inc
+  %ExprLength.0 = phi i32 [ %inc, %for.inc ], [ 0, %for.cond.preheader ]
+  %C.0 = phi ptr addrspace(4) [ %incdec.ptr, %for.inc ], [ %expr, %for.cond.preheader ]
+  %3 = load i8, ptr addrspace(4) %C.0, align 1
+  %cmp2.not = icmp eq i8 %3, 0
+  br i1 %cmp2.not, label %if.end, label %for.inc
+
+for.inc:                                          ; preds = %for.cond
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr addrspace(4) %C.0, i64 1
+  %inc = add nuw nsw i32 %ExprLength.0, 1
+  br label %for.cond
+
+if.end:                                           ; preds = %for.cond, %if.then
+  %ExprLength.1 = phi i32 [ 0, %if.then ], [ %ExprLength.0, %for.cond ]
+  %tobool3.not = icmp eq ptr addrspace(4) %file, null
+  br i1 %tobool3.not, label %if.end16, label %for.cond6.preheader
+
+for.cond6.preheader:                              ; preds = %if.end
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.cond6.preheader, %for.inc12
+  %FileLength.0 = phi i32 [ %inc14, %for.inc12 ], [ 0, %for.cond6.preheader ]
+  %C5.0 = phi ptr addrspace(4) [ %incdec.ptr13, %for.inc12 ], [ %file, %for.cond6.preheader ]
+  %4 = load i8, ptr addrspace(4) %C5.0, align 1
+  %cmp8.not = icmp eq i8 %4, 0
+  br i1 %cmp8.not, label %if.end16, label %for.inc12
+
+for.inc12:                                        ; preds = %for.cond6
+  %incdec.ptr13 = getelementptr inbounds nuw i8, ptr addrspace(4) %C5.0, i64 1
+  %inc14 = add nuw nsw i32 %FileLength.0, 1
+  br label %for.cond6
+
+if.end16:                                         ; preds = %for.cond6, %if.end
+  %FileLength.1 = phi i32 [ 0, %if.end ], [ %FileLength.0, %for.cond6 ]
+  %tobool17.not = icmp eq ptr addrspace(4) %func, null
+  br i1 %tobool17.not, label %if.end30.thread, label %for.cond20.preheader
+
+for.cond20.preheader:                             ; preds = %if.end16
+  br label %for.cond20
+
+for.cond20:                                       ; preds = %for.cond20.preheader, %for.inc26
+  %FuncLength.0 = phi i32 [ %inc28, %for.inc26 ], [ 0, %for.cond20.preheader ]
+  %C19.0 = phi ptr addrspace(4) [ %incdec.ptr27, %for.inc26 ], [ %func, %for.cond20.preheader ]
+  %5 = load i8, ptr addrspace(4) %C19.0, align 1
+  %cmp22.not = icmp eq i8 %5, 0
+  br i1 %cmp22.not, label %if.end30, label %for.inc26
+
+for.inc26:                                        ; preds = %for.cond20
+  %incdec.ptr27 = getelementptr inbounds nuw i8, ptr addrspace(4) %C19.0, i64 1
+  %inc28 = add i32 %FuncLength.0, 1
+  br label %for.cond20
+
+if.end30:                                         ; preds = %for.cond20
+  %spec.select = tail call i32 @llvm.umin.i32(i32 %ExprLength.1, i32 256)
+  %MaxFileIdx.0 = tail call i32 @llvm.umin.i32(i32 %FileLength.1, i32 256)
+  %spec.select126 = tail call i32 @llvm.umin.i32(i32 %FuncLength.0, i32 128)
+  br label %6
+
+if.end30.thread:                                  ; preds = %if.end16
+  %spec.select116 = tail call i32 @llvm.umin.i32(i32 %ExprLength.1, i32 256)
+  %MaxFileIdx.0118 = tail call i32 @llvm.umin.i32(i32 %FileLength.1, i32 256)
+  br label %6
+
+6:                                                ; preds = %if.end30, %if.end30.thread
+  %MaxFileIdx.0124 = phi i32 [ %MaxFileIdx.0118, %if.end30.thread ], [ %MaxFileIdx.0, %if.end30 ]
+  %spec.select122 = phi i32 [ %spec.select116, %if.end30.thread ], [ %spec.select, %if.end30 ]
+  %7 = phi i32 [ 0, %if.end30.thread ], [ %spec.select126, %if.end30 ]
+  br label %for.cond40
+
+for.cond40:                                       ; preds = %for.body44, %6
+  %lsr.iv9 = phi ptr addrspace(4) [ %scevgep10, %for.body44 ], [ %expr, %6 ]
+  %lsr.iv7 = phi ptr addrspace(1) [ %scevgep8, %for.body44 ], [ %0, %6 ]
+  %Idx.0 = phi i32 [ 0, %6 ], [ %inc48, %for.body44 ]
+  %cmp41 = icmp ult i32 %Idx.0, %spec.select122
+  br i1 %cmp41, label %for.body44, label %for.cond.cleanup42
+
+for.cond.cleanup42:                               ; preds = %for.cond40
+  %idxprom50 = zext nneg i32 %spec.select122 to i64
+  %arrayidx51 = getelementptr inbounds [257 x i8], ptr addrspace(1) %0, i64 0, i64 %idxprom50
+  store i8 0, ptr addrspace(1) %arrayidx51, align 1
+  br label %for.cond53
+
+for.cond53:                                       ; preds = %for.body57, %for.cond.cleanup42
+  %lsr.iv5 = phi ptr addrspace(4) [ %scevgep6, %for.body57 ], [ %file, %for.cond.cleanup42 ]
+  %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %for.body57 ], [ %1, %for.cond.cleanup42 ]
+  %Idx52.0 = phi i32 [ 0, %for.cond.cleanup42 ], [ %inc63, %for.body57 ]
+  %cmp54 = icmp ult i32 %Idx52.0, %MaxFileIdx.0124
+  br i1 %cmp54, label %for.body57, label %for.cond.cleanup55
+
+for.cond.cleanup55:                               ; preds = %for.cond53
+  %idxprom65 = zext nneg i32 %MaxFileIdx.0124 to i64
+  %arrayidx66 = getelementptr inbounds [257 x i8], ptr addrspace(1) %1, i64 0, i64 %idxprom65
+  store i8 0, ptr addrspace(1) %arrayidx66, align 1
+  br label %for.cond68
+
+for.cond68:                                       ; preds = %for.body72, %for.cond.cleanup55
+  %lsr.iv1 = phi ptr addrspace(4) [ %scevgep2, %for.body72 ], [ %func, %for.cond.cleanup55 ]
+  %lsr.iv = phi ptr addrspace(1) [ %scevgep, %for.body72 ], [ %2, %for.cond.cleanup55 ]
+  %Idx67.0 = phi i32 [ 0, %for.cond.cleanup55 ], [ %inc78, %for.body72 ]
+  %cmp69 = icmp ult i32 %Idx67.0, %7
+  br i1 %cmp69, label %for.body72, label %for.cond.cleanup70
+
+for.cond.cleanup70:                               ; preds = %for.cond68
+  %idxprom80 = zext nneg i32 %7 to i64
+  %arrayidx81 = getelementptr inbounds [129 x i8], ptr addrspace(1) %2, i64 0, i64 %idxprom80
+  store i8 0, ptr addrspace(1) %arrayidx81, align 1
+  tail call spir_func void @_Z19__spirv_AtomicStorePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(1) @SPIR_AssertHappenedMem, i32 1, i32 16, i32 2) 
+  br label %if.end82
+
+if.end82:                                         ; preds = %for.cond.cleanup70, %entry
+  ret void
+
+for.body72:                                       ; preds = %for.cond68
+  %8 = load i8, ptr addrspace(4) %lsr.iv1, align 1
+  store i8 %8, ptr addrspace(1) %lsr.iv, align 1
+  %inc78 = add nuw nsw i32 %Idx67.0, 1
+  %scevgep = getelementptr i8, ptr addrspace(1) %lsr.iv, i64 1
+  %scevgep2 = getelementptr i8, ptr addrspace(4) %lsr.iv1, i64 1
+  br label %for.cond68
+
+for.body57:                                       ; preds = %for.cond53
+  %9 = load i8, ptr addrspace(4) %lsr.iv5, align 1
+  store i8 %9, ptr addrspace(1) %lsr.iv3, align 1
+  %inc63 = add nuw nsw i32 %Idx52.0, 1
+  %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
+  %scevgep6 = getelementptr i8, ptr addrspace(4) %lsr.iv5, i64 1
+  br label %for.cond53
+
+for.body44:                                       ; preds = %for.cond40
+  %10 = load i8, ptr addrspace(4) %lsr.iv9, align 1
+  store i8 %10, ptr addrspace(1) %lsr.iv7, align 1
+  %inc48 = add nuw nsw i32 %Idx.0, 1
+  %scevgep8 = getelementptr i8, ptr addrspace(1) %lsr.iv7, i64 1
+  %scevgep10 = getelementptr i8, ptr addrspace(4) %lsr.iv9, i64 1
+  br label %for.cond40
+}
+
+declare extern_weak dso_local spir_func i32 @_Z29__spirv_AtomicCompareExchangePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagES5_ii(ptr addrspace(1), i32, i32, i32, i32, i32) local_unnamed_addr 
+declare extern_weak dso_local spir_func void @_Z19__spirv_AtomicStorePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(1), i32, i32, i32) local_unnamed_addr 
+declare i32 @llvm.umin.i32(i32, i32) 

>From 6ce07f15851c836d7f8e9c2e4816906f4f3a5610 Mon Sep 17 00:00:00 2001
From: "Levytskyy, Vyacheslav" <vyacheslav.levytskyy at intel.com>
Date: Mon, 2 Dec 2024 06:41:17 -0800
Subject: [PATCH 2/2] add a shorter test case of a return type mismatch

---
 .../CodeGen/SPIRV/pointers/tangled-ret.ll     | 235 ++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll

diff --git a/llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll b/llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll
new file mode 100644
index 00000000000000..985893029db890
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll
@@ -0,0 +1,235 @@
+; The only pass criterion is that spirv-val considers output valid.
+
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+%subgr = type { i64, i64 }
+%t_range = type { %t_arr }
+%t_arr = type { [1 x i64] }
+%t_arr2 = type { [4 x i32] }
+
+define internal spir_func noundef i32 @geti32() {
+entry:
+  ret i32 100
+}
+
+define internal spir_func noundef i64 @geti64() {
+entry:
+  ret i64 200
+}
+
+define internal spir_func void @enable_if(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this, i64 noundef %dim0) {
+entry:
+  %this.addr = alloca ptr addrspace(4), align 8
+  %dim0.addr = alloca i64, align 8
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  store i64 %dim0, ptr %dim0.addr, align 8
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %0 = load i64, ptr %dim0.addr, align 8
+  call spir_func void @enable_if_2(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this1, i64 noundef %0)
+  ret void
+}
+
+
+define internal spir_func void @test(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this, ptr addrspace(4) noundef align 4 dereferenceable(16) %bits, ptr noundef byval(%t_range) align 8 %pos) {
+entry:
+  %this.addr = alloca ptr addrspace(4), align 8
+  %bits.addr = alloca ptr addrspace(4), align 8
+  %cur_pos = alloca i64, align 8
+  %__range4 = alloca ptr addrspace(4), align 8
+  %__begin0 = alloca ptr addrspace(4), align 8
+  %__end0 = alloca ptr addrspace(4), align 8
+  %cleanup.dest.slot = alloca i32, align 4
+  %elem = alloca ptr addrspace(4), align 8
+  %agg.tmp = alloca %t_range, align 8
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  store ptr addrspace(4) %bits, ptr %bits.addr, align 8
+  %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %call = call spir_func noundef i64 @getp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %pos.ascast, i32 noundef 0)
+  store i64 %call, ptr %cur_pos, align 8
+  %0 = load ptr addrspace(4), ptr %bits.addr, align 8
+  store ptr addrspace(4) %0, ptr %__range4, align 8
+  %1 = load ptr addrspace(4), ptr %__range4, align 8
+  %call2 = call spir_func noundef ptr addrspace(4) @beginp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %1)
+  store ptr addrspace(4) %call2, ptr %__begin0, align 8
+  %2 = load ptr addrspace(4), ptr %__range4, align 8
+  %call3 = call spir_func noundef ptr addrspace(4) @endp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %2)
+  store ptr addrspace(4) %call3, ptr %__end0, align 8
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %3 = load ptr addrspace(4), ptr %__begin0, align 8
+  %4 = load ptr addrspace(4), ptr %__end0, align 8
+  %cmp = icmp ne ptr addrspace(4) %3, %4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %5 = load ptr addrspace(4), ptr %__begin0, align 8
+  store ptr addrspace(4) %5, ptr %elem, align 8
+  %6 = load i64, ptr %cur_pos, align 8
+  %call4 = call spir_func noundef i32 @maskp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this1)
+  %conv = zext i32 %call4 to i64
+  %cmp5 = icmp ult i64 %6, %conv
+  br i1 %cmp5, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %7 = load ptr addrspace(4), ptr %elem, align 8
+  %8 = load i64, ptr %cur_pos, align 8
+  call spir_func void @enable_if(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %agg.tmp.ascast, i64 noundef %8)
+  call spir_func void @extract_bits(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this1, ptr addrspace(4) noundef align 4 dereferenceable(4) %7, ptr noundef byval(%t_range) align 8 %agg.tmp)
+  %9 = load i64, ptr %cur_pos, align 8
+  %add = add i64 %9, 32
+  store i64 %add, ptr %cur_pos, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %for.body
+  %10 = load ptr addrspace(4), ptr %elem, align 8
+  store i32 0, ptr addrspace(4) %10, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %11 = load ptr addrspace(4), ptr %__begin0, align 8
+  %incdec.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %11, i32 1
+  store ptr addrspace(4) %incdec.ptr, ptr %__begin0, align 8
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  ret void
+}
+
+define internal spir_func noundef i64 @getp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this, i32 noundef %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4), align 8
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca i64, align 8
+  %this.addr = alloca ptr addrspace(4), align 8
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i, align 8
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i, align 8
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  %2 = load i64, ptr addrspace(4) %arrayidx, align 8
+  ret i64 %2
+}
+
+define internal spir_func noundef ptr addrspace(4) @beginp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %this) {
+entry:
+  %retval = alloca ptr addrspace(4), align 8
+  %this.addr = alloca ptr addrspace(4), align 8
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4)
+  ret ptr addrspace(4) %arraydecay2
+}
+
+define internal spir_func noundef ptr addrspace(4) @endp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %this) {
+entry:
+  %retval = alloca ptr addrspace(4), align 8
+  %this.addr = alloca ptr addrspace(4), align 8
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4)
+  %add.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %arraydecay2, i64 4
+  ret ptr addrspace(4) %add.ptr
+}
+
+define internal spir_func noundef i32 @maskp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4), align 8
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %bits_num = getelementptr inbounds nuw %subgr, ptr addrspace(4) %this1, i32 0, i32 1
+  %0 = load i64, ptr addrspace(4) %bits_num, align 8
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+define internal spir_func void @enable_if_2(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this, i64 noundef %dim0) {
+entry:
+  %this.addr = alloca ptr addrspace(4), align 8
+  %dim0.addr = alloca i64, align 8
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  store i64 %dim0, ptr %dim0.addr, align 8
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %dim0.addr, align 8
+  store i64 %0, ptr addrspace(4) %common_array1, align 8
+  ret void
+}
+
+define internal spir_func void @extract_bits(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this, ptr addrspace(4) noundef align 4 dereferenceable(4) %bits, ptr noundef byval(%t_range) align 8 %pos) {
+entry:
+  %this.addr = alloca ptr addrspace(4), align 8
+  %bits.addr = alloca ptr addrspace(4), align 8
+  %Res = alloca i64, align 8
+  store ptr addrspace(4) %this, ptr %this.addr, align 8
+  store ptr addrspace(4) %bits, ptr %bits.addr, align 8
+  %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr addrspace(4) %Bits1, align 8
+  store i64 %0, ptr %Res, align 8
+  %bits_num = getelementptr inbounds nuw %subgr, ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load i64, ptr addrspace(4) %bits_num, align 8
+  %call = call spir_func noundef i64 @geti64()
+  %2 = load i64, ptr %Res, align 8
+  %and = and i64 %2, %call
+  store i64 %and, ptr %Res, align 8
+  %call2 = call spir_func noundef i64 @geti64()
+  %call3 = call spir_func noundef i32 @geti32()
+  %conv = zext i32 %call3 to i64
+  %cmp = icmp ult i64 %call2, %conv
+  br i1 %cmp, label %if.then, label %if.else
+
+if.else:                                          ; preds = %entry
+  %3 = load ptr addrspace(4), ptr %bits.addr, align 8
+  store i32 0, ptr addrspace(4) %3, align 4
+  br label %if.end11
+
+if.then:                                          ; preds = %entry
+  %call4 = call spir_func noundef i64 @geti64()
+  %cmp5 = icmp ugt i64 %call4, 0
+  br i1 %cmp5, label %if.then6, label %if.end
+
+if.then6:                                         ; preds = %if.then
+  %call7 = call spir_func noundef i64 @geti64()
+  %4 = load i64, ptr %Res, align 8
+  %shr = lshr i64 %4, %call7
+  store i64 %shr, ptr %Res, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then6, %if.then
+  %call8 = call spir_func noundef i64 @geti64()
+  %5 = load i64, ptr %Res, align 8
+  %and9 = and i64 %5, %call8
+  store i64 %and9, ptr %Res, align 8
+  %6 = load i64, ptr %Res, align 8
+  %conv10 = trunc i64 %6 to i32
+  %7 = load ptr addrspace(4), ptr %bits.addr, align 8
+  store i32 %conv10, ptr addrspace(4) %7, align 4
+  br label %if.end11
+
+if.end11:                                         ; preds = %if.else, %if.end
+  ret void
+}



More information about the llvm-commits mailing list