[llvm] 46ef333 - [AArch64] Add and cmp cost model tests. NFC

Thu Jun 29 05:29:40 PDT 2023

Author: David Green
Date: 2023-06-29T13:29:34+01:00
New Revision: 46ef3337ea9947aba3ab8c82b9c6befd24526d55

URL: https://github.com/llvm/llvm-project/commit/46ef3337ea9947aba3ab8c82b9c6befd24526d55
DIFF: https://github.com/llvm/llvm-project/commit/46ef3337ea9947aba3ab8c82b9c6befd24526d55.diff

LOG: [AArch64] Add and cmp cost model tests. NFC

See D153611. Tests for the cost of icmp(and, 0) are added, in addition to
expanding the extractelements-to-shuffle.ll test, which has always been a bit
simple, to include a more complete example with both a vector and scalar
version. The icmp(and, 0) costs are targeted at improving the second version
when the cost of vector inserts and extracts is lowered.

Added: 
    

Modified: 
    llvm/test/Analysis/CostModel/AArch64/cmp.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index e243c637a65708..d25724349b4cbc 100644

--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -4,51 +4,120 @@
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
-define i32 @cmps() {
+define void @cmps() {
 ; CHECK-THROUGHPUT-LABEL: 'cmps'
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = icmp slt i8 undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = icmp ult i16 undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a2 = icmp sge i32 undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a3 = icmp ne i64 undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4 = icmp slt <16 x i8> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = icmp ult <8 x i16> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a6 = icmp sge <4 x i32> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %a10 = fcmp olt <8 x half> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = icmp slt i8 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c16 = icmp ult i16 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c32 = icmp sge i32 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c64 = icmp ne i64 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %c128 = icmp ult i128 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv16i8 = icmp slt <16 x i8> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv8i16 = icmp ult <8 x i16> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv4i32 = icmp sge <4 x i32> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf16 = fcmp oge half undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf32 = fcmp ogt float undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf64 = fcmp ogt double undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv432 = fcmp oge <4 x float> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv264 = fcmp oge <2 x double> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SIZE-LABEL: 'cmps'
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = icmp slt i8 undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = icmp ult i16 undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a2 = icmp sge i32 undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a3 = icmp ne i64 undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4 = icmp slt <16 x i8> undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = icmp ult <8 x i16> undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a6 = icmp sge <4 x i32> undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = icmp slt i8 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c16 = icmp ult i16 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c32 = icmp sge i32 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c64 = icmp ne i64 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c128 = icmp ult i128 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv16i8 = icmp slt <16 x i8> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv8i16 = icmp ult <8 x i16> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv4i32 = icmp sge <4 x i32> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf16 = fcmp oge half undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf32 = fcmp ogt float undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cf64 = fcmp ogt double undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv432 = fcmp oge <4 x float> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cfv264 = fcmp oge <2 x double> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %a0 = icmp slt i8 undef, undef
-  %a1 = icmp ult i16 undef, undef
-  %a2 = icmp sge i32 undef, undef
-  %a3 = icmp ne i64 undef, undef
-  %a4 = icmp slt <16 x i8> undef, undef
-  %a5 = icmp ult <8 x i16> undef, undef
-  %a6 = icmp sge <4 x i32> undef, undef
-  %a7 = fcmp oge half undef, undef
-  %a8 = fcmp ogt float undef, undef
-  %a9 = fcmp ogt double undef, undef
-  %a10 = fcmp olt <8 x half> undef, undef
-  %a11 = fcmp oge <4 x float> undef, undef
-  %a12 = fcmp oge <2 x double> undef, undef
-  ret i32 undef
+  %c8 = icmp slt i8 undef, undef
+  %c16 = icmp ult i16 undef, undef
+  %c32 = icmp sge i32 undef, undef
+  %c64 = icmp ne i64 undef, undef
+  %c128 = icmp ult i128 undef, undef
+  %cv16i8 = icmp slt <16 x i8> undef, undef
+  %cv8i16 = icmp ult <8 x i16> undef, undef
+  %cv4i32 = icmp sge <4 x i32> undef, undef
+  %cf16 = fcmp oge half undef, undef
+  %cf32 = fcmp ogt float undef, undef
+  %cf64 = fcmp ogt double undef, undef
+  %cfv816 = fcmp olt <8 x half> undef, undef
+  %cfv432 = fcmp oge <4 x float> undef, undef
+  %cfv264 = fcmp oge <2 x double> undef, undef
+  ret void
+}
+
+
+define void @andcmp() {
+; CHECK-THROUGHPUT-LABEL: 'andcmp'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = and i8 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = icmp eq i8 %a8, 0
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a16 = and i16 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c16 = icmp ne i16 %a16, 0
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = and i32 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c32 = icmp eq i32 %a32, 0
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a64 = and i64 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c64 = icmp ne i64 %a64, 0
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a128 = and i128 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %c128 = icmp eq i128 %a128, 0
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %av16i8 = and <16 x i8> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv16i8 = icmp ne <16 x i8> %av16i8, zeroinitializer
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %av8i16 = and <8 x i16> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv8i16 = icmp eq <8 x i16> %av8i16, zeroinitializer
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %av4i32 = and <4 x i32> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv4i32 = icmp ne <4 x i32> %av4i32, zeroinitializer
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c32not0 = icmp eq i32 %a32, 1
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c64sle = icmp sle i64 %a64, 0
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-SIZE-LABEL: 'andcmp'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = and i8 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = icmp eq i8 %a8, 0
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a16 = and i16 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c16 = icmp ne i16 %a16, 0
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = and i32 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c32 = icmp eq i32 %a32, 0
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a64 = and i64 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c64 = icmp ne i64 %a64, 0
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a128 = and i128 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c128 = icmp eq i128 %a128, 0
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %av16i8 = and <16 x i8> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv16i8 = icmp ne <16 x i8> %av16i8, zeroinitializer
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %av8i16 = and <8 x i16> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv8i16 = icmp eq <8 x i16> %av8i16, zeroinitializer
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %av4i32 = and <4 x i32> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cv4i32 = icmp ne <4 x i32> %av4i32, zeroinitializer
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c32not0 = icmp eq i32 %a32, 1
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c64sle = icmp sle i64 %a64, 0
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %a8 = and i8 undef, undef
+  %c8 = icmp eq i8 %a8, 0
+  %a16 = and i16 undef, undef
+  %c16 = icmp ne i16 %a16, 0
+  %a32 = and i32 undef, undef
+  %c32 = icmp eq i32 %a32, 0
+  %a64 = and i64 undef, undef
+  %c64 = icmp ne i64 %a64, 0
+  %a128 = and i128 undef, undef
+  %c128 = icmp eq i128 %a128, zeroinitializer
+  %av16i8 = and <16 x i8> undef, undef
+  %cv16i8 = icmp ne <16 x i8> %av16i8, zeroinitializer
+  %av8i16 = and <8 x i16> undef, undef
+  %cv8i16 = icmp eq <8 x i16> %av8i16, zeroinitializer
+  %av4i32 = and <4 x i32> undef, undef
+  %cv4i32 = icmp ne <4 x i32> %av4i32, zeroinitializer
+
+  %c32not0 = icmp eq i32 %a32, 1
+  %c64sle = icmp sle i64 %a64, 0
+  ret void
 }

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 61aa9110e123c7..e60e356e5cd819 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -1,53 +1,641 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=aarch64 -aarch64-insert-extract-base-cost=3 | FileCheck %s
-
-define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; CHECK-NEXT:    br label [[TMP17:%.*]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP22]] = or i32 [[TMP18]], 0
-; CHECK-NEXT:    br label [[TMP17]]
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=aarch64 | FileCheck %s
+
+; This test is two versions of the IR extracted from a DSP routine performing
+; boolean distance. The first involves a mixture of vector intrinsics and
+; scalar code, the second is just scalar. They should both ideally not be
+; vectorized any more than the input (unless the codegen has improved
+; significantly).
+
+define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef readonly %pB, i32 noundef %numberOfBools, ptr nocapture noundef writeonly %cTT, ptr nocapture noundef writeonly %cFF, ptr nocapture noundef writeonly %cTF, ptr nocapture noundef writeonly %cFT) {
+; CHECK-LABEL: @dist_vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT264:%.*]] = icmp ult i32 [[NUMBEROFBOOLS:%.*]], 128
+; CHECK-NEXT:    br i1 [[CMP_NOT264]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[NUMBEROFBOOLS]], 7
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i32 [[SHR]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 16
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PA:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[PA_ADDR_0271:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[WHILE_BODY]] ], [ [[PA]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PB_ADDR_0270:%.*]] = phi ptr [ [[ADD_PTR8:%.*]], [[WHILE_BODY]] ], [ [[PB:%.*]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[NBBOOLBLOCK_0269:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP4TT_0268:%.*]] = phi <2 x i64> [ [[ADD_I:%.*]], [[WHILE_BODY]] ], [ zeroinitializer, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP4FF_0267:%.*]] = phi <2 x i64> [ [[ADD_I253:%.*]], [[WHILE_BODY]] ], [ zeroinitializer, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP4TF_0266:%.*]] = phi <2 x i64> [ [[ADD_I258:%.*]], [[WHILE_BODY]] ], [ zeroinitializer, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP4FT_0265:%.*]] = phi <2 x i64> [ [[ADD_I263:%.*]], [[WHILE_BODY]] ], [ zeroinitializer, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[PA_ADDR_0271]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[PB_ADDR_0270]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, ptr [[PA_ADDR_0271]], i64 4
+; CHECK-NEXT:    [[ADD_PTR8]] = getelementptr inbounds i32, ptr [[PB_ADDR_0270]], i64 4
+; CHECK-NEXT:    [[AND_I:%.*]] = and <4 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[NOT_I:%.*]] = xor <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[NOT_I242:%.*]] = xor <4 x i32> [[TMP5]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[AND_I243:%.*]] = and <4 x i32> [[NOT_I242]], [[NOT_I]]
+; CHECK-NEXT:    [[AND_I245:%.*]] = and <4 x i32> [[TMP4]], [[NOT_I242]]
+; CHECK-NEXT:    [[AND_I247:%.*]] = and <4 x i32> [[TMP5]], [[NOT_I]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[AND_I]] to <16 x i8>
+; CHECK-NEXT:    [[VCNTQ_V_I:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP6]])
+; CHECK-NEXT:    [[VPADDL_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[VCNTQ_V_I]])
+; CHECK-NEXT:    [[VPADDL1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]])
+; CHECK-NEXT:    [[VPADDL1_I248:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL1_I]])
+; CHECK-NEXT:    [[ADD_I]] = add <2 x i64> [[VPADDL1_I248]], [[TMP4TT_0268]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[AND_I243]] to <16 x i8>
+; CHECK-NEXT:    [[VCNTQ_V_I249:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP7]])
+; CHECK-NEXT:    [[VPADDL_I250:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[VCNTQ_V_I249]])
+; CHECK-NEXT:    [[VPADDL1_I251:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I250]])
+; CHECK-NEXT:    [[VPADDL1_I252:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL1_I251]])
+; CHECK-NEXT:    [[ADD_I253]] = add <2 x i64> [[VPADDL1_I252]], [[TMP4FF_0267]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[AND_I245]] to <16 x i8>
+; CHECK-NEXT:    [[VCNTQ_V_I254:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP8]])
+; CHECK-NEXT:    [[VPADDL_I255:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[VCNTQ_V_I254]])
+; CHECK-NEXT:    [[VPADDL1_I256:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I255]])
+; CHECK-NEXT:    [[VPADDL1_I257:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL1_I256]])
+; CHECK-NEXT:    [[ADD_I258]] = add <2 x i64> [[VPADDL1_I257]], [[TMP4TF_0266]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[AND_I247]] to <16 x i8>
+; CHECK-NEXT:    [[VCNTQ_V_I259:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP9]])
+; CHECK-NEXT:    [[VPADDL_I260:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[VCNTQ_V_I259]])
+; CHECK-NEXT:    [[VPADDL1_I261:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I260]])
+; CHECK-NEXT:    [[VPADDL1_I262:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL1_I261]])
+; CHECK-NEXT:    [[ADD_I263]] = add <2 x i64> [[VPADDL1_I262]], [[TMP4FT_0265]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[NBBOOLBLOCK_0269]], -1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[SCEVGEP311:%.*]] = getelementptr i8, ptr [[PB]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[TMP4FT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[ADD_I263]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP4TF_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I258]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP4FF_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I253]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 0
+; CHECK-NEXT:    [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]]
+; CHECK-NEXT:    [[CONV48:%.*]] = trunc i64 [[ADD]] to i32
+; CHECK-NEXT:    [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0
+; CHECK-NEXT:    [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1
+; CHECK-NEXT:    [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]]
+; CHECK-NEXT:    [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32
+; CHECK-NEXT:    [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0
+; CHECK-NEXT:    [[VGETQ_LANE67:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1
+; CHECK-NEXT:    [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], [[VGETQ_LANE67]]
+; CHECK-NEXT:    [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32
+; CHECK-NEXT:    [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0
+; CHECK-NEXT:    [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1
+; CHECK-NEXT:    [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]]
+; CHECK-NEXT:    [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127
+; CHECK-NEXT:    [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31
+; CHECK-NEXT:    br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]]
+; CHECK:       while.body88:
+; CHECK-NEXT:    [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4
+; CHECK-NEXT:    br label [[WHILE_BODY93:%.*]]
+; CHECK:       while.body93:
+; CHECK-NEXT:    [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[AND94:%.*]] = and i32 [[A_0279]], 1
+; CHECK-NEXT:    [[AND95:%.*]] = and i32 [[B_0278]], 1
+; CHECK-NEXT:    [[SHR96]] = lshr i32 [[A_0279]], 1
+; CHECK-NEXT:    [[SHR97]] = lshr i32 [[B_0278]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0
+; CHECK-NEXT:    [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false
+; CHECK-NEXT:    [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]]
+; CHECK-NEXT:    [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0
+; CHECK-NEXT:    [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false
+; CHECK-NEXT:    [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false
+; CHECK-NEXT:    [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false
+; CHECK-NEXT:    [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT:    [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]]
+; CHECK:       while.end121:
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[PA_ADDR_1291]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR89]] = getelementptr inbounds i32, ptr [[PB_ADDR_1290]], i64 1
+; CHECK-NEXT:    [[SUB]] = add nsw i32 [[NBBOOLBLOCK_1285]], -32
+; CHECK-NEXT:    [[CMP86:%.*]] = icmp ugt i32 [[SUB]], 31
+; CHECK-NEXT:    br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]]
+; CHECK:       while.end122:
+; CHECK-NEXT:    [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0
+; CHECK-NEXT:    br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]]
+; CHECK:       while.body132.preheader:
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4
+; CHECK-NEXT:    [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]]
+; CHECK-NEXT:    [[SHR128:%.*]] = lshr i32 [[TMP16]], [[SUB125]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4
+; CHECK-NEXT:    [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]]
+; CHECK-NEXT:    br label [[WHILE_BODY132:%.*]]
+; CHECK:       while.body132:
+; CHECK-NEXT:    [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[AND133:%.*]] = and i32 [[A_1301]], 1
+; CHECK-NEXT:    [[AND134:%.*]] = and i32 [[B_1300]], 1
+; CHECK-NEXT:    [[SHR135]] = lshr i32 [[A_1301]], 1
+; CHECK-NEXT:    [[SHR136]] = lshr i32 [[B_1300]], 1
+; CHECK-NEXT:    [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0
+; CHECK-NEXT:    [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false
+; CHECK-NEXT:    [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32
+; CHECK-NEXT:    [[ADD142]] = add i32 [[_CTT_2306]], [[LAND_EXT141]]
+; CHECK-NEXT:    [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0
+; CHECK-NEXT:    [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false
+; CHECK-NEXT:    [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32
+; CHECK-NEXT:    [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false
+; CHECK-NEXT:    [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32
+; CHECK-NEXT:    [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false
+; CHECK-NEXT:    [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] to i32
+; CHECK-NEXT:    [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]]
+; CHECK-NEXT:    [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1
+; CHECK-NEXT:    [[CMP130_NOT:%.*]] = icmp eq i32 [[DEC165]], 0
+; CHECK-NEXT:    br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]]
+; CHECK:       while.end166:
+; CHECK-NEXT:    [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ]
+; CHECK-NEXT:    [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ]
+; CHECK-NEXT:    [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ]
+; CHECK-NEXT:    [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ]
+; CHECK-NEXT:    store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4
+; CHECK-NEXT:    store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4
+; CHECK-NEXT:    store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp.not264 = icmp ult i32 %numberOfBools, 128
+  br i1 %cmp.not264, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  %shr = lshr i32 %numberOfBools, 7
+  %0 = add nsw i32 %shr, -1
+  %1 = zext i32 %0 to i64
+  %2 = shl nuw nsw i64 %1, 4
+  %3 = add nuw nsw i64 %2, 16
+  %scevgep = getelementptr i8, ptr %pA, i64 %3
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %pA.addr.0271 = phi ptr [ %add.ptr, %while.body ], [ %pA, %while.body.preheader ]
+  %pB.addr.0270 = phi ptr [ %add.ptr8, %while.body ], [ %pB, %while.body.preheader ]
+  %nbBoolBlock.0269 = phi i32 [ %dec, %while.body ], [ %shr, %while.body.preheader ]
+  %tmp4tt.0268 = phi <2 x i64> [ %add.i, %while.body ], [ zeroinitializer, %while.body.preheader ]
+  %tmp4ff.0267 = phi <2 x i64> [ %add.i253, %while.body ], [ zeroinitializer, %while.body.preheader ]
+  %tmp4tf.0266 = phi <2 x i64> [ %add.i258, %while.body ], [ zeroinitializer, %while.body.preheader ]
+  %tmp4ft.0265 = phi <2 x i64> [ %add.i263, %while.body ], [ zeroinitializer, %while.body.preheader ]
+  %4 = load <4 x i32>, ptr %pA.addr.0271, align 4
+  %5 = load <4 x i32>, ptr %pB.addr.0270, align 4
+  %add.ptr = getelementptr inbounds i32, ptr %pA.addr.0271, i64 4
+  %add.ptr8 = getelementptr inbounds i32, ptr %pB.addr.0270, i64 4
+  %and.i = and <4 x i32> %5, %4
+  %not.i = xor <4 x i32> %4, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %not.i242 = xor <4 x i32> %5, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and.i243 = and <4 x i32> %not.i242, %not.i
+  %and.i245 = and <4 x i32> %4, %not.i242
+  %and.i247 = and <4 x i32> %5, %not.i
+  %6 = bitcast <4 x i32> %and.i to <16 x i8>
+  %vcntq_v.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %6)
+  %vpaddl.i = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %vcntq_v.i)
+  %vpaddl1.i = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %vpaddl.i)
+  %vpaddl1.i248 = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %vpaddl1.i)
+  %add.i = add <2 x i64> %vpaddl1.i248, %tmp4tt.0268
+  %7 = bitcast <4 x i32> %and.i243 to <16 x i8>
+  %vcntq_v.i249 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %7)
+  %vpaddl.i250 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %vcntq_v.i249)
+  %vpaddl1.i251 = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %vpaddl.i250)
+  %vpaddl1.i252 = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %vpaddl1.i251)
+  %add.i253 = add <2 x i64> %vpaddl1.i252, %tmp4ff.0267
+  %8 = bitcast <4 x i32> %and.i245 to <16 x i8>
+  %vcntq_v.i254 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %8)
+  %vpaddl.i255 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %vcntq_v.i254)
+  %vpaddl1.i256 = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %vpaddl.i255)
+  %vpaddl1.i257 = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %vpaddl1.i256)
+  %add.i258 = add <2 x i64> %vpaddl1.i257, %tmp4tf.0266
+  %9 = bitcast <4 x i32> %and.i247 to <16 x i8>
+  %vcntq_v.i259 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %9)
+  %vpaddl.i260 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %vcntq_v.i259)
+  %vpaddl1.i261 = tail call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %vpaddl.i260)
+  %vpaddl1.i262 = tail call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %vpaddl1.i261)
+  %add.i263 = add <2 x i64> %vpaddl1.i262, %tmp4ft.0265
+  %dec = add nsw i32 %nbBoolBlock.0269, -1
+  %cmp.not = icmp eq i32 %dec, 0
+  br i1 %cmp.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %scevgep311 = getelementptr i8, ptr %pB, i64 %3
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %tmp4ft.0.lcssa = phi <2 x i64> [ zeroinitializer, %entry ], [ %add.i263, %while.end.loopexit ]
+  %tmp4tf.0.lcssa = phi <2 x i64> [ zeroinitializer, %entry ], [ %add.i258, %while.end.loopexit ]
+  %tmp4ff.0.lcssa = phi <2 x i64> [ zeroinitializer, %entry ], [ %add.i253, %while.end.loopexit ]
+  %tmp4tt.0.lcssa = phi <2 x i64> [ zeroinitializer, %entry ], [ %add.i, %while.end.loopexit ]
+  %pB.addr.0.lcssa = phi ptr [ %pB, %entry ], [ %scevgep311, %while.end.loopexit ]
+  %pA.addr.0.lcssa = phi ptr [ %pA, %entry ], [ %scevgep, %while.end.loopexit ]
+  %vgetq_lane = extractelement <2 x i64> %tmp4tt.0.lcssa, i64 0
+  %vgetq_lane45 = extractelement <2 x i64> %tmp4tt.0.lcssa, i64 1
+  %add = add i64 %vgetq_lane, %vgetq_lane45
+  %conv48 = trunc i64 %add to i32
+  %vgetq_lane51 = extractelement <2 x i64> %tmp4ff.0.lcssa, i64 0
+  %vgetq_lane55 = extractelement <2 x i64> %tmp4ff.0.lcssa, i64 1
+  %add57 = add i64 %vgetq_lane51, %vgetq_lane55
+  %conv60 = trunc i64 %add57 to i32
+  %vgetq_lane63 = extractelement <2 x i64> %tmp4tf.0.lcssa, i64 0
+  %vgetq_lane67 = extractelement <2 x i64> %tmp4tf.0.lcssa, i64 1
+  %add69 = add i64 %vgetq_lane63, %vgetq_lane67
+  %conv72 = trunc i64 %add69 to i32
+  %vgetq_lane75 = extractelement <2 x i64> %tmp4ft.0.lcssa, i64 0
+  %vgetq_lane79 = extractelement <2 x i64> %tmp4ft.0.lcssa, i64 1
+  %add81 = add i64 %vgetq_lane75, %vgetq_lane79
+  %conv84 = trunc i64 %add81 to i32
+  %and = and i32 %numberOfBools, 127
+  %cmp86284 = icmp ugt i32 %and, 31
+  br i1 %cmp86284, label %while.body88, label %while.end122
+
+while.body88:                                     ; preds = %while.end, %while.end121
+  %pA.addr.1291 = phi ptr [ %incdec.ptr, %while.end121 ], [ %pA.addr.0.lcssa, %while.end ]
+  %pB.addr.1290 = phi ptr [ %incdec.ptr89, %while.end121 ], [ %pB.addr.0.lcssa, %while.end ]
+  %_ctt.0289 = phi i32 [ %add99, %while.end121 ], [ %conv48, %while.end ]
+  %_cff.0288 = phi i32 [ %add106, %while.end121 ], [ %conv60, %while.end ]
+  %_ctf.0287 = phi i32 [ %add113, %while.end121 ], [ %conv72, %while.end ]
+  %_cft.0286 = phi i32 [ %add120, %while.end121 ], [ %conv84, %while.end ]
+  %nbBoolBlock.1285 = phi i32 [ %sub, %while.end121 ], [ %and, %while.end ]
+  %10 = load i32, ptr %pA.addr.1291, align 4
+  %11 = load i32, ptr %pB.addr.1290, align 4
+  br label %while.body93
+
+while.body93:                                     ; preds = %while.body88, %while.body93
+  %_ctt.1283 = phi i32 [ %_ctt.0289, %while.body88 ], [ %add99, %while.body93 ]
+  %_cff.1282 = phi i32 [ %_cff.0288, %while.body88 ], [ %add106, %while.body93 ]
+  %_ctf.1281 = phi i32 [ %_ctf.0287, %while.body88 ], [ %add113, %while.body93 ]
+  %_cft.1280 = phi i32 [ %_cft.0286, %while.body88 ], [ %add120, %while.body93 ]
+  %a.0279 = phi i32 [ %10, %while.body88 ], [ %shr96, %while.body93 ]
+  %b.0278 = phi i32 [ %11, %while.body88 ], [ %shr97, %while.body93 ]
+  %shift.0277 = phi i32 [ 0, %while.body88 ], [ %inc, %while.body93 ]
+  %and94 = and i32 %a.0279, 1
+  %and95 = and i32 %b.0278, 1
+  %shr96 = lshr i32 %a.0279, 1
+  %shr97 = lshr i32 %b.0278, 1
+  %tobool = icmp ne i32 %and94, 0
+  %tobool98 = icmp ne i32 %and95, 0
+  %12 = select i1 %tobool, i1 %tobool98, i1 false
+  %land.ext = zext i1 %12 to i32
+  %add99 = add i32 %_ctt.1283, %land.ext
+  %tobool100 = icmp eq i32 %and94, 0
+  %tobool103 = icmp eq i32 %and95, 0
+  %13 = select i1 %tobool100, i1 %tobool103, i1 false
+  %land.ext105 = zext i1 %13 to i32
+  %add106 = add i32 %_cff.1282, %land.ext105
+  %14 = select i1 %tobool, i1 %tobool103, i1 false
+  %land.ext112 = zext i1 %14 to i32
+  %add113 = add i32 %_ctf.1281, %land.ext112
+  %15 = select i1 %tobool100, i1 %tobool98, i1 false
+  %land.ext119 = zext i1 %15 to i32
+  %add120 = add i32 %_cft.1280, %land.ext119
+  %inc = add nuw nsw i32 %shift.0277, 1
+  %exitcond.not = icmp eq i32 %inc, 32
+  br i1 %exitcond.not, label %while.end121, label %while.body93
+
+while.end121:                                     ; preds = %while.body93
+  %incdec.ptr = getelementptr inbounds i32, ptr %pA.addr.1291, i64 1
+  %incdec.ptr89 = getelementptr inbounds i32, ptr %pB.addr.1290, i64 1
+  %sub = add nsw i32 %nbBoolBlock.1285, -32
+  %cmp86 = icmp ugt i32 %sub, 31
+  br i1 %cmp86, label %while.body88, label %while.end122
+
+while.end122:                                     ; preds = %while.end121, %while.end
+  %nbBoolBlock.1.lcssa = phi i32 [ %and, %while.end ], [ %sub, %while.end121 ]
+  %_cft.0.lcssa = phi i32 [ %conv84, %while.end ], [ %add120, %while.end121 ]
+  %_ctf.0.lcssa = phi i32 [ %conv72, %while.end ], [ %add113, %while.end121 ]
+  %_cff.0.lcssa = phi i32 [ %conv60, %while.end ], [ %add106, %while.end121 ]
+  %_ctt.0.lcssa = phi i32 [ %conv48, %while.end ], [ %add99, %while.end121 ]
+  %pB.addr.1.lcssa = phi ptr [ %pB.addr.0.lcssa, %while.end ], [ %incdec.ptr89, %while.end121 ]
+  %pA.addr.1.lcssa = phi ptr [ %pA.addr.0.lcssa, %while.end ], [ %incdec.ptr, %while.end121 ]
+  %cmp130.not299 = icmp eq i32 %nbBoolBlock.1.lcssa, 0
+  br i1 %cmp130.not299, label %while.end166, label %while.body132.preheader
+
+while.body132.preheader:                          ; preds = %while.end122
+  %16 = load i32, ptr %pB.addr.1.lcssa, align 4
+  %sub125 = sub nuw nsw i32 32, %nbBoolBlock.1.lcssa
+  %shr128 = lshr i32 %16, %sub125
+  %17 = load i32, ptr %pA.addr.1.lcssa, align 4
+  %shr126 = lshr i32 %17, %sub125
+  br label %while.body132
+
+while.body132:                                    ; preds = %while.body132.preheader, %while.body132
+  %_ctt.2306 = phi i32 [ %add142, %while.body132 ], [ %_ctt.0.lcssa, %while.body132.preheader ]
+  %_cff.2305 = phi i32 [ %add150, %while.body132 ], [ %_cff.0.lcssa, %while.body132.preheader ]
+  %_ctf.2304 = phi i32 [ %add157, %while.body132 ], [ %_ctf.0.lcssa, %while.body132.preheader ]
+  %_cft.2303 = phi i32 [ %add164, %while.body132 ], [ %_cft.0.lcssa, %while.body132.preheader ]
+  %nbBoolBlock.2302 = phi i32 [ %dec165, %while.body132 ], [ %nbBoolBlock.1.lcssa, %while.body132.preheader ]
+  %a.1301 = phi i32 [ %shr135, %while.body132 ], [ %shr126, %while.body132.preheader ]
+  %b.1300 = phi i32 [ %shr136, %while.body132 ], [ %shr128, %while.body132.preheader ]
+  %and133 = and i32 %a.1301, 1
+  %and134 = and i32 %b.1300, 1
+  %shr135 = lshr i32 %a.1301, 1
+  %shr136 = lshr i32 %b.1300, 1
+  %tobool137 = icmp ne i32 %and133, 0
+  %tobool139 = icmp ne i32 %and134, 0
+  %18 = select i1 %tobool137, i1 %tobool139, i1 false
+  %land.ext141 = zext i1 %18 to i32
+  %add142 = add i32 %_ctt.2306, %land.ext141
+  %tobool144 = icmp eq i32 %and133, 0
+  %tobool147 = icmp eq i32 %and134, 0
+  %19 = select i1 %tobool144, i1 %tobool147, i1 false
+  %land.ext149 = zext i1 %19 to i32
+  %add150 = add i32 %_cff.2305, %land.ext149
+  %20 = select i1 %tobool137, i1 %tobool147, i1 false
+  %land.ext156 = zext i1 %20 to i32
+  %add157 = add i32 %_ctf.2304, %land.ext156
+  %21 = select i1 %tobool144, i1 %tobool139, i1 false
+  %land.ext163 = zext i1 %21 to i32
+  %add164 = add i32 %_cft.2303, %land.ext163
+  %dec165 = add nsw i32 %nbBoolBlock.2302, -1
+  %cmp130.not = icmp eq i32 %dec165, 0
+  br i1 %cmp130.not, label %while.end166, label %while.body132
+
+while.end166:                                     ; preds = %while.body132, %while.end122
+  %_cft.2.lcssa = phi i32 [ %_cft.0.lcssa, %while.end122 ], [ %add164, %while.body132 ]
+  %_ctf.2.lcssa = phi i32 [ %_ctf.0.lcssa, %while.end122 ], [ %add157, %while.body132 ]
+  %_cff.2.lcssa = phi i32 [ %_cff.0.lcssa, %while.end122 ], [ %add150, %while.body132 ]
+  %_ctt.2.lcssa = phi i32 [ %_ctt.0.lcssa, %while.end122 ], [ %add142, %while.body132 ]
+  store i32 %_ctt.2.lcssa, ptr %cTT, align 4
+  store i32 %_cff.2.lcssa, ptr %cFF, align 4
+  store i32 %_ctf.2.lcssa, ptr %cTF, align 4
+  store i32 %_cft.2.lcssa, ptr %cFT, align 4
+  ret void
+}
+
+define void @scalar(ptr nocapture noundef readonly %pA, ptr nocapture noundef readonly %pB, i32 noundef %numberOfBools, ptr nocapture noundef writeonly %cTT, ptr nocapture noundef writeonly %cFF, ptr nocapture noundef writeonly %cTF, ptr nocapture noundef writeonly %cFT) { ; counts, over numberOfBools bits read from pA and pB, the 1/1, 0/0, 1/0 and 0/1 bit pairs and stores the totals to cTT, cFF, cTF and cFT
+; CHECK-LABEL: @scalar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP117:%.*]] = icmp ugt i32 [[NUMBEROFBOOLS:%.*]], 31
+; CHECK-NEXT:    br i1 [[CMP117]], label [[WHILE_BODY:%.*]], label [[WHILE_END29:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[_CFT_0124:%.*]] = phi i32 [ [[ADD28:%.*]], [[WHILE_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[_CTF_0123:%.*]] = phi i32 [ [[ADD21:%.*]], [[WHILE_END]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[_CFF_0122:%.*]] = phi i32 [ [[ADD14:%.*]], [[WHILE_END]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[_CTT_0121:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_END]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[PA_ADDR_0120:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END]] ], [ [[PA:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[PB_ADDR_0119:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_END]] ], [ [[PB:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[NUMBEROFBOOLS_ADDR_0118:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END]] ], [ [[NUMBEROFBOOLS]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[PA_ADDR_0120]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PB_ADDR_0119]], align 4
+; CHECK-NEXT:    br label [[WHILE_BODY4:%.*]]
+; CHECK:       while.body4:
+; CHECK-NEXT:    [[SHIFT_0116:%.*]] = phi i32 [ 0, [[WHILE_BODY]] ], [ [[INC:%.*]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[B_0115:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY]] ], [ [[SHR6:%.*]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[A_0114:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY]] ], [ [[SHR:%.*]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[_CFT_1113:%.*]] = phi i32 [ [[_CFT_0124]], [[WHILE_BODY]] ], [ [[ADD28]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[_CTF_1112:%.*]] = phi i32 [ [[_CTF_0123]], [[WHILE_BODY]] ], [ [[ADD21]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[_CFF_1111:%.*]] = phi i32 [ [[_CFF_0122]], [[WHILE_BODY]] ], [ [[ADD14]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[_CTT_1110:%.*]] = phi i32 [ [[_CTT_0121]], [[WHILE_BODY]] ], [ [[ADD]], [[WHILE_BODY4]] ]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A_0114]], 1
+; CHECK-NEXT:    [[AND5:%.*]] = and i32 [[B_0115]], 1
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[A_0114]], 1
+; CHECK-NEXT:    [[SHR6]] = lshr i32 [[B_0115]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[TOBOOL7:%.*]] = icmp ne i32 [[AND5]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL7]], i1 false
+; CHECK-NEXT:    [[LAND_EXT:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD]] = add i32 [[_CTT_1110]], [[LAND_EXT]]
+; CHECK-NEXT:    [[TOBOOL8:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[AND5]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TOBOOL8]], i1 [[TOBOOL11]], i1 false
+; CHECK-NEXT:    [[LAND_EXT13:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD14]] = add i32 [[_CFF_1111]], [[LAND_EXT13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL11]], i1 false
+; CHECK-NEXT:    [[LAND_EXT20:%.*]] = zext i1 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD21]] = add i32 [[_CTF_1112]], [[LAND_EXT20]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TOBOOL8]], i1 [[TOBOOL7]], i1 false
+; CHECK-NEXT:    [[LAND_EXT27:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD28]] = add i32 [[_CFT_1113]], [[LAND_EXT27]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[SHIFT_0116]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[WHILE_END]], label [[WHILE_BODY4]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[PA_ADDR_0120]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[PB_ADDR_0119]], i64 1
+; CHECK-NEXT:    [[SUB]] = add i32 [[NUMBEROFBOOLS_ADDR_0118]], -32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SUB]], 31
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END29]]
+; CHECK:       while.end29:
+; CHECK-NEXT:    [[NUMBEROFBOOLS_ADDR_0_LCSSA:%.*]] = phi i32 [ [[NUMBEROFBOOLS]], [[ENTRY]] ], [ [[SUB]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[INCDEC_PTR1]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[INCDEC_PTR]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CTT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CFF_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD14]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CTF_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD21]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[_CFT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD28]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[CMP37_NOT131:%.*]] = icmp eq i32 [[NUMBEROFBOOLS_ADDR_0_LCSSA]], 0
+; CHECK-NEXT:    br i1 [[CMP37_NOT131]], label [[WHILE_END71:%.*]], label [[WHILE_BODY38_PREHEADER:%.*]]
+; CHECK:       while.body38.preheader:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[PB_ADDR_0_LCSSA]], align 4
+; CHECK-NEXT:    [[SUB32:%.*]] = sub nuw nsw i32 32, [[NUMBEROFBOOLS_ADDR_0_LCSSA]]
+; CHECK-NEXT:    [[SHR35:%.*]] = lshr i32 [[TMP6]], [[SUB32]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[PA_ADDR_0_LCSSA]], align 4
+; CHECK-NEXT:    [[SHR33:%.*]] = lshr i32 [[TMP7]], [[SUB32]]
+; CHECK-NEXT:    br label [[WHILE_BODY38:%.*]]
+; CHECK:       while.body38:
+; CHECK-NEXT:    [[B_1138:%.*]] = phi i32 [ [[SHR42:%.*]], [[WHILE_BODY38]] ], [ [[SHR35]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[A_1137:%.*]] = phi i32 [ [[SHR41:%.*]], [[WHILE_BODY38]] ], [ [[SHR33]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[_CFT_2136:%.*]] = phi i32 [ [[ADD70:%.*]], [[WHILE_BODY38]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[_CTF_2135:%.*]] = phi i32 [ [[ADD63:%.*]], [[WHILE_BODY38]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[_CFF_2134:%.*]] = phi i32 [ [[ADD56:%.*]], [[WHILE_BODY38]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[_CTT_2133:%.*]] = phi i32 [ [[ADD48:%.*]], [[WHILE_BODY38]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[NUMBEROFBOOLS_ADDR_1132:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY38]] ], [ [[NUMBEROFBOOLS_ADDR_0_LCSSA]], [[WHILE_BODY38_PREHEADER]] ]
+; CHECK-NEXT:    [[AND39:%.*]] = and i32 [[A_1137]], 1
+; CHECK-NEXT:    [[AND40:%.*]] = and i32 [[B_1138]], 1
+; CHECK-NEXT:    [[SHR41]] = lshr i32 [[A_1137]], 1
+; CHECK-NEXT:    [[SHR42]] = lshr i32 [[B_1138]], 1
+; CHECK-NEXT:    [[TOBOOL43:%.*]] = icmp ne i32 [[AND39]], 0
+; CHECK-NEXT:    [[TOBOOL45:%.*]] = icmp ne i32 [[AND40]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TOBOOL43]], i1 [[TOBOOL45]], i1 false
+; CHECK-NEXT:    [[LAND_EXT47:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD48]] = add i32 [[_CTT_2133]], [[LAND_EXT47]]
+; CHECK-NEXT:    [[TOBOOL50:%.*]] = icmp eq i32 [[AND39]], 0
+; CHECK-NEXT:    [[TOBOOL53:%.*]] = icmp eq i32 [[AND40]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TOBOOL50]], i1 [[TOBOOL53]], i1 false
+; CHECK-NEXT:    [[LAND_EXT55:%.*]] = zext i1 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD56]] = add i32 [[_CFF_2134]], [[LAND_EXT55]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TOBOOL43]], i1 [[TOBOOL53]], i1 false
+; CHECK-NEXT:    [[LAND_EXT62:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD63]] = add i32 [[_CTF_2135]], [[LAND_EXT62]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TOBOOL50]], i1 [[TOBOOL45]], i1 false
+; CHECK-NEXT:    [[LAND_EXT69:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD70]] = add i32 [[_CFT_2136]], [[LAND_EXT69]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[NUMBEROFBOOLS_ADDR_1132]], -1
+; CHECK-NEXT:    [[CMP37_NOT:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP37_NOT]], label [[WHILE_END71]], label [[WHILE_BODY38]]
+; CHECK:       while.end71:
+; CHECK-NEXT:    [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END29]] ], [ [[ADD48]], [[WHILE_BODY38]] ]
+; CHECK-NEXT:    [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END29]] ], [ [[ADD56]], [[WHILE_BODY38]] ]
+; CHECK-NEXT:    [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END29]] ], [ [[ADD63]], [[WHILE_BODY38]] ]
+; CHECK-NEXT:    [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END29]] ], [ [[ADD70]], [[WHILE_BODY38]] ]
+; CHECK-NEXT:    store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4
+; CHECK-NEXT:    store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4
+; CHECK-NEXT:    store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
-  %4 = extractelement <2 x i64> %1, i64 0
-  %5 = or i64 %4, 0
-  %6 = trunc i64 %5 to i32
-  %7 = extractelement <2 x i64> %0, i64 0
-  %8 = or i64 %7, 0
-  %9 = trunc i64 %8 to i32
-  %10 = extractelement <2 x i64> %2, i64 0
-  %11 = extractelement <2 x i64> %2, i64 1
-  %12 = or i64 %10, %11
-  %13 = trunc i64 %12 to i32
-  %14 = extractelement <2 x i64> %0, i64 0
-  %15 = or i64 %14, 0
-  %16 = trunc i64 %15 to i32
-  br label %17
-
-17:
-  %18 = phi i32 [ %22, %17 ], [ %6, %3 ]
-  %19 = phi i32 [ %23, %17 ], [ %9, %3 ]
-  %20 = phi i32 [ %24, %17 ], [ %13, %3 ]
-  %21 = phi i32 [ %25, %17 ], [ %16, %3 ]
-  %22 = or i32 %18, 0
-  %23 = add i32 0, 0
-  %24 = add i32 0, 0
-  %25 = add i32 0, 0
-  br label %17
+entry:
+  %cmp117 = icmp ugt i32 %numberOfBools, 31 ; true when at least one full 32-bit word is available
+  br i1 %cmp117, label %while.body, label %while.end29 ; fewer than 32 bits: skip the word loop
+
+while.body:                                       ; preds = %entry, %while.end
+  %_cft.0124 = phi i32 [ %add28, %while.end ], [ 0, %entry ] ; running 0/1 count, starts at 0
+  %_ctf.0123 = phi i32 [ %add21, %while.end ], [ 0, %entry ] ; running 1/0 count, starts at 0
+  %_cff.0122 = phi i32 [ %add14, %while.end ], [ 0, %entry ] ; running 0/0 count, starts at 0
+  %_ctt.0121 = phi i32 [ %add, %while.end ], [ 0, %entry ] ; running 1/1 count, starts at 0
+  %pA.addr.0120 = phi ptr [ %incdec.ptr, %while.end ], [ %pA, %entry ] ; current word of A
+  %pB.addr.0119 = phi ptr [ %incdec.ptr1, %while.end ], [ %pB, %entry ] ; current word of B
+  %numberOfBools.addr.0118 = phi i32 [ %sub, %while.end ], [ %numberOfBools, %entry ] ; bits still to process
+  %0 = load i32, ptr %pA.addr.0120, align 4 ; 32 bits of A
+  %1 = load i32, ptr %pB.addr.0119, align 4 ; 32 bits of B
+  br label %while.body4
+
+while.body4:                                      ; preds = %while.body, %while.body4
+  %shift.0116 = phi i32 [ 0, %while.body ], [ %inc, %while.body4 ] ; bit index inside the current word
+  %b.0115 = phi i32 [ %1, %while.body ], [ %shr6, %while.body4 ] ; B word, shifted right once per iteration
+  %a.0114 = phi i32 [ %0, %while.body ], [ %shr, %while.body4 ] ; A word, shifted right once per iteration
+  %_cft.1113 = phi i32 [ %_cft.0124, %while.body ], [ %add28, %while.body4 ]
+  %_ctf.1112 = phi i32 [ %_ctf.0123, %while.body ], [ %add21, %while.body4 ]
+  %_cff.1111 = phi i32 [ %_cff.0122, %while.body ], [ %add14, %while.body4 ]
+  %_ctt.1110 = phi i32 [ %_ctt.0121, %while.body ], [ %add, %while.body4 ]
+  %and = and i32 %a.0114, 1 ; current (lowest) bit of A
+  %and5 = and i32 %b.0115, 1 ; current (lowest) bit of B
+  %shr = lshr i32 %a.0114, 1 ; expose the next bit of A for the next iteration
+  %shr6 = lshr i32 %b.0115, 1 ; expose the next bit of B for the next iteration
+  %tobool = icmp ne i32 %and, 0 ; A bit is 1
+  %tobool7 = icmp ne i32 %and5, 0 ; B bit is 1
+  %2 = select i1 %tobool, i1 %tobool7, i1 false ; A bit && B bit
+  %land.ext = zext i1 %2 to i32
+  %add = add i32 %_ctt.1110, %land.ext ; 1/1 count += 1 when both bits are set
+  %tobool8 = icmp eq i32 %and, 0 ; A bit is 0
+  %tobool11 = icmp eq i32 %and5, 0 ; B bit is 0
+  %3 = select i1 %tobool8, i1 %tobool11, i1 false ; !A bit && !B bit
+  %land.ext13 = zext i1 %3 to i32
+  %add14 = add i32 %_cff.1111, %land.ext13 ; 0/0 count += 1 when both bits are clear
+  %4 = select i1 %tobool, i1 %tobool11, i1 false ; A bit && !B bit
+  %land.ext20 = zext i1 %4 to i32
+  %add21 = add i32 %_ctf.1112, %land.ext20 ; 1/0 count += 1
+  %5 = select i1 %tobool8, i1 %tobool7, i1 false ; !A bit && B bit
+  %land.ext27 = zext i1 %5 to i32
+  %add28 = add i32 %_cft.1113, %land.ext27 ; 0/1 count += 1
+  %inc = add nuw nsw i32 %shift.0116, 1
+  %exitcond.not = icmp eq i32 %inc, 32 ; all 32 bits of the word consumed
+  br i1 %exitcond.not, label %while.end, label %while.body4
+
+while.end:                                        ; preds = %while.body4
+  %incdec.ptr = getelementptr inbounds i32, ptr %pA.addr.0120, i64 1 ; advance A by one word
+  %incdec.ptr1 = getelementptr inbounds i32, ptr %pB.addr.0119, i64 1 ; advance B by one word
+  %sub = add i32 %numberOfBools.addr.0118, -32 ; 32 fewer bits remain
+  %cmp = icmp ugt i32 %sub, 31 ; another full word left?
+  br i1 %cmp, label %while.body, label %while.end29
+
+while.end29:                                      ; preds = %while.end, %entry
+  %numberOfBools.addr.0.lcssa = phi i32 [ %numberOfBools, %entry ], [ %sub, %while.end ] ; leftover bit count (at most 31 on either incoming edge)
+  %pB.addr.0.lcssa = phi ptr [ %pB, %entry ], [ %incdec.ptr1, %while.end ]
+  %pA.addr.0.lcssa = phi ptr [ %pA, %entry ], [ %incdec.ptr, %while.end ]
+  %_ctt.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end ]
+  %_cff.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %while.end ]
+  %_ctf.0.lcssa = phi i32 [ 0, %entry ], [ %add21, %while.end ]
+  %_cft.0.lcssa = phi i32 [ 0, %entry ], [ %add28, %while.end ]
+  %cmp37.not131 = icmp eq i32 %numberOfBools.addr.0.lcssa, 0 ; no leftover bits: go straight to the stores
+  br i1 %cmp37.not131, label %while.end71, label %while.body38.preheader
+
+while.body38.preheader:                           ; preds = %while.end29
+  %6 = load i32, ptr %pB.addr.0.lcssa, align 4 ; partial last word of B
+  %sub32 = sub nuw nsw i32 32, %numberOfBools.addr.0.lcssa ; shift amount = 32 - leftover bits
+  %shr35 = lshr i32 %6, %sub32 ; move the top 'leftover' bits of B down to bit 0
+  %7 = load i32, ptr %pA.addr.0.lcssa, align 4 ; partial last word of A
+  %shr33 = lshr i32 %7, %sub32 ; move the top 'leftover' bits of A down to bit 0
+  br label %while.body38
+
+while.body38:                                     ; preds = %while.body38.preheader, %while.body38
+  %b.1138 = phi i32 [ %shr42, %while.body38 ], [ %shr35, %while.body38.preheader ]
+  %a.1137 = phi i32 [ %shr41, %while.body38 ], [ %shr33, %while.body38.preheader ]
+  %_cft.2136 = phi i32 [ %add70, %while.body38 ], [ %_cft.0.lcssa, %while.body38.preheader ]
+  %_ctf.2135 = phi i32 [ %add63, %while.body38 ], [ %_ctf.0.lcssa, %while.body38.preheader ]
+  %_cff.2134 = phi i32 [ %add56, %while.body38 ], [ %_cff.0.lcssa, %while.body38.preheader ]
+  %_ctt.2133 = phi i32 [ %add48, %while.body38 ], [ %_ctt.0.lcssa, %while.body38.preheader ]
+  %numberOfBools.addr.1132 = phi i32 [ %dec, %while.body38 ], [ %numberOfBools.addr.0.lcssa, %while.body38.preheader ] ; leftover bits still to process
+  %and39 = and i32 %a.1137, 1 ; current bit of A
+  %and40 = and i32 %b.1138, 1 ; current bit of B
+  %shr41 = lshr i32 %a.1137, 1
+  %shr42 = lshr i32 %b.1138, 1
+  %tobool43 = icmp ne i32 %and39, 0 ; A bit is 1
+  %tobool45 = icmp ne i32 %and40, 0 ; B bit is 1
+  %8 = select i1 %tobool43, i1 %tobool45, i1 false ; A bit && B bit
+  %land.ext47 = zext i1 %8 to i32
+  %add48 = add i32 %_ctt.2133, %land.ext47 ; 1/1 count
+  %tobool50 = icmp eq i32 %and39, 0 ; A bit is 0
+  %tobool53 = icmp eq i32 %and40, 0 ; B bit is 0
+  %9 = select i1 %tobool50, i1 %tobool53, i1 false ; !A bit && !B bit
+  %land.ext55 = zext i1 %9 to i32
+  %add56 = add i32 %_cff.2134, %land.ext55 ; 0/0 count
+  %10 = select i1 %tobool43, i1 %tobool53, i1 false ; A bit && !B bit
+  %land.ext62 = zext i1 %10 to i32
+  %add63 = add i32 %_ctf.2135, %land.ext62 ; 1/0 count
+  %11 = select i1 %tobool50, i1 %tobool45, i1 false ; !A bit && B bit
+  %land.ext69 = zext i1 %11 to i32
+  %add70 = add i32 %_cft.2136, %land.ext69 ; 0/1 count
+  %dec = add nsw i32 %numberOfBools.addr.1132, -1
+  %cmp37.not = icmp eq i32 %dec, 0 ; last leftover bit handled
+  br i1 %cmp37.not, label %while.end71, label %while.body38
+
+while.end71:                                      ; preds = %while.body38, %while.end29
+  %_ctt.2.lcssa = phi i32 [ %_ctt.0.lcssa, %while.end29 ], [ %add48, %while.body38 ]
+  %_cff.2.lcssa = phi i32 [ %_cff.0.lcssa, %while.end29 ], [ %add56, %while.body38 ]
+  %_ctf.2.lcssa = phi i32 [ %_ctf.0.lcssa, %while.end29 ], [ %add63, %while.body38 ]
+  %_cft.2.lcssa = phi i32 [ %_cft.0.lcssa, %while.end29 ], [ %add70, %while.body38 ]
+  store i32 %_ctt.2.lcssa, ptr %cTT, align 4 ; publish the 1/1 total
+  store i32 %_cff.2.lcssa, ptr %cFF, align 4 ; publish the 0/0 total
+  store i32 %_ctf.2.lcssa, ptr %cTF, align 4 ; publish the 1/0 total
+  store i32 %_cft.2.lcssa, ptr %cFT, align 4 ; publish the 0/1 total
+  ret void
 }
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #1 ; per-byte population count
+declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) #2 ; NEON UADDLP: add adjacent unsigned i8 pairs into i16 lanes
+declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) #2 ; NEON UADDLP: add adjacent unsigned i16 pairs into i32 lanes
+declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) #2 ; NEON UADDLP: add adjacent unsigned i32 pairs into i64 lanes