[clang] [llvm] [NFC][PowerPC] Add test case for lockdown of vector compare greater than support for Zero vector comparisons (PR #147246)

Mon Jul 7 00:41:23 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: None (Himadhith)

<details>
<summary>Changes</summary>

NFC patch that adds test cases to lock down support for zero-vector comparisons using the `vcmpgtuh` (vector compare greater than unsigned halfword) instruction.
Currently, `vcmpequh` (vector compare equal unsigned halfword) is used.

---
Full diff: https://github.com/llvm/llvm-project/pull/147246.diff


2 Files Affected:

- (added) clang/test/CodeGen/PowerPC/check-zero-vector.c (+143) 
- (added) llvm/test/CodeGen/PowerPC/check-zero-vector.ll (+239) 


``````````diff

diff --git a/clang/test/CodeGen/PowerPC/check-zero-vector.c b/clang/test/CodeGen/PowerPC/check-zero-vector.c
new file mode 100644
index 0000000000000..82f782faf9a5e
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/check-zero-vector.c
@@ -0,0 +1,143 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1  -triple powerpc64-unknown-unknown -emit-llvm %s -o -  | FileCheck %s --check-prefix=POWERPC_64
+// RUN: %clang_cc1  -triple powerpc64le-unknown-unknown -emit-llvm %s -o -  | FileCheck %s --check-prefix=POWERPC_64LE
+// RUN: %clang_cc1  -triple powerpc-unknown-unknown -emit-llvm %s -o -  | FileCheck %s --check-prefix=POWERPC_32
+
+// POWERPC_64-LABEL: define dso_local signext i32 @test_Greater_than(
+// POWERPC_64-SAME: ptr noundef [[COLAUTHS:%.*]]) #[[ATTR0:[0-9]+]] {
+// POWERPC_64-NEXT:  [[ENTRY:.*:]]
+// POWERPC_64-NEXT:    [[COLAUTHS_ADDR:%.*]] = alloca ptr, align 8
+// POWERPC_64-NEXT:    [[RESULT:%.*]] = alloca i16, align 2
+// POWERPC_64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// POWERPC_64-NEXT:    store ptr [[COLAUTHS]], ptr [[COLAUTHS_ADDR]], align 8
+// POWERPC_64-NEXT:    store i16 0, ptr [[RESULT]], align 2
+// POWERPC_64-NEXT:    store i32 0, ptr [[I]], align 4
+// POWERPC_64-NEXT:    br label %[[FOR_COND:.*]]
+// POWERPC_64:       [[FOR_COND]]:
+// POWERPC_64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// POWERPC_64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// POWERPC_64:       [[FOR_BODY]]:
+// POWERPC_64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[COLAUTHS_ADDR]], align 8
+// POWERPC_64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP2]] to i64
+// POWERPC_64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 [[IDXPROM]]
+// POWERPC_64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// POWERPC_64-NEXT:    [[CONV:%.*]] = zext i16 [[TMP3]] to i32
+// POWERPC_64-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[CONV]], 0
+// POWERPC_64-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// POWERPC_64:       [[IF_THEN]]:
+// POWERPC_64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[RESULT]], align 2
+// POWERPC_64-NEXT:    [[INC:%.*]] = add i16 [[TMP4]], 1
+// POWERPC_64-NEXT:    store i16 [[INC]], ptr [[RESULT]], align 2
+// POWERPC_64-NEXT:    br label %[[IF_END]]
+// POWERPC_64:       [[IF_END]]:
+// POWERPC_64-NEXT:    br label %[[FOR_INC:.*]]
+// POWERPC_64:       [[FOR_INC]]:
+// POWERPC_64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_64-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP5]], 1
+// POWERPC_64-NEXT:    store i32 [[INC3]], ptr [[I]], align 4
+// POWERPC_64-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+// POWERPC_64:       [[FOR_END]]:
+// POWERPC_64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[RESULT]], align 2
+// POWERPC_64-NEXT:    [[CONV4:%.*]] = zext i16 [[TMP6]] to i32
+// POWERPC_64-NEXT:    ret i32 [[CONV4]]
+//
+// POWERPC_64LE-LABEL: define dso_local signext i32 @test_Greater_than(
+// POWERPC_64LE-SAME: ptr noundef [[COLAUTHS:%.*]]) #[[ATTR0:[0-9]+]] {
+// POWERPC_64LE-NEXT:  [[ENTRY:.*:]]
+// POWERPC_64LE-NEXT:    [[COLAUTHS_ADDR:%.*]] = alloca ptr, align 8
+// POWERPC_64LE-NEXT:    [[RESULT:%.*]] = alloca i16, align 2
+// POWERPC_64LE-NEXT:    [[I:%.*]] = alloca i32, align 4
+// POWERPC_64LE-NEXT:    store ptr [[COLAUTHS]], ptr [[COLAUTHS_ADDR]], align 8
+// POWERPC_64LE-NEXT:    store i16 0, ptr [[RESULT]], align 2
+// POWERPC_64LE-NEXT:    store i32 0, ptr [[I]], align 4
+// POWERPC_64LE-NEXT:    br label %[[FOR_COND:.*]]
+// POWERPC_64LE:       [[FOR_COND]]:
+// POWERPC_64LE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_64LE-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// POWERPC_64LE-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// POWERPC_64LE:       [[FOR_BODY]]:
+// POWERPC_64LE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[COLAUTHS_ADDR]], align 8
+// POWERPC_64LE-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_64LE-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP2]] to i64
+// POWERPC_64LE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 [[IDXPROM]]
+// POWERPC_64LE-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// POWERPC_64LE-NEXT:    [[CONV:%.*]] = zext i16 [[TMP3]] to i32
+// POWERPC_64LE-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[CONV]], 0
+// POWERPC_64LE-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// POWERPC_64LE:       [[IF_THEN]]:
+// POWERPC_64LE-NEXT:    [[TMP4:%.*]] = load i16, ptr [[RESULT]], align 2
+// POWERPC_64LE-NEXT:    [[INC:%.*]] = add i16 [[TMP4]], 1
+// POWERPC_64LE-NEXT:    store i16 [[INC]], ptr [[RESULT]], align 2
+// POWERPC_64LE-NEXT:    br label %[[IF_END]]
+// POWERPC_64LE:       [[IF_END]]:
+// POWERPC_64LE-NEXT:    br label %[[FOR_INC:.*]]
+// POWERPC_64LE:       [[FOR_INC]]:
+// POWERPC_64LE-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_64LE-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP5]], 1
+// POWERPC_64LE-NEXT:    store i32 [[INC3]], ptr [[I]], align 4
+// POWERPC_64LE-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+// POWERPC_64LE:       [[FOR_END]]:
+// POWERPC_64LE-NEXT:    [[TMP6:%.*]] = load i16, ptr [[RESULT]], align 2
+// POWERPC_64LE-NEXT:    [[CONV4:%.*]] = zext i16 [[TMP6]] to i32
+// POWERPC_64LE-NEXT:    ret i32 [[CONV4]]
+//
+// POWERPC_32-LABEL: define dso_local i32 @test_Greater_than(
+// POWERPC_32-SAME: ptr noundef [[COLAUTHS:%.*]]) #[[ATTR0:[0-9]+]] {
+// POWERPC_32-NEXT:  [[ENTRY:.*:]]
+// POWERPC_32-NEXT:    [[COLAUTHS_ADDR:%.*]] = alloca ptr, align 4
+// POWERPC_32-NEXT:    [[RESULT:%.*]] = alloca i16, align 2
+// POWERPC_32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// POWERPC_32-NEXT:    store ptr [[COLAUTHS]], ptr [[COLAUTHS_ADDR]], align 4
+// POWERPC_32-NEXT:    store i16 0, ptr [[RESULT]], align 2
+// POWERPC_32-NEXT:    store i32 0, ptr [[I]], align 4
+// POWERPC_32-NEXT:    br label %[[FOR_COND:.*]]
+// POWERPC_32:       [[FOR_COND]]:
+// POWERPC_32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// POWERPC_32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// POWERPC_32:       [[FOR_BODY]]:
+// POWERPC_32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[COLAUTHS_ADDR]], align 4
+// POWERPC_32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 [[TMP2]]
+// POWERPC_32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// POWERPC_32-NEXT:    [[CONV:%.*]] = zext i16 [[TMP3]] to i32
+// POWERPC_32-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[CONV]], 0
+// POWERPC_32-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// POWERPC_32:       [[IF_THEN]]:
+// POWERPC_32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[RESULT]], align 2
+// POWERPC_32-NEXT:    [[INC:%.*]] = add i16 [[TMP4]], 1
+// POWERPC_32-NEXT:    store i16 [[INC]], ptr [[RESULT]], align 2
+// POWERPC_32-NEXT:    br label %[[IF_END]]
+// POWERPC_32:       [[IF_END]]:
+// POWERPC_32-NEXT:    br label %[[FOR_INC:.*]]
+// POWERPC_32:       [[FOR_INC]]:
+// POWERPC_32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// POWERPC_32-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP5]], 1
+// POWERPC_32-NEXT:    store i32 [[INC3]], ptr [[I]], align 4
+// POWERPC_32-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+// POWERPC_32:       [[FOR_END]]:
+// POWERPC_32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[RESULT]], align 2
+// POWERPC_32-NEXT:    [[CONV4:%.*]] = zext i16 [[TMP6]] to i32
+// POWERPC_32-NEXT:    ret i32 [[CONV4]]
+//
+int test_Greater_than(unsigned short *colauths) { // Returns how many of colauths[0..3] are greater than zero.
+  unsigned short result = 0; // running count, widened to int by the return
+  for (int i = 0; i < 4; i++) { // fixed trip count of 4
+    if (colauths[i] > 0) { // element is unsigned short, so > 0 holds exactly when it is nonzero
+      result++;
+    }
+  }
+  return result;
+}
+//.
+// POWERPC_64: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+// POWERPC_64: [[META3]] = !{!"llvm.loop.mustprogress"}
+//.
+// POWERPC_64LE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+// POWERPC_64LE: [[META3]] = !{!"llvm.loop.mustprogress"}
+//.
+// POWERPC_32: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+// POWERPC_32: [[META3]] = !{!"llvm.loop.mustprogress"}
+//.
diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll
new file mode 100644
index 0000000000000..82f97dd997478
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll
@@ -0,0 +1,239 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:    < %s | FileCheck %s --check-prefix=POWERPC_64LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
+; RUN:  < %s | FileCheck %s --check-prefix=POWERPC_64
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
+; RUN:   < %s | FileCheck %s --check-prefix=POWERPC_32
+
+define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; counts the nonzero i16 elements in %colauths[0 .. %ncols)
+; POWERPC_64LE-LABEL: test_Greater_than:
+; POWERPC_64LE:  .LBB0_6: # %vector.body
+; POWERPC_64LE-DAG:    #
+; POWERPC_64LE-DAG:    lxv 50, -64(4)
+; POWERPC_64LE-DAG:    vcmpequh 18, 18, 3
+; POWERPC_64LE-DAG:    xxlnor 50, 50, 50
+; POWERPC_64LE-DAG:    vmrghh 19, 18, 18
+; POWERPC_64LE-DAG:    vmrglh 18, 18, 18
+; POWERPC_64LE-DAG:    xxland 51, 51, 34
+; POWERPC_64LE-DAG:    xxland 50, 50, 34
+; POWERPC_64LE-DAG:    vadduwm 5, 5, 19
+; POWERPC_64LE:  .LBB0_10: # %vec.epilog.vector.body
+; POWERPC_64LE-DAG:    #
+; POWERPC_64LE-DAG:    lxv 32, 0(4)
+; POWERPC_64LE-DAG:    addi 4, 4, 16
+; POWERPC_64LE-DAG:    vcmpequh 0, 0, 4
+; POWERPC_64LE-DAG:    xxlnor 32, 32, 32
+; POWERPC_64LE-DAG:    vmrglh 1, 0, 0
+; POWERPC_64LE-DAG:    vmrghh 0, 0, 0
+; POWERPC_64LE-DAG:    xxland 33, 33, 34
+; POWERPC_64LE-DAG:    xxland 32, 32, 34
+; POWERPC_64LE-DAG:    vadduwm 5, 5, 0
+; POWERPC_64LE-DAG:    vadduwm 3, 3, 1
+; POWERPC_64LE-DAG:    bdnz .LBB0_10
+; POWERPC_64LE:    blr
+;
+; POWERPC_64-LABEL: test_Greater_than:
+; POWERPC_64:  L..BB0_6: # %vector.body
+; POWERPC_64-DAG:    #
+; POWERPC_64-DAG:    lxv 50, -64(4)
+; POWERPC_64-DAG:    vcmpequh 18, 18, 3
+; POWERPC_64-DAG:    xxlnor 50, 50, 50
+; POWERPC_64-DAG:    vmrglh 19, 18, 18
+; POWERPC_64-DAG:    vmrghh 18, 18, 18
+; POWERPC_64-DAG:    xxland 51, 51, 34
+; POWERPC_64-DAG:    xxland 50, 50, 34
+; POWERPC_64-DAG:    vadduwm 5, 5, 19
+; POWERPC_64:  L..BB0_10: # %vec.epilog.vector.body
+; POWERPC_64-DAG:    #
+; POWERPC_64-DAG:    lxv 32, 0(4)
+; POWERPC_64-DAG:    addi 4, 4, 16
+; POWERPC_64-DAG:    vcmpequh 0, 0, 4
+; POWERPC_64-DAG:    xxlnor 32, 32, 32
+; POWERPC_64-DAG:    vmrghh 1, 0, 0
+; POWERPC_64-DAG:    vmrglh 0, 0, 0
+; POWERPC_64-DAG:    xxland 33, 33, 34
+; POWERPC_64-DAG:    xxland 32, 32, 34
+; POWERPC_64-DAG:    vadduwm 5, 5, 0
+; POWERPC_64-DAG:    vadduwm 3, 3, 1
+; POWERPC_64-DAG:    bdnz L..BB0_10
+; POWERPC_64:    blr
+;
+; POWERPC_32-LABEL: test_Greater_than:
+; POWERPC_32:  L..BB0_7: # %vector.body
+; POWERPC_32-DAG:    #
+; POWERPC_32-DAG:    lxv 50, 0(10)
+; POWERPC_32-DAG:    addic 9, 9, 64
+; POWERPC_32-DAG:    addze 5, 5
+; POWERPC_32-DAG:    xor 11, 9, 6
+; POWERPC_32-DAG:    or. 11, 11, 5
+; POWERPC_32-DAG:    vcmpequh 18, 18, 3
+; POWERPC_32-DAG:    xxlnor 50, 50, 50
+; POWERPC_32-DAG:    vmrglh 19, 18, 18
+; POWERPC_32-DAG:    vmrghh 18, 18, 18
+; POWERPC_32-DAG:    xxland 51, 51, 34
+; POWERPC_32-DAG:    xxland 50, 50, 34
+; POWERPC_32-DAG:    vadduwm 5, 5, 19
+; POWERPC_32:  L..BB0_11: # %vec.epilog.vector.body
+; POWERPC_32-DAG:    #
+; POWERPC_32-DAG:    slwi 5, 9, 1
+; POWERPC_32-DAG:    addic 9, 9, 8
+; POWERPC_32-DAG:    addze 7, 7
+; POWERPC_32-DAG:    lxvx 32, 3, 5
+; POWERPC_32-DAG:    xor 5, 9, 6
+; POWERPC_32-DAG:    or. 5, 5, 7
+; POWERPC_32-DAG:    vcmpequh 0, 0, 3
+; POWERPC_32-DAG:    xxlnor 32, 32, 32
+; POWERPC_32-DAG:    vmrghh 1, 0, 0
+; POWERPC_32-DAG:    vmrglh 0, 0, 0
+; POWERPC_32-DAG:    xxland 33, 33, 34
+; POWERPC_32-DAG:    xxland 32, 32, 34
+; POWERPC_32-DAG:    vadduwm 5, 5, 0
+; POWERPC_32-DAG:    vadduwm 4, 4, 1
+; POWERPC_32-DAG:    bne 0, L..BB0_11
+; POWERPC_32:    blr
+    entry:
+  %cmp5 = icmp sgt i32 %ncols, 0
+  br i1 %cmp5, label %iter.check, label %for.cond.cleanup ; %ncols <= 0: the result is 0
+
+iter.check:                                       ; preds = %entry
+  %wide.trip.count = zext nneg i32 %ncols to i64
+  %min.iters.check = icmp ult i32 %ncols, 8 ; fewer than 8 elements: only the scalar loop is used
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+for.body.preheader:                               ; preds = %vec.epilog.iter.check, %vec.epilog.middle.block, %iter.check
+  %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec31, %vec.epilog.middle.block ]
+  %num_cols_needed.06.ph = phi i32 [ 0, %iter.check ], [ %33, %vec.epilog.iter.check ], [ %40, %vec.epilog.middle.block ]
+  br label %for.body
+
+vector.main.loop.iter.check:                      ; preds = %iter.check
+  %min.iters.check9 = icmp ult i32 %ncols, 64 ; fewer than 64 elements: skip the main vector loop, go to the 8-wide epilogue
+  br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph
+
+vector.ph:                                        ; preds = %vector.main.loop.iter.check
+  %n.vec = and i64 %wide.trip.count, 2147483584 ; trip count rounded down to a multiple of 64
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %24, %vector.body ]
+  %vec.phi10 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %25, %vector.body ]
+  %vec.phi11 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %26, %vector.body ]
+  %vec.phi12 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %27, %vector.body ]
+  %vec.phi13 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %28, %vector.body ]
+  %vec.phi14 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %29, %vector.body ]
+  %vec.phi15 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %30, %vector.body ]
+  %vec.phi16 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %31, %vector.body ]
+  %0 = getelementptr inbounds nuw i16, ptr %colauths, i64 %index ; start of this iteration's 64-element chunk
+  %1 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %2 = getelementptr inbounds nuw i8, ptr %0, i64 32
+  %3 = getelementptr inbounds nuw i8, ptr %0, i64 48
+  %4 = getelementptr inbounds nuw i8, ptr %0, i64 64
+  %5 = getelementptr inbounds nuw i8, ptr %0, i64 80
+  %6 = getelementptr inbounds nuw i8, ptr %0, i64 96
+  %7 = getelementptr inbounds nuw i8, ptr %0, i64 112
+  %wide.load = load <8 x i16>, ptr %0, align 2, !tbaa !5
+  %wide.load17 = load <8 x i16>, ptr %1, align 2, !tbaa !5
+  %wide.load18 = load <8 x i16>, ptr %2, align 2, !tbaa !5
+  %wide.load19 = load <8 x i16>, ptr %3, align 2, !tbaa !5
+  %wide.load20 = load <8 x i16>, ptr %4, align 2, !tbaa !5
+  %wide.load21 = load <8 x i16>, ptr %5, align 2, !tbaa !5
+  %wide.load22 = load <8 x i16>, ptr %6, align 2, !tbaa !5
+  %wide.load23 = load <8 x i16>, ptr %7, align 2, !tbaa !5
+  %8 = icmp ne <8 x i16> %wide.load, zeroinitializer ; lane is true where the element is nonzero
+  %9 = icmp ne <8 x i16> %wide.load17, zeroinitializer
+  %10 = icmp ne <8 x i16> %wide.load18, zeroinitializer
+  %11 = icmp ne <8 x i16> %wide.load19, zeroinitializer
+  %12 = icmp ne <8 x i16> %wide.load20, zeroinitializer
+  %13 = icmp ne <8 x i16> %wide.load21, zeroinitializer
+  %14 = icmp ne <8 x i16> %wide.load22, zeroinitializer
+  %15 = icmp ne <8 x i16> %wide.load23, zeroinitializer
+  %16 = zext <8 x i1> %8 to <8 x i32> ; widen each lane to an i32 0/1
+  %17 = zext <8 x i1> %9 to <8 x i32>
+  %18 = zext <8 x i1> %10 to <8 x i32>
+  %19 = zext <8 x i1> %11 to <8 x i32>
+  %20 = zext <8 x i1> %12 to <8 x i32>
+  %21 = zext <8 x i1> %13 to <8 x i32>
+  %22 = zext <8 x i1> %14 to <8 x i32>
+  %23 = zext <8 x i1> %15 to <8 x i32>
+  %24 = add <8 x i32> %vec.phi, %16 ; accumulate per-lane counts, one accumulator per load
+  %25 = add <8 x i32> %vec.phi10, %17
+  %26 = add <8 x i32> %vec.phi11, %18
+  %27 = add <8 x i32> %vec.phi12, %19
+  %28 = add <8 x i32> %vec.phi13, %20
+  %29 = add <8 x i32> %vec.phi14, %21
+  %30 = add <8 x i32> %vec.phi15, %22
+  %31 = add <8 x i32> %vec.phi16, %23
+  %index.next = add nuw i64 %index, 64 ; 8 loads x 8 lanes consumed per iteration
+  %32 = icmp eq i64 %index.next, %n.vec
+  br i1 %32, label %middle.block, label %vector.body, !llvm.loop !9
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <8 x i32> %25, %24 ; fold the eight partial accumulators into one vector
+  %bin.rdx24 = add <8 x i32> %26, %bin.rdx
+  %bin.rdx25 = add <8 x i32> %27, %bin.rdx24
+  %bin.rdx26 = add <8 x i32> %28, %bin.rdx25
+  %bin.rdx27 = add <8 x i32> %29, %bin.rdx26
+  %bin.rdx28 = add <8 x i32> %30, %bin.rdx27
+  %bin.rdx29 = add <8 x i32> %31, %bin.rdx28
+  %33 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bin.rdx29) ; horizontal sum of the 8 lanes
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count ; true when the main loop consumed every element
+  br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check
+
+vec.epilog.iter.check:                            ; preds = %middle.block
+  %n.vec.remaining = and i64 %wide.trip.count, 56 ; leftover after the 64-wide loop, rounded down to a multiple of 8
+  %min.epilog.iters.check = icmp eq i64 %n.vec.remaining, 0
+  br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph:                                    ; preds = %vec.epilog.iter.check, %vector.main.loop.iter.check
+  %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+  %bc.merge.rdx = phi i32 [ %33, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ] ; count carried over from the main loop, 0 if it was skipped
+  %n.vec31 = and i64 %wide.trip.count, 2147483640 ; trip count rounded down to a multiple of 8
+  %34 = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %bc.merge.rdx, i64 0 ; seed lane 0 with the carried count, other lanes start at 0
+  br label %vec.epilog.vector.body
+
+vec.epilog.vector.body:                           ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+  %index32 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next35, %vec.epilog.vector.body ]
+  %vec.phi33 = phi <8 x i32> [ %34, %vec.epilog.ph ], [ %38, %vec.epilog.vector.body ]
+  %35 = getelementptr inbounds nuw i16, ptr %colauths, i64 %index32
+  %wide.load34 = load <8 x i16>, ptr %35, align 2, !tbaa !5
+  %36 = icmp ne <8 x i16> %wide.load34, zeroinitializer ; same nonzero test as the main loop, one vector at a time
+  %37 = zext <8 x i1> %36 to <8 x i32>
+  %38 = add <8 x i32> %vec.phi33, %37
+  %index.next35 = add nuw i64 %index32, 8
+  %39 = icmp eq i64 %index.next35, %n.vec31
+  br i1 %39, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !llvm.loop !13
+
+vec.epilog.middle.block:                          ; preds = %vec.epilog.vector.body
+  %40 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %38) ; horizontal sum, includes the count seeded in lane 0
+  %cmp.n36 = icmp eq i64 %n.vec31, %wide.trip.count
+  br i1 %cmp.n36, label %for.cond.cleanup, label %for.body.preheader
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+  %num_cols_needed.0.lcssa = phi i32 [ 0, %entry ], [ %33, %middle.block ], [ %40, %vec.epilog.middle.block ], [ %spec.select, %for.body ]
+  ret i32 %num_cols_needed.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+  %num_cols_needed.06 = phi i32 [ %spec.select, %for.body ], [ %num_cols_needed.06.ph, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds nuw i16, ptr %colauths, i64 %indvars.iv
+  %41 = load i16, ptr %arrayidx, align 2, !tbaa !5
+  %tobool.not = icmp ne i16 %41, 0 ; scalar remainder loop: test one element per iteration
+  %inc = zext i1 %tobool.not to i32
+  %spec.select = add nuw nsw i32 %num_cols_needed.06, %inc ; add 0 or 1 to the running count
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !14
+}
+
+!5 = !{!6, !6, i64 0} ; TBAA access tag used by every i16 load above: type "short" at offset 0
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = distinct !{!9, !10, !11, !12} ; loop metadata attached to the vector.body back-edge
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.isvectorized", i32 1}
+!12 = !{!"llvm.loop.unroll.runtime.disable"}
+!13 = distinct !{!13, !10, !11, !12} ; loop metadata attached to the vec.epilog.vector.body back-edge
+!14 = distinct !{!14, !10, !12, !11} ; loop metadata attached to the scalar for.body back-edge

``````````

</details>


https://github.com/llvm/llvm-project/pull/147246