[llvm] 346185c - [AArch64] Improve codegen of vectorised early exit loops (#119534)

Mon Jan 6 05:17:17 PST 2025

Author: David Sherwood
Date: 2025-01-06T13:17:14Z
New Revision: 346185c42c59c344fcf0d9fd476c85d287181baf

URL: https://github.com/llvm/llvm-project/commit/346185c42c59c344fcf0d9fd476c85d287181baf
DIFF: https://github.com/llvm/llvm-project/commit/346185c42c59c344fcf0d9fd476c85d287181baf.diff

LOG: [AArch64] Improve codegen of vectorised early exit loops (#119534)

Once PR #112138 lands we are able to start vectorising more loops
that have uncountable early exits. The typical loop structure
looks like this:

vector.body:
  ...
  %pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat
  ...
  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred)
  %iv.cmp = icmp eq i64 %index.next, 4
  %exit.cond = or i1 %or.reduc, %iv.cmp
  br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
  br i1 %or.reduc, label %found, label %notfound

found:
  ret i64 1

notfound:
  ret i64 0

The problem with this is that %or.reduc is kept live after the loop,
and since this is a boolean it typically requires making a copy of
the condition code register. For AArch64 this requires an additional
cset instruction, which is quite expensive for a typical find loop
that only contains 6 or 7 instructions.

This patch attempts to improve the codegen by sinking the reduction
out of the loop to the location of it's user. It's a lot cheaper to
keep the predicate alive if the type is legal and has lots of
registers for it. There is a potential downside in that a little
more work is required after the loop, but I believe this is worth
it since we are likely to spend most of our time in the loop.

Added: 
    llvm/test/CodeGen/AArch64/reduce-or-opt.ll
    llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 72cc0bf7e7dd53..5abe69ef1c9ccf 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5290,11 +5290,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
   }
 
-  // Sink vscales closer to uses for better isel
+  auto ShouldSinkCondition = [](Value *Cond) -> bool {
+    auto *II = dyn_cast<IntrinsicInst>(Cond);
+    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
+           isa<ScalableVectorType>(II->getOperand(0)->getType());
+  };
+
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
   case Instruction::Add:
   case Instruction::Sub:
+    // Sink vscales closer to uses for better isel
     for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
       if (shouldSinkVScale(I->getOperand(Op), Ops)) {
         Ops.push_back(&I->getOperandUse(Op));
@@ -5302,6 +5308,23 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
       }
     }
     break;
+  case Instruction::Select: {
+    if (!ShouldSinkCondition(I->getOperand(0)))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
+  case Instruction::Br: {
+    if (cast<BranchInst>(I)->isUnconditional())
+      return false;
+
+    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
   default:
     break;
   }

diff  --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
new file mode 100644
index 00000000000000..f5df5ea53c990d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB0_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  .LBB0_3: // %middle.split
+; CHECK-NEXT:    and x0, x9, #0x1
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB1_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  .LBB1_3: // %middle.split
+; CHECK-NEXT:    tbz w9, #0, .LBB1_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB2_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    b.ne .LBB2_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB2_1
+; CHECK-NEXT:  .LBB2_3: // %middle.split
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB3_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    b.ne .LBB3_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %middle.split
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    b.eq .LBB3_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)

diff  --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
new file mode 100644
index 00000000000000..52257c10b0bf6d
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP3]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)