[clang] [llvm] [LV] Support generating masks for switch terminators. (PR #99808)

Sun Jul 21 04:17:53 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

Update createEdgeMask to created masks where the terminator in Src is a switch. We need to handle 2 separate cases:

1. Dst is not the default desintation. Dst is reached if any of the cases with destination == Dst are taken. Join the conditions for each case where destination == Dst using a logical OR.
2. Dst is the default destination. Dst is reached if none of the cases with destination != Dst are taken. Join the conditions for each case where the destination is != Dst using a logical OR and negate it.

Fixes https://github.com/llvm/llvm-project/issues/48188.

---

Patch is 84.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99808.diff


7 Files Affected:

- (modified) clang/test/Frontend/optimization-remark-analysis.c (+1-1) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+5-5) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+35) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll (+623-24) 
- (modified) llvm/test/Transforms/LoopVectorize/no_switch.ll (+5-7) 
- (modified) llvm/test/Transforms/LoopVectorize/predicate-switch.ll (+196-4) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll (+55-5) 


``````````diff

diff --git a/clang/test/Frontend/optimization-remark-analysis.c b/clang/test/Frontend/optimization-remark-analysis.c
index e43984942a6ef..9d8917265a320 100644
--- a/clang/test/Frontend/optimization-remark-analysis.c
+++ b/clang/test/Frontend/optimization-remark-analysis.c
@@ -1,7 +1,7 @@
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s
 
-// RPASS: {{.*}}:12:5: remark: loop not vectorized: loop contains a switch statement
+// RPASS-NOT: {{.*}}:12:5: remark: loop not vectorized
 // CHECK-NOT: remark: loop not vectorized: loop contains a switch statement
 
 double foo(int N, int *Array) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f54eebb2874ab..7f84455150093 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1348,11 +1348,11 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   // Collect the blocks that need predication.
   for (BasicBlock *BB : TheLoop->blocks()) {
     // We don't support switch statements inside loops.
-    if (!isa<BranchInst>(BB->getTerminator())) {
-      reportVectorizationFailure("Loop contains a switch statement",
-                                 "loop contains a switch statement",
-                                 "LoopContainsSwitch", ORE, TheLoop,
-                                 BB->getTerminator());
+    if (!isa<BranchInst, SwitchInst>(BB->getTerminator())) {
+      reportVectorizationFailure("Loop contains an unsupported termaintor",
+                                 "loop contains an unsupported terminator",
+                                 "LoopContainsUnsupportedTerminator", ORE,
+                                 TheLoop, BB->getTerminator());
       return false;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d28b8fabe42e..2530762e3e424 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7763,6 +7763,41 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
 
   VPValue *SrcMask = getBlockInMask(Src);
 
+  if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
+    // Create mask where the terminator in Src is a switch. We need to handle 2
+    // separate cases:
+    // 1. Dst is not the default desintation. Dst is reached if any of the cases
+    // with destination == Dst are taken. Join the conditions for each case
+    // where destination == Dst using a logical OR.
+    // 2. Dst is the default destination. Dst is reached if none of the cases
+    // with destination != Dst are taken. Join the conditions for each case
+    // where the destination is != Dst using a logical OR and negate it.
+    VPValue *Mask = nullptr;
+    VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition(), Plan);
+    bool IsDefault = SI->getDefaultDest() == Dst;
+    for (auto &C : SI->cases()) {
+      if (IsDefault) {
+        if (C.getCaseSuccessor() == Dst)
+          continue;
+      } else if (C.getCaseSuccessor() != Dst)
+        continue;
+
+      VPValue *Eq = EdgeMaskCache.lookup({Src, C.getCaseSuccessor()});
+      if (!Eq) {
+        VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue(), Plan);
+        Eq = Builder.createICmp(CmpInst::ICMP_EQ, Cond, V);
+      }
+      if (Mask)
+        Mask = Builder.createOr(Mask, Eq);
+      else
+        Mask = Eq;
+    }
+    if (IsDefault)
+      Mask = Builder.createNot(Mask);
+    assert(Mask && "mask must be created");
+    return EdgeMaskCache[Edge] = Mask;
+  }
+
   // The terminator has to be a branch inst!
   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
   assert(BI && "Unexpected terminator found");
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index b8ce3c40920a3..ff73a149c8e39 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -6,9 +6,43 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
 ; IC1-LABEL: define void @switch_default_to_latch_common_dest(
 ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IC1-NEXT:  [[ENTRY:.*]]:
+; IC1-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; IC1-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; IC1-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; IC1-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; IC1-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; IC1-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1:       [[VECTOR_PH]]:
+; IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; IC1-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; IC1-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC1:       [[VECTOR_BODY]]:
+; IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; IC1-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IC1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; IC1-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
+; IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; IC1-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12>
+; IC1-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13>
+; IC1-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]]
+; IC1-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP9]]
+; IC1-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP10]])
+; IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IC1:       [[MIDDLE_BLOCK]]:
+; IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1:       [[SCALAR_PH]]:
+; IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
 ; IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; IC1:       [[LOOP_HEADER]]:
-; IC1-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC1-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; IC1-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; IC1-NEXT:    switch i64 [[L]], label %[[LOOP_LATCH]] [
 ; IC1-NEXT:      i64 -12, label %[[IF_THEN:.*]]
@@ -20,16 +54,59 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
 ; IC1:       [[LOOP_LATCH]]:
 ; IC1-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; IC1-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; IC1-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IC1:       [[EXIT]]:
 ; IC1-NEXT:    ret void
 ;
 ; IC2-LABEL: define void @switch_default_to_latch_common_dest(
 ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; IC2-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; IC2-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; IC2-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; IC2-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; IC2-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; IC2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
+; IC2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC2:       [[VECTOR_PH]]:
+; IC2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8
+; IC2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; IC2-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; IC2-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC2:       [[VECTOR_BODY]]:
+; IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; IC2-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IC2-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32
+; IC2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; IC2-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
+; IC2-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
+; IC2-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
+; IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; IC2-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
+; IC2-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12>
+; IC2-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12>
+; IC2-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13>
+; IC2-NEXT:    [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13>
+; IC2-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]]
+; IC2-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]]
+; IC2-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP13]]
+; IC2-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP14]]
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]])
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]])
+; IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; IC2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC2-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IC2:       [[MIDDLE_BLOCK]]:
+; IC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; IC2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC2:       [[SCALAR_PH]]:
+; IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
 ; IC2-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; IC2:       [[LOOP_HEADER]]:
-; IC2-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC2-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; IC2-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; IC2-NEXT:    switch i64 [[L]], label %[[LOOP_LATCH]] [
 ; IC2-NEXT:      i64 -12, label %[[IF_THEN:.*]]
@@ -41,7 +118,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
 ; IC2:       [[LOOP_LATCH]]:
 ; IC2-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; IC2-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; IC2-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IC2:       [[EXIT]]:
 ; IC2-NEXT:    ret void
 ;
@@ -73,9 +150,48 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
 ; IC1-LABEL: define void @switch_all_dests_distinct(
 ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
 ; IC1-NEXT:  [[ENTRY:.*]]:
+; IC1-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; IC1-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; IC1-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; IC1-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; IC1-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; IC1-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1:       [[VECTOR_PH]]:
+; IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; IC1-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; IC1-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC1:       [[VECTOR_BODY]]:
+; IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; IC1-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IC1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; IC1-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
+; IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; IC1-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; IC1-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]])
+; IC1-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13>
+; IC1-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP8]])
+; IC1-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12>
+; IC1-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP9]])
+; IC1-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]]
+; IC1-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP7]]
+; IC1-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
+; IC1-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP12]])
+; IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IC1:       [[MIDDLE_BLOCK]]:
+; IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; IC1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1:       [[SCALAR_PH]]:
+; IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
 ; IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; IC1:       [[LOOP_HEADER]]:
-; IC1-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; IC1-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; IC1-NEXT:    [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1
 ; IC1-NEXT:    switch i64 [[L]], label %[[DEFAULT:.*]] [
 ; IC1-NEXT:      i64 -12, label %[[IF_THEN_1:.*]]
@@ -97,16 +213,69 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
 ; IC1:       [[LOOP_LATCH]]:
 ; IC1-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1
 ; IC1-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
-; IC1-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; IC1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
 ; IC1:       [[EXIT]]:
 ; IC1-NEXT:    ret void
 ;
 ; IC2-LABEL: define void @switch_all_dests_distinct(
 ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] {
 ; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; IC2-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; IC2-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; IC2-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; IC2-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; IC2-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; IC2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
+; IC2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC2:       [[VECTOR_PH]]:
+; IC2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8
+; IC2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; IC2-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
+; IC2-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC2:       [[VECTOR_BODY]]:
+; IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
+; IC2-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IC2-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32
+; IC2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; IC2-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
+; IC2-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
+; IC2-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
+; IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; IC2-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
+; IC2-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; IC2-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]])
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]])
+; IC2-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13>
+; IC2-NEXT:    [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13>
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]])
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]])
+; IC2-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12>
+; IC2-NEXT:    [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12>
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP13]])
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP14]])
+; IC2-NEXT:    [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]]
+; IC2-NEXT:    [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; IC2-NEXT:    [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP9]]
+; IC2-NEXT:    [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP10]]
+; IC2-NEXT:    [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
+; IC2-NEXT:    [[TMP20:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP19]])
+; IC2-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]])
+; IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; IC2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC2-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IC2:       [[MIDDLE_BLOCK]]:
+; IC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; IC2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC2:       [[SCALAR_PH]]:
+; IC2-NEXT:    [[BC_RES...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/99808