[llvm] 08d153d - [ValueTracking] computeKnownBits - attempt to use a branch condition feeding a phi to improve known bits range (PR38280)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 16 08:54:52 PDT 2022


Author: Simon Pilgrim
Date: 2022-08-16T16:54:44+01:00
New Revision: 08d153d806b500a3d819bbc57d8c4ddcb568f26e

URL: https://github.com/llvm/llvm-project/commit/08d153d806b500a3d819bbc57d8c4ddcb568f26e
DIFF: https://github.com/llvm/llvm-project/commit/08d153d806b500a3d819bbc57d8c4ddcb568f26e.diff

LOG: [ValueTracking] computeKnownBits - attempt to use a branch condition feeding a phi to improve known bits range (PR38280)

If computeKnownBits encounters a phi node and fails to determine any known bits for an incoming value through direct analysis, check whether that incoming value is part of a branch condition feeding the phi.

Handle cases where icmp(IncomingValue PRED Constant) drives a conditional branch feeding that phi node. At the moment this only handles the EQ/ULT/ULE predicates, as they are the most straightforward to handle and the most likely for branch-loop 'max upper bound' cases; we can extend this if/when necessary. When the phi is reached through the branch's false successor the predicate is inverted first, so NE/UGT/UGE conditions are covered on that edge too.

I investigated a more general icmp(LHS PRED RHS) KnownBits system, but the hard limits we put on value tracking depth through phi nodes meant that we were mainly catching constants anyhow.

Fixes the pointless vectorization in PR38280 / Issue #37628 (excessive unrolling still needs handling though)

Differential Revision: https://reviews.llvm.org/D131838

Added: 
    

Modified: 
    llvm/lib/Analysis/ValueTracking.cpp
    llvm/test/Transforms/InstCombine/known-phi-br.ll
    llvm/test/Transforms/InstCombine/remove-loop-phi-fastmul.ll
    llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 65975d227479..c4f678171d8a 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1575,9 +1575,45 @@ static void computeKnownBitsFromOperator(const Operator *I,
         RecQ.CxtI = P->getIncomingBlock(u)->getTerminator();
 
         Known2 = KnownBits(BitWidth);
+
         // Recurse, but cap the recursion to one level, because we don't
         // want to waste time spinning around in loops.
         computeKnownBits(IncValue, Known2, MaxAnalysisRecursionDepth - 1, RecQ);
+
+        // If this failed, see if we can use a conditional branch into the phi
+        // to help us determine the range of the value.
+        if (Known2.isUnknown()) {
+          ICmpInst::Predicate Pred;
+          const APInt *RHSC;
+          BasicBlock *TrueSucc, *FalseSucc;
+          // TODO: Use RHS Value and compute range from its known bits.
+          if (match(RecQ.CxtI,
+                    m_Br(m_c_ICmp(Pred, m_Specific(IncValue), m_APInt(RHSC)),
+                         m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc)))) {
+            // Check for cases of duplicate successors.
+            if ((TrueSucc == P->getParent()) != (FalseSucc == P->getParent())) {
+              // If we're using the false successor, invert the predicate.
+              if (FalseSucc == P->getParent())
+                Pred = CmpInst::getInversePredicate(Pred);
+
+              switch (Pred) {
+              case CmpInst::Predicate::ICMP_EQ:
+                Known2 = KnownBits::makeConstant(*RHSC);
+                break;
+              case CmpInst::Predicate::ICMP_ULE:
+                Known2.Zero.setHighBits(RHSC->countLeadingZeros());
+                break;
+              case CmpInst::Predicate::ICMP_ULT:
+                Known2.Zero.setHighBits((*RHSC - 1).countLeadingZeros());
+                break;
+              default:
+                // TODO - add additional integer predicate handling.
+                break;
+              }
+            }
+          }
+        }
+
         Known = KnownBits::commonBits(Known, Known2);
         // If all bits have been ruled out, there's no need to check
         // more operands.

diff  --git a/llvm/test/Transforms/InstCombine/known-phi-br.ll b/llvm/test/Transforms/InstCombine/known-phi-br.ll
index 82b2ffbe2720..64d3344eb206 100644
--- a/llvm/test/Transforms/InstCombine/known-phi-br.ll
+++ b/llvm/test/Transforms/InstCombine/known-phi-br.ll
@@ -6,7 +6,7 @@
 ; the known bits of a phi edge based off a conditional branch feeding the phi.
 ;
 
-; TODO: %x either eq 7 or is set to 7
+; %x either eq 7 or is set to 7
 define i64 @limit_i64_eq_7(i64 %x) {
 ; CHECK-LABEL: @limit_i64_eq_7(
 ; CHECK-NEXT:  entry:
@@ -15,8 +15,7 @@ define i64 @limit_i64_eq_7(i64 %x) {
 ; CHECK:       body:
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ 7, [[BODY]] ]
-; CHECK-NEXT:    ret i64 [[RES]]
+; CHECK-NEXT:    ret i64 7
 ;
 entry:
   %cmp = icmp eq i64 %x, 7
@@ -28,20 +27,21 @@ end:
   ret i64 %res
 }
 
-; TODO: %x either eq 255 or is set to 255
+; %x either eq 255 or is set to 255
 define i64 @limit_i64_ne_255(i64 %x) {
 ; CHECK-LABEL: @limit_i64_ne_255(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[X:%.*]], 255
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[END:%.*]], label [[BODY:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i64 [[X:%.*]], 255
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]]
 ; CHECK:       body:
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ 255, [[BODY]] ]
-; CHECK-NEXT:    ret i64 [[RES]]
+; CHECK-NEXT:    ret i64 255
 ;
 entry:
   %cmp = icmp ne i64 %x, 255
+  call void @use(i1 %cmp)
   br i1 %cmp, label %body, label %end
 body:
   br label %end
@@ -49,8 +49,9 @@ end:
   %res = phi i64 [ %x, %entry ], [ 255, %body ]
   ret i64 %res
 }
+declare void @use(i1)
 
-; TODO: %x either ule 15 or is masked with 15
+; %x either ule 15 or is masked with 15
 define i64 @limit_i64_ule_15(i64 %x) {
 ; CHECK-LABEL: @limit_i64_ule_15(
 ; CHECK-NEXT:  entry:
@@ -61,8 +62,7 @@ define i64 @limit_i64_ule_15(i64 %x) {
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = and i64 [[X_MASK]], 15
-; CHECK-NEXT:    ret i64 [[RES]]
+; CHECK-NEXT:    ret i64 [[X_MASK]]
 ;
 entry:
   %cmp = icmp ule i64 %x, 15
@@ -76,7 +76,7 @@ end:
   ret i64 %res
 }
 
-; TODO: %x either uge 8 or is masked with 7
+; %x either uge 8 or is masked with 7
 define i64 @limit_i64_uge_8(i64 %x) {
 ; CHECK-LABEL: @limit_i64_uge_8(
 ; CHECK-NEXT:  entry:
@@ -87,8 +87,7 @@ define i64 @limit_i64_uge_8(i64 %x) {
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = and i64 [[X_MASK]], 7
-; CHECK-NEXT:    ret i64 [[RES]]
+; CHECK-NEXT:    ret i64 [[X_MASK]]
 ;
 entry:
   %cmp = icmp uge i64 %x, 8
@@ -102,7 +101,7 @@ end:
   ret i64 %res
 }
 
-; TODO: %x either ult 8 or is masked with 7
+; %x either ult 8 or is masked with 7
 define i64 @limit_i64_ult_8(i64 %x) {
 ; CHECK-LABEL: @limit_i64_ult_8(
 ; CHECK-NEXT:  entry:
@@ -113,8 +112,7 @@ define i64 @limit_i64_ult_8(i64 %x) {
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = and i64 [[X_MASK]], 7
-; CHECK-NEXT:    ret i64 [[RES]]
+; CHECK-NEXT:    ret i64 [[X_MASK]]
 ;
 entry:
   %cmp = icmp ult i64 %x, 8
@@ -128,7 +126,7 @@ end:
   ret i64 %res
 }
 
-; TODO: %x either ugt 7 or is masked with 7
+; %x either ugt 7 or is masked with 7
 define i64 @limit_i64_ugt_7(i64 %x) {
 ; CHECK-LABEL: @limit_i64_ugt_7(
 ; CHECK-NEXT:  entry:
@@ -139,8 +137,7 @@ define i64 @limit_i64_ugt_7(i64 %x) {
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = and i64 [[X_MASK]], 7
-; CHECK-NEXT:    ret i64 [[RES]]
+; CHECK-NEXT:    ret i64 [[X_MASK]]
 ;
 entry:
   %cmp = icmp ugt i64 %x, 7

diff  --git a/llvm/test/Transforms/InstCombine/remove-loop-phi-fastmul.ll b/llvm/test/Transforms/InstCombine/remove-loop-phi-fastmul.ll
index c232e1743d97..7ced74734548 100644
--- a/llvm/test/Transforms/InstCombine/remove-loop-phi-fastmul.ll
+++ b/llvm/test/Transforms/InstCombine/remove-loop-phi-fastmul.ll
@@ -216,8 +216,8 @@ define double @test_multiple_phi_operands(ptr %arr_d, i1 %entry_cond) {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x double], ptr [[ARR_D:%.*]], i64 0, i64 [[I_02]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[MUL]] = fmul fast double [[F_PROD_01]], [[TMP0]]
-; CHECK-NEXT:    [[INC]] = add i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 1000
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double [[MUL]]
@@ -255,8 +255,8 @@ define double @test_multiple_phi_operands_with_non_zero(ptr %arr_d, i1 %entry_co
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x double], ptr [[ARR_D:%.*]], i64 0, i64 [[I_02]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[MUL]] = fmul fast double [[F_PROD_01]], [[TMP0]]
-; CHECK-NEXT:    [[INC]] = add i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 1000
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double [[MUL]]

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
index 2001a7528f55..70b002f766b7 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
@@ -1,258 +1,49 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -O2 -S -mtriple=x86_64-- -mattr=+sse2 < %s | FileCheck %s --check-prefixes=SSE
-; RUN: opt -O2 -S -mtriple=x86_64-- -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX
+; RUN: opt -O2 -S -mtriple=x86_64-- -mattr=+sse2 < %s | FileCheck %s
+; RUN: opt -O2 -S -mtriple=x86_64-- -mattr=+avx2 < %s | FileCheck %s
 
 ; PR38280 / Issue #37628
 define void @apply_delta(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %neg_offs, i64 noundef %count) {
-; SSE-LABEL: @apply_delta(
-; SSE-NEXT:  entry:
-; SSE-NEXT:    [[CMP21:%.*]] = icmp ugt i64 [[COUNT:%.*]], 7
-; SSE-NEXT:    br i1 [[CMP21]], label [[WHILE_BODY:%.*]], label [[WHILE_COND3_PREHEADER:%.*]]
-; SSE:       while.cond3.preheader:
-; SSE-NEXT:    [[COUNT_ADDR_0_LCSSA:%.*]] = phi i64 [ [[COUNT]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[WHILE_BODY]] ]
-; SSE-NEXT:    [[SRC_ADDR_0_LCSSA:%.*]] = phi ptr [ [[SRC:%.*]], [[ENTRY]] ], [ [[ADD_PTR2:%.*]], [[WHILE_BODY]] ]
-; SSE-NEXT:    [[DST_ADDR_0_LCSSA:%.*]] = phi ptr [ [[DST:%.*]], [[ENTRY]] ], [ [[ADD_PTR1:%.*]], [[WHILE_BODY]] ]
-; SSE-NEXT:    [[DST_ADDR_0_LCSSA1:%.*]] = ptrtoint ptr [[DST_ADDR_0_LCSSA]] to i64
-; SSE-NEXT:    [[SRC_ADDR_0_LCSSA2:%.*]] = ptrtoint ptr [[SRC_ADDR_0_LCSSA]] to i64
-; SSE-NEXT:    [[TOBOOL_NOT27:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], 0
-; SSE-NEXT:    br i1 [[TOBOOL_NOT27]], label [[WHILE_END9:%.*]], label [[ITER_CHECK:%.*]]
-; SSE:       iter.check:
-; SSE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[COUNT_ADDR_0_LCSSA]], 8
-; SSE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY4_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; SSE:       vector.memcheck:
-; SSE-NEXT:    [[TMP0:%.*]] = sub i64 [[DST_ADDR_0_LCSSA1]], [[SRC_ADDR_0_LCSSA2]]
-; SSE-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
-; SSE-NEXT:    [[TMP1:%.*]] = sub i64 [[DST_ADDR_0_LCSSA1]], [[SRC_ADDR_0_LCSSA2]]
-; SSE-NEXT:    [[DIFF_CHECK3:%.*]] = icmp ult i64 [[TMP1]], 32
-; SSE-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK3]]
-; SSE-NEXT:    [[TMP2:%.*]] = add i64 [[NEG_OFFS:%.*]], 31
-; SSE-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 32
-; SSE-NEXT:    [[CONFLICT_RDX5:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK4]]
-; SSE-NEXT:    br i1 [[CONFLICT_RDX5]], label [[WHILE_BODY4_PREHEADER]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; SSE:       vector.main.loop.iter.check:
-; SSE-NEXT:    [[MIN_ITERS_CHECK6:%.*]] = icmp ult i64 [[COUNT_ADDR_0_LCSSA]], 32
-; SSE-NEXT:    br i1 [[MIN_ITERS_CHECK6]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SSE:       vector.ph:
-; SSE-NEXT:    [[N_VEC:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], -32
-; SSE-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SSE:       vector.body:
-; SSE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SSE-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[INDEX]]
-; SSE-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[INDEX]]
-; SSE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP8]], align 1
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 16
-; SSE-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 [[NEG_OFFS]]
-; SSE-NEXT:    [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16
-; SSE-NEXT:    [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; SSE-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[WIDE_LOAD11]], [[WIDE_LOAD]]
-; SSE-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD12]], [[WIDE_LOAD10]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], ptr [[NEXT_GEP]], align 1
-; SSE-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
-; SSE-NEXT:    store <16 x i8> [[TMP7]], ptr [[TMP8]], align 1
-; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; SSE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; SSE:       middle.block:
-; SSE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], [[N_VEC]]
-; SSE-NEXT:    br i1 [[CMP_N]], label [[WHILE_END9]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; SSE:       vec.epilog.iter.check:
-; SSE-NEXT:    [[IND_END20:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], 31
-; SSE-NEXT:    [[IND_END17:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[N_VEC]]
-; SSE-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[N_VEC]]
-; SSE-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], 24
-; SSE-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; SSE-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[WHILE_BODY4_PREHEADER]], label [[VEC_EPILOG_PH]]
-; SSE:       vec.epilog.ph:
-; SSE-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; SSE-NEXT:    [[N_VEC14:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], -8
-; SSE-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[N_VEC14]]
-; SSE-NEXT:    [[IND_END16:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[N_VEC14]]
-; SSE-NEXT:    [[IND_END19:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], 7
-; SSE-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; SSE:       vec.epilog.vector.body:
-; SSE-NEXT:    [[INDEX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; SSE-NEXT:    [[NEXT_GEP24:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[INDEX23]]
-; SSE-NEXT:    [[NEXT_GEP25:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[INDEX23]]
-; SSE-NEXT:    [[WIDE_LOAD26:%.*]] = load <8 x i8>, ptr [[NEXT_GEP25]], align 1
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP24]], i64 [[NEG_OFFS]]
-; SSE-NEXT:    [[WIDE_LOAD27:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
-; SSE-NEXT:    [[TMP11:%.*]] = add <8 x i8> [[WIDE_LOAD27]], [[WIDE_LOAD26]]
-; SSE-NEXT:    store <8 x i8> [[TMP11]], ptr [[NEXT_GEP24]], align 1
-; SSE-NEXT:    [[INDEX_NEXT28]] = add nuw i64 [[INDEX23]], 8
-; SSE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC14]]
-; SSE-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; SSE:       vec.epilog.middle.block:
-; SSE-NEXT:    [[CMP_N22:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], [[N_VEC14]]
-; SSE-NEXT:    br i1 [[CMP_N22]], label [[WHILE_END9]], label [[WHILE_BODY4_PREHEADER]]
-; SSE:       while.body4.preheader:
-; SSE-NEXT:    [[DST_ADDR_130_PH:%.*]] = phi ptr [ [[DST_ADDR_0_LCSSA]], [[ITER_CHECK]] ], [ [[DST_ADDR_0_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; SSE-NEXT:    [[SRC_ADDR_129_PH:%.*]] = phi ptr [ [[SRC_ADDR_0_LCSSA]], [[ITER_CHECK]] ], [ [[SRC_ADDR_0_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[IND_END17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; SSE-NEXT:    [[COUNT_ADDR_128_PH:%.*]] = phi i64 [ [[COUNT_ADDR_0_LCSSA]], [[ITER_CHECK]] ], [ [[COUNT_ADDR_0_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[IND_END20]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; SSE-NEXT:    br label [[WHILE_BODY4:%.*]]
-; SSE:       while.body:
-; SSE-NEXT:    [[DST_ADDR_024:%.*]] = phi ptr [ [[ADD_PTR1]], [[WHILE_BODY]] ], [ [[DST]], [[ENTRY]] ]
-; SSE-NEXT:    [[SRC_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR2]], [[WHILE_BODY]] ], [ [[SRC]], [[ENTRY]] ]
-; SSE-NEXT:    [[COUNT_ADDR_022:%.*]] = phi i64 [ [[SUB]], [[WHILE_BODY]] ], [ [[COUNT]], [[ENTRY]] ]
-; SSE-NEXT:    [[TMP13:%.*]] = load <8 x i8>, ptr [[SRC_ADDR_023]], align 1
-; SSE-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_024]], i64 [[NEG_OFFS]]
-; SSE-NEXT:    [[TMP14:%.*]] = load <8 x i8>, ptr [[ADD_PTR]], align 1
-; SSE-NEXT:    [[ADD:%.*]] = add <8 x i8> [[TMP14]], [[TMP13]]
-; SSE-NEXT:    store <8 x i8> [[ADD]], ptr [[DST_ADDR_024]], align 1
-; SSE-NEXT:    [[ADD_PTR1]] = getelementptr inbounds i8, ptr [[DST_ADDR_024]], i64 8
-; SSE-NEXT:    [[ADD_PTR2]] = getelementptr inbounds i8, ptr [[SRC_ADDR_023]], i64 8
-; SSE-NEXT:    [[SUB]] = add i64 [[COUNT_ADDR_022]], -8
-; SSE-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[SUB]], 7
-; SSE-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_COND3_PREHEADER]]
-; SSE:       while.body4:
-; SSE-NEXT:    [[DST_ADDR_130:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY4]] ], [ [[DST_ADDR_130_PH]], [[WHILE_BODY4_PREHEADER]] ]
-; SSE-NEXT:    [[SRC_ADDR_129:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY4]] ], [ [[SRC_ADDR_129_PH]], [[WHILE_BODY4_PREHEADER]] ]
-; SSE-NEXT:    [[COUNT_ADDR_128:%.*]] = phi i64 [ [[DEC:%.*]], [[WHILE_BODY4]] ], [ [[COUNT_ADDR_128_PH]], [[WHILE_BODY4_PREHEADER]] ]
-; SSE-NEXT:    [[DEC]] = add i64 [[COUNT_ADDR_128]], -1
-; SSE-NEXT:    [[TMP15:%.*]] = load i8, ptr [[SRC_ADDR_129]], align 1
-; SSE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 [[NEG_OFFS]]
-; SSE-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; SSE-NEXT:    [[ADD6:%.*]] = add i8 [[TMP16]], [[TMP15]]
-; SSE-NEXT:    store i8 [[ADD6]], ptr [[DST_ADDR_130]], align 1
-; SSE-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1
-; SSE-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1
-; SSE-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0
-; SSE-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
-; SSE:       while.end9:
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @apply_delta(
-; AVX-NEXT:  entry:
-; AVX-NEXT:    [[CMP21:%.*]] = icmp ugt i64 [[COUNT:%.*]], 7
-; AVX-NEXT:    br i1 [[CMP21]], label [[WHILE_BODY:%.*]], label [[WHILE_COND3_PREHEADER:%.*]]
-; AVX:       while.cond3.preheader:
-; AVX-NEXT:    [[COUNT_ADDR_0_LCSSA:%.*]] = phi i64 [ [[COUNT]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[WHILE_BODY]] ]
-; AVX-NEXT:    [[SRC_ADDR_0_LCSSA:%.*]] = phi ptr [ [[SRC:%.*]], [[ENTRY]] ], [ [[ADD_PTR2:%.*]], [[WHILE_BODY]] ]
-; AVX-NEXT:    [[DST_ADDR_0_LCSSA:%.*]] = phi ptr [ [[DST:%.*]], [[ENTRY]] ], [ [[ADD_PTR1:%.*]], [[WHILE_BODY]] ]
-; AVX-NEXT:    [[DST_ADDR_0_LCSSA1:%.*]] = ptrtoint ptr [[DST_ADDR_0_LCSSA]] to i64
-; AVX-NEXT:    [[SRC_ADDR_0_LCSSA2:%.*]] = ptrtoint ptr [[SRC_ADDR_0_LCSSA]] to i64
-; AVX-NEXT:    [[TOBOOL_NOT27:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], 0
-; AVX-NEXT:    br i1 [[TOBOOL_NOT27]], label [[WHILE_END9:%.*]], label [[ITER_CHECK:%.*]]
-; AVX:       iter.check:
-; AVX-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[COUNT_ADDR_0_LCSSA]], 16
-; AVX-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY4_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; AVX:       vector.memcheck:
-; AVX-NEXT:    [[TMP0:%.*]] = sub i64 [[DST_ADDR_0_LCSSA1]], [[SRC_ADDR_0_LCSSA2]]
-; AVX-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
-; AVX-NEXT:    [[TMP1:%.*]] = sub i64 [[DST_ADDR_0_LCSSA1]], [[SRC_ADDR_0_LCSSA2]]
-; AVX-NEXT:    [[DIFF_CHECK3:%.*]] = icmp ult i64 [[TMP1]], 128
-; AVX-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK3]]
-; AVX-NEXT:    [[TMP2:%.*]] = add i64 [[NEG_OFFS:%.*]], 127
-; AVX-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 128
-; AVX-NEXT:    [[CONFLICT_RDX5:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK4]]
-; AVX-NEXT:    br i1 [[CONFLICT_RDX5]], label [[WHILE_BODY4_PREHEADER]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX:       vector.main.loop.iter.check:
-; AVX-NEXT:    [[MIN_ITERS_CHECK6:%.*]] = icmp ult i64 [[COUNT_ADDR_0_LCSSA]], 128
-; AVX-NEXT:    br i1 [[MIN_ITERS_CHECK6]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX:       vector.ph:
-; AVX-NEXT:    [[N_VEC:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], -128
-; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
-; AVX:       vector.body:
-; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[INDEX]]
-; AVX-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[INDEX]]
-; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP10]], align 1
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 32
-; AVX-NEXT:    [[WIDE_LOAD14:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 64
-; AVX-NEXT:    [[WIDE_LOAD15:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 96
-; AVX-NEXT:    [[WIDE_LOAD16:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 [[NEG_OFFS]]
-; AVX-NEXT:    [[WIDE_LOAD17:%.*]] = load <32 x i8>, ptr [[TMP6]], align 1
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 32
-; AVX-NEXT:    [[WIDE_LOAD18:%.*]] = load <32 x i8>, ptr [[TMP7]], align 1
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 64
-; AVX-NEXT:    [[WIDE_LOAD19:%.*]] = load <32 x i8>, ptr [[TMP8]], align 1
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 96
-; AVX-NEXT:    [[WIDE_LOAD20:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1
-; AVX-NEXT:    [[TMP10:%.*]] = add <32 x i8> [[WIDE_LOAD17]], [[WIDE_LOAD]]
-; AVX-NEXT:    [[TMP11:%.*]] = add <32 x i8> [[WIDE_LOAD18]], [[WIDE_LOAD14]]
-; AVX-NEXT:    [[TMP12:%.*]] = add <32 x i8> [[WIDE_LOAD19]], [[WIDE_LOAD15]]
-; AVX-NEXT:    [[TMP13:%.*]] = add <32 x i8> [[WIDE_LOAD20]], [[WIDE_LOAD16]]
-; AVX-NEXT:    store <32 x i8> [[TMP10]], ptr [[NEXT_GEP]], align 1
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
-; AVX-NEXT:    store <32 x i8> [[TMP11]], ptr [[TMP14]], align 1
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64
-; AVX-NEXT:    store <32 x i8> [[TMP12]], ptr [[TMP15]], align 1
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96
-; AVX-NEXT:    store <32 x i8> [[TMP13]], ptr [[TMP16]], align 1
-; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; AVX-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; AVX:       middle.block:
-; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], [[N_VEC]]
-; AVX-NEXT:    br i1 [[CMP_N]], label [[WHILE_END9]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX:       vec.epilog.iter.check:
-; AVX-NEXT:    [[IND_END28:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], 127
-; AVX-NEXT:    [[IND_END25:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[N_VEC]]
-; AVX-NEXT:    [[IND_END23:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[N_VEC]]
-; AVX-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], 112
-; AVX-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; AVX-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[WHILE_BODY4_PREHEADER]], label [[VEC_EPILOG_PH]]
-; AVX:       vec.epilog.ph:
-; AVX-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX-NEXT:    [[N_VEC22:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], -16
-; AVX-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[N_VEC22]]
-; AVX-NEXT:    [[IND_END24:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[N_VEC22]]
-; AVX-NEXT:    [[IND_END27:%.*]] = and i64 [[COUNT_ADDR_0_LCSSA]], 15
-; AVX-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX:       vec.epilog.vector.body:
-; AVX-NEXT:    [[INDEX31:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT36:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX-NEXT:    [[NEXT_GEP32:%.*]] = getelementptr i8, ptr [[DST_ADDR_0_LCSSA]], i64 [[INDEX31]]
-; AVX-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[SRC_ADDR_0_LCSSA]], i64 [[INDEX31]]
-; AVX-NEXT:    [[WIDE_LOAD34:%.*]] = load <16 x i8>, ptr [[NEXT_GEP33]], align 1
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP32]], i64 [[NEG_OFFS]]
-; AVX-NEXT:    [[WIDE_LOAD35:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
-; AVX-NEXT:    [[TMP19:%.*]] = add <16 x i8> [[WIDE_LOAD35]], [[WIDE_LOAD34]]
-; AVX-NEXT:    store <16 x i8> [[TMP19]], ptr [[NEXT_GEP32]], align 1
-; AVX-NEXT:    [[INDEX_NEXT36]] = add nuw i64 [[INDEX31]], 16
-; AVX-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT36]], [[N_VEC22]]
-; AVX-NEXT:    br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; AVX:       vec.epilog.middle.block:
-; AVX-NEXT:    [[CMP_N30:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], [[N_VEC22]]
-; AVX-NEXT:    br i1 [[CMP_N30]], label [[WHILE_END9]], label [[WHILE_BODY4_PREHEADER]]
-; AVX:       while.body4.preheader:
-; AVX-NEXT:    [[DST_ADDR_130_PH:%.*]] = phi ptr [ [[DST_ADDR_0_LCSSA]], [[ITER_CHECK]] ], [ [[DST_ADDR_0_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[IND_END23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX-NEXT:    [[SRC_ADDR_129_PH:%.*]] = phi ptr [ [[SRC_ADDR_0_LCSSA]], [[ITER_CHECK]] ], [ [[SRC_ADDR_0_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[IND_END25]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX-NEXT:    [[COUNT_ADDR_128_PH:%.*]] = phi i64 [ [[COUNT_ADDR_0_LCSSA]], [[ITER_CHECK]] ], [ [[COUNT_ADDR_0_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[IND_END28]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX-NEXT:    br label [[WHILE_BODY4:%.*]]
-; AVX:       while.body:
-; AVX-NEXT:    [[DST_ADDR_024:%.*]] = phi ptr [ [[ADD_PTR1]], [[WHILE_BODY]] ], [ [[DST]], [[ENTRY]] ]
-; AVX-NEXT:    [[SRC_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR2]], [[WHILE_BODY]] ], [ [[SRC]], [[ENTRY]] ]
-; AVX-NEXT:    [[COUNT_ADDR_022:%.*]] = phi i64 [ [[SUB]], [[WHILE_BODY]] ], [ [[COUNT]], [[ENTRY]] ]
-; AVX-NEXT:    [[TMP21:%.*]] = load <8 x i8>, ptr [[SRC_ADDR_023]], align 1
-; AVX-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_024]], i64 [[NEG_OFFS]]
-; AVX-NEXT:    [[TMP22:%.*]] = load <8 x i8>, ptr [[ADD_PTR]], align 1
-; AVX-NEXT:    [[ADD:%.*]] = add <8 x i8> [[TMP22]], [[TMP21]]
-; AVX-NEXT:    store <8 x i8> [[ADD]], ptr [[DST_ADDR_024]], align 1
-; AVX-NEXT:    [[ADD_PTR1]] = getelementptr inbounds i8, ptr [[DST_ADDR_024]], i64 8
-; AVX-NEXT:    [[ADD_PTR2]] = getelementptr inbounds i8, ptr [[SRC_ADDR_023]], i64 8
-; AVX-NEXT:    [[SUB]] = add i64 [[COUNT_ADDR_022]], -8
-; AVX-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[SUB]], 7
-; AVX-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_COND3_PREHEADER]]
-; AVX:       while.body4:
-; AVX-NEXT:    [[DST_ADDR_130:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY4]] ], [ [[DST_ADDR_130_PH]], [[WHILE_BODY4_PREHEADER]] ]
-; AVX-NEXT:    [[SRC_ADDR_129:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY4]] ], [ [[SRC_ADDR_129_PH]], [[WHILE_BODY4_PREHEADER]] ]
-; AVX-NEXT:    [[COUNT_ADDR_128:%.*]] = phi i64 [ [[DEC:%.*]], [[WHILE_BODY4]] ], [ [[COUNT_ADDR_128_PH]], [[WHILE_BODY4_PREHEADER]] ]
-; AVX-NEXT:    [[DEC]] = add i64 [[COUNT_ADDR_128]], -1
-; AVX-NEXT:    [[TMP23:%.*]] = load i8, ptr [[SRC_ADDR_129]], align 1
-; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 [[NEG_OFFS]]
-; AVX-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; AVX-NEXT:    [[ADD6:%.*]] = add i8 [[TMP24]], [[TMP23]]
-; AVX-NEXT:    store i8 [[ADD6]], ptr [[DST_ADDR_130]], align 1
-; AVX-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1
-; AVX-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1
-; AVX-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0
-; AVX-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
-; AVX:       while.end9:
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @apply_delta(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp ugt i64 [[COUNT:%.*]], 7
+; CHECK-NEXT:    br i1 [[CMP21]], label [[WHILE_BODY:%.*]], label [[WHILE_COND3_PREHEADER:%.*]]
+; CHECK:       while.cond3.preheader:
+; CHECK-NEXT:    [[COUNT_ADDR_0_LCSSA:%.*]] = phi i64 [ [[COUNT]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[SRC_ADDR_0_LCSSA:%.*]] = phi ptr [ [[SRC:%.*]], [[ENTRY]] ], [ [[ADD_PTR2:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[DST_ADDR_0_LCSSA:%.*]] = phi ptr [ [[DST:%.*]], [[ENTRY]] ], [ [[ADD_PTR1:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[TOBOOL_NOT27:%.*]] = icmp eq i64 [[COUNT_ADDR_0_LCSSA]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT27]], label [[WHILE_END9:%.*]], label [[WHILE_BODY4:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[DST_ADDR_024:%.*]] = phi ptr [ [[ADD_PTR1]], [[WHILE_BODY]] ], [ [[DST]], [[ENTRY]] ]
+; CHECK-NEXT:    [[SRC_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR2]], [[WHILE_BODY]] ], [ [[SRC]], [[ENTRY]] ]
+; CHECK-NEXT:    [[COUNT_ADDR_022:%.*]] = phi i64 [ [[SUB]], [[WHILE_BODY]] ], [ [[COUNT]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[SRC_ADDR_023]], align 1
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_024]], i64 [[NEG_OFFS:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add <8 x i8> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store <8 x i8> [[ADD]], ptr [[DST_ADDR_024]], align 1
+; CHECK-NEXT:    [[ADD_PTR1]] = getelementptr inbounds i8, ptr [[DST_ADDR_024]], i64 8
+; CHECK-NEXT:    [[ADD_PTR2]] = getelementptr inbounds i8, ptr [[SRC_ADDR_023]], i64 8
+; CHECK-NEXT:    [[SUB]] = add i64 [[COUNT_ADDR_022]], -8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[SUB]], 7
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_COND3_PREHEADER]]
+; CHECK:       while.body4:
+; CHECK-NEXT:    [[DST_ADDR_130:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY4]] ], [ [[DST_ADDR_0_LCSSA]], [[WHILE_COND3_PREHEADER]] ]
+; CHECK-NEXT:    [[SRC_ADDR_129:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY4]] ], [ [[SRC_ADDR_0_LCSSA]], [[WHILE_COND3_PREHEADER]] ]
+; CHECK-NEXT:    [[COUNT_ADDR_128:%.*]] = phi i64 [ [[DEC:%.*]], [[WHILE_BODY4]] ], [ [[COUNT_ADDR_0_LCSSA]], [[WHILE_COND3_PREHEADER]] ]
+; CHECK-NEXT:    [[DEC]] = add i64 [[COUNT_ADDR_128]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[SRC_ADDR_129]], align 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 [[NEG_OFFS]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ADD6:%.*]] = add i8 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store i8 [[ADD6]], ptr [[DST_ADDR_130]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       while.end9:
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp21 = icmp ugt i64 %count, 7


        


More information about the llvm-commits mailing list