[llvm-branch-commits] [llvm] e5bf2e8 - [SLP] Use the width of value truncated just before storing

Anton Afanasyev via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Dec 9 05:42:57 PST 2020


Author: Anton Afanasyev
Date: 2020-12-09T16:38:45+03:00
New Revision: e5bf2e8989469ec328d910be26bd3ee0710326d9

URL: https://github.com/llvm/llvm-project/commit/e5bf2e8989469ec328d910be26bd3ee0710326d9
DIFF: https://github.com/llvm/llvm-project/commit/e5bf2e8989469ec328d910be26bd3ee0710326d9.diff

LOG: [SLP] Use the width of value truncated just before storing

For stores chain vectorization we choose the size of vector
elements to ensure we fit to minimum and maximum vector register
size for the number of elements given. This patch corrects vector
element size choosing the width of value truncated just before
storing instead of the width of value stored.

Fixes PR46983

Differential Revision: https://reviews.llvm.org/D92824

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3f6d8cc05f7..456485f45809 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5536,10 +5536,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 }
 
 unsigned BoUpSLP::getVectorElementSize(Value *V) {
-  // If V is a store, just return the width of the stored value without
-  // traversing the expression tree. This is the common case.
-  if (auto *Store = dyn_cast<StoreInst>(V))
-    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+  // If V is a store, just return the width of the stored value (or value
+  // truncated just before storing) without traversing the expression tree.
+  // This is the common case.
+  if (auto *Store = dyn_cast<StoreInst>(V)) {
+    if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+      return DL->getTypeSizeInBits(Trunc->getSrcTy());
+    else
+      return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+  }
 
   auto E = InstrElementSize.find(V);
   if (E != InstrElementSize.end())

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
index fa1183400cb0..23aa3536a88d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -22,132 +22,304 @@ entry:
 }
 
 define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly %d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
-; CHECK-NEXT:    [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
-; CHECK-NEXT:    [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
-; CHECK-NEXT:    [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
-; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
-; CHECK-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
-; CHECK-NEXT:    [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
-; CHECK-NEXT:    [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
-; CHECK-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
-; CHECK-NEXT:    [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
-; CHECK-NEXT:    [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
-; CHECK-NEXT:    [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
-; CHECK-NEXT:    [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
-; CHECK-NEXT:    [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
-; CHECK-NEXT:    [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
-; CHECK-NEXT:    [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
-; CHECK-NEXT:    [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1
-; CHECK-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1
-; CHECK-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1
-; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
-; CHECK-NEXT:    [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
-; CHECK-NEXT:    [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
-; CHECK-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
-; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
-; CHECK-NEXT:    [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
-; CHECK-NEXT:    [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
-; CHECK-NEXT:    [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
-; CHECK-NEXT:    [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @bar(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[W:%.*]], i32 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[W]], i32 1
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[W]], i32 2
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[W]], i32 3
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W]], i32 2
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[W]], i32 3
+; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[W]], i32 1
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[W]], i32 2
+; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[W]], i32 3
+; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[W]], i32 1
+; SSE-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[W]], i32 2
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[W]], i32 3
+; SSE-NEXT:    br label [[FOR_BODY:%.*]]
+; SSE:       for.body:
+; SSE-NEXT:    [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT:    [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT:    [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT:    [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT:    [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT:    [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
+; SSE-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
+; SSE-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
+; SSE-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
+; SSE-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
+; SSE-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
+; SSE-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
+; SSE-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
+; SSE-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
+; SSE-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
+; SSE-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
+; SSE-NEXT:    [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>*
+; SSE-NEXT:    [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1
+; SSE-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
+; SSE-NEXT:    [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>*
+; SSE-NEXT:    [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
+; SSE-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
+; SSE-NEXT:    [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>*
+; SSE-NEXT:    [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; SSE-NEXT:    [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>*
+; SSE-NEXT:    [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
+; SSE-NEXT:    [[TMP24:%.*]] = icmp ult <4 x i8> [[TMP17]], [[TMP19]]
+; SSE-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i8> [[TMP23]], <4 x i8> [[TMP21]]
+; SSE-NEXT:    [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32>
+; SSE-NEXT:    [[TMP27:%.*]] = mul <4 x i32> [[TMP26]], [[TMP3]]
+; SSE-NEXT:    [[TMP28:%.*]] = trunc <4 x i32> [[TMP27]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
+; SSE-NEXT:    [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>*
+; SSE-NEXT:    store <4 x i8> [[TMP28]], <4 x i8>* [[TMP29]], align 1
+; SSE-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
+; SSE-NEXT:    [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
+; SSE-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
+; SSE-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
+; SSE-NEXT:    [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
+; SSE-NEXT:    [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
+; SSE-NEXT:    [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
+; SSE-NEXT:    [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
+; SSE-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
+; SSE-NEXT:    [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
+; SSE-NEXT:    [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
+; SSE-NEXT:    [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
+; SSE-NEXT:    [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
+; SSE-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
+; SSE-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
+; SSE-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
+; SSE-NEXT:    [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>*
+; SSE-NEXT:    [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1
+; SSE-NEXT:    [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
+; SSE-NEXT:    [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>*
+; SSE-NEXT:    [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1
+; SSE-NEXT:    [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
+; SSE-NEXT:    [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>*
+; SSE-NEXT:    [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
+; SSE-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
+; SSE-NEXT:    [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>*
+; SSE-NEXT:    [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1
+; SSE-NEXT:    [[TMP38:%.*]] = icmp ult <4 x i8> [[TMP31]], [[TMP33]]
+; SSE-NEXT:    [[TMP39:%.*]] = select <4 x i1> [[TMP38]], <4 x i8> [[TMP37]], <4 x i8> [[TMP35]]
+; SSE-NEXT:    [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i32>
+; SSE-NEXT:    [[TMP41:%.*]] = mul <4 x i32> [[TMP40]], [[TMP7]]
+; SSE-NEXT:    [[TMP42:%.*]] = trunc <4 x i32> [[TMP41]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
+; SSE-NEXT:    [[TMP43:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>*
+; SSE-NEXT:    store <4 x i8> [[TMP42]], <4 x i8>* [[TMP43]], align 1
+; SSE-NEXT:    [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
+; SSE-NEXT:    [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
+; SSE-NEXT:    [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
+; SSE-NEXT:    [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
+; SSE-NEXT:    [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
+; SSE-NEXT:    [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
+; SSE-NEXT:    [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
+; SSE-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
+; SSE-NEXT:    [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
+; SSE-NEXT:    [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
+; SSE-NEXT:    [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
+; SSE-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
+; SSE-NEXT:    [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
+; SSE-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
+; SSE-NEXT:    [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
+; SSE-NEXT:    [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
+; SSE-NEXT:    [[TMP44:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>*
+; SSE-NEXT:    [[TMP45:%.*]] = load <4 x i8>, <4 x i8>* [[TMP44]], align 1
+; SSE-NEXT:    [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
+; SSE-NEXT:    [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>*
+; SSE-NEXT:    [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1
+; SSE-NEXT:    [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
+; SSE-NEXT:    [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>*
+; SSE-NEXT:    [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1
+; SSE-NEXT:    [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
+; SSE-NEXT:    [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>*
+; SSE-NEXT:    [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1
+; SSE-NEXT:    [[TMP52:%.*]] = icmp ult <4 x i8> [[TMP45]], [[TMP47]]
+; SSE-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP52]], <4 x i8> [[TMP51]], <4 x i8> [[TMP49]]
+; SSE-NEXT:    [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
+; SSE-NEXT:    [[TMP55:%.*]] = mul <4 x i32> [[TMP54]], [[TMP11]]
+; SSE-NEXT:    [[TMP56:%.*]] = trunc <4 x i32> [[TMP55]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
+; SSE-NEXT:    [[TMP57:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>*
+; SSE-NEXT:    store <4 x i8> [[TMP56]], <4 x i8>* [[TMP57]], align 1
+; SSE-NEXT:    [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
+; SSE-NEXT:    [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
+; SSE-NEXT:    [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
+; SSE-NEXT:    [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
+; SSE-NEXT:    [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
+; SSE-NEXT:    [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
+; SSE-NEXT:    [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
+; SSE-NEXT:    [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
+; SSE-NEXT:    [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
+; SSE-NEXT:    [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
+; SSE-NEXT:    [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
+; SSE-NEXT:    [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
+; SSE-NEXT:    [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
+; SSE-NEXT:    [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
+; SSE-NEXT:    [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
+; SSE-NEXT:    [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
+; SSE-NEXT:    [[TMP58:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>*
+; SSE-NEXT:    [[TMP59:%.*]] = load <4 x i8>, <4 x i8>* [[TMP58]], align 1
+; SSE-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
+; SSE-NEXT:    [[TMP60:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>*
+; SSE-NEXT:    [[TMP61:%.*]] = load <4 x i8>, <4 x i8>* [[TMP60]], align 1
+; SSE-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
+; SSE-NEXT:    [[TMP62:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>*
+; SSE-NEXT:    [[TMP63:%.*]] = load <4 x i8>, <4 x i8>* [[TMP62]], align 1
+; SSE-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
+; SSE-NEXT:    [[TMP64:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>*
+; SSE-NEXT:    [[TMP65:%.*]] = load <4 x i8>, <4 x i8>* [[TMP64]], align 1
+; SSE-NEXT:    [[TMP66:%.*]] = icmp ult <4 x i8> [[TMP59]], [[TMP61]]
+; SSE-NEXT:    [[TMP67:%.*]] = select <4 x i1> [[TMP66]], <4 x i8> [[TMP65]], <4 x i8> [[TMP63]]
+; SSE-NEXT:    [[TMP68:%.*]] = zext <4 x i8> [[TMP67]] to <4 x i32>
+; SSE-NEXT:    [[TMP69:%.*]] = mul <4 x i32> [[TMP68]], [[TMP15]]
+; SSE-NEXT:    [[TMP70:%.*]] = trunc <4 x i32> [[TMP69]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
+; SSE-NEXT:    [[TMP71:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>*
+; SSE-NEXT:    store <4 x i8> [[TMP70]], <4 x i8>* [[TMP71]], align 1
+; SSE-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
+; SSE-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
+; SSE-NEXT:    [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
+; SSE-NEXT:    [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
+; SSE-NEXT:    [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
+; SSE-NEXT:    [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
+; SSE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
+; SSE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SSE:       for.end:
+; SSE-NEXT:    ret void
+;
+; AVX512-LABEL: @bar(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1
+; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2
+; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4
+; AVX512-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5
+; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6
+; AVX512-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7
+; AVX512-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8
+; AVX512-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9
+; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11
+; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12
+; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13
+; AVX512-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14
+; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15
+; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX512:       for.body:
+; AVX512-NEXT:    [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT:    [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT:    [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT:    [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT:    [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT:    [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
+; AVX512-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
+; AVX512-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
+; AVX512-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
+; AVX512-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
+; AVX512-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
+; AVX512-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
+; AVX512-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
+; AVX512-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
+; AVX512-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
+; AVX512-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
+; AVX512-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
+; AVX512-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
+; AVX512-NEXT:    [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
+; AVX512-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
+; AVX512-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
+; AVX512-NEXT:    [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
+; AVX512-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
+; AVX512-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
+; AVX512-NEXT:    [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
+; AVX512-NEXT:    [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
+; AVX512-NEXT:    [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
+; AVX512-NEXT:    [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
+; AVX512-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
+; AVX512-NEXT:    [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
+; AVX512-NEXT:    [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
+; AVX512-NEXT:    [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
+; AVX512-NEXT:    [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
+; AVX512-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
+; AVX512-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
+; AVX512-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
+; AVX512-NEXT:    [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
+; AVX512-NEXT:    [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
+; AVX512-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
+; AVX512-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
+; AVX512-NEXT:    [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
+; AVX512-NEXT:    [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
+; AVX512-NEXT:    [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
+; AVX512-NEXT:    [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
+; AVX512-NEXT:    [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
+; AVX512-NEXT:    [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
+; AVX512-NEXT:    [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
+; AVX512-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
+; AVX512-NEXT:    [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
+; AVX512-NEXT:    [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
+; AVX512-NEXT:    [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
+; AVX512-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
+; AVX512-NEXT:    [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
+; AVX512-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
+; AVX512-NEXT:    [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
+; AVX512-NEXT:    [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
+; AVX512-NEXT:    [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
+; AVX512-NEXT:    [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
+; AVX512-NEXT:    [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
+; AVX512-NEXT:    [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
+; AVX512-NEXT:    [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
+; AVX512-NEXT:    [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
+; AVX512-NEXT:    [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
+; AVX512-NEXT:    [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
+; AVX512-NEXT:    [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
+; AVX512-NEXT:    [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
+; AVX512-NEXT:    [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
+; AVX512-NEXT:    [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
+; AVX512-NEXT:    [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
+; AVX512-NEXT:    [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
+; AVX512-NEXT:    [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
+; AVX512-NEXT:    [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
+; AVX512-NEXT:    [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
+; AVX512-NEXT:    [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
+; AVX512-NEXT:    [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
+; AVX512-NEXT:    [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
+; AVX512-NEXT:    [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
+; AVX512-NEXT:    [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1
+; AVX512-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
+; AVX512-NEXT:    [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
+; AVX512-NEXT:    [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1
+; AVX512-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
+; AVX512-NEXT:    [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
+; AVX512-NEXT:    [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1
+; AVX512-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
+; AVX512-NEXT:    [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
+; AVX512-NEXT:    [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1
+; AVX512-NEXT:    [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
+; AVX512-NEXT:    [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
+; AVX512-NEXT:    [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
+; AVX512-NEXT:    [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
+; AVX512-NEXT:    [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
+; AVX512-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
+; AVX512-NEXT:    store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1
+; AVX512-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
+; AVX512-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
+; AVX512-NEXT:    [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
+; AVX512-NEXT:    [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
+; AVX512-NEXT:    [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
+; AVX512-NEXT:    [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
+; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
+; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; AVX512:       for.end:
+; AVX512-NEXT:    ret void
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index ca74d9d71631..cda204acee68 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -51,41 +51,18 @@ define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) {
 
 define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i8(
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP0:%.*]], align 1, [[TBAA4:!tbaa !.*]]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP1:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 15
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 255
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP7]], i32 255
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[TMP9]] to i8
-; CHECK-NEXT:    store i8 [[TMP10]], i8* [[TMP0]], align 1, [[TBAA4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP11]], align 1, [[TBAA4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ult i32 [[TMP15]], 255
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP15]], i32 255
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
-; CHECK-NEXT:    store i8 [[TMP18]], i8* [[TMP11]], align 1, [[TBAA4]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP20:%.*]] = load i8, i8* [[TMP19]], align 1, [[TBAA4]]
-; CHECK-NEXT:    [[TMP21:%.*]] = zext i8 [[TMP20]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 15
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp ult i32 [[TMP23]], 255
-; CHECK-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 255
-; CHECK-NEXT:    [[TMP26:%.*]] = trunc i32 [[TMP25]] to i8
-; CHECK-NEXT:    store i8 [[TMP26]], i8* [[TMP19]], align 1, [[TBAA4]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 3
-; CHECK-NEXT:    [[TMP28:%.*]] = load i8, i8* [[TMP27]], align 1, [[TBAA4]]
-; CHECK-NEXT:    [[TMP29:%.*]] = zext i8 [[TMP28]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], [[TMP1]]
-; CHECK-NEXT:    [[TMP31:%.*]] = lshr i32 [[TMP30]], 15
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp ult i32 [[TMP31]], 255
-; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 255
-; CHECK-NEXT:    [[TMP34:%.*]] = trunc i32 [[TMP33]] to i8
-; CHECK-NEXT:    store i8 [[TMP34]], i8* [[TMP27]], align 1, [[TBAA4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0:%.*]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1, [[TBAA4:!tbaa !.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP8]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP10]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1, [[TBAA4]]
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i8, i8* %0, align 1, !tbaa !6


        


More information about the llvm-branch-commits mailing list