[llvm] [VectorCombine] foldShuffleOfShuffles - fold "shuffle (shuffle x, undef), (shuffle y, undef)" -> "shuffle x, y" (PR #88743)
    Simon Pilgrim via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Mon Apr 22 04:10:26 PDT 2024
    
    
  
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/88743
>From 3db9106d0a81c58ea1f0ad7fd160ae0a15e35233 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 15 Apr 2024 15:00:18 +0100
Subject: [PATCH] [VectorCombine] foldShuffleOfShuffles - fold "shuffle
 (shuffle x, undef), (shuffle y, undef)" -> "shuffle x, y"
Another step towards cleaning up shuffles that have been split, often across bitcasts between SSE intrinsic.
Strip shuffles entirely if we fold to an identity shuffle.
---
 llvm/include/llvm/IR/PatternMatch.h           |   8 +
 .../Transforms/Vectorize/VectorCombine.cpp    |  82 +++++++++
 .../AArch64/interleavevectorization.ll        | 158 ++++++++----------
 .../VectorCombine/AArch64/select-shuffle.ll   |  16 +-
 .../AArch64/shuffletoidentity.ll              |   5 +-
 .../Transforms/VectorCombine/X86/pr67803.ll   |   5 +-
 .../VectorCombine/X86/shuffle-of-shuffles.ll  |  14 +-
 7 files changed, 167 insertions(+), 121 deletions(-)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 1b418a98545414..1fee1901fabb65 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -151,6 +151,11 @@ struct undef_match {
 /// neither undef nor poison, the aggregate is not matched.
 inline auto m_Undef() { return undef_match(); }
 
+/// Match an arbitrary UndefValue constant.
+inline class_match<UndefValue> m_UndefValue() {
+  return class_match<UndefValue>();
+}
+
 /// Match an arbitrary poison constant.
 inline class_match<PoisonValue> m_Poison() {
   return class_match<PoisonValue>();
@@ -777,6 +782,9 @@ m_WithOverflowInst(const WithOverflowInst *&I) {
   return I;
 }
 
+/// Match an UndefValue, capturing the value if we match.
+inline bind_ty<UndefValue> m_UndefValue(UndefValue *&U) { return U; }
+
 /// Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index f23b10540338dd..df761f9b711a10 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -113,6 +113,7 @@ class VectorCombine {
   bool scalarizeLoadExtract(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleOfCastops(Instruction &I);
+  bool foldShuffleOfShuffles(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldTruncFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1552,6 +1553,86 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   return true;
 }
 
+/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
+/// into "shuffle x, y".
+bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
+  Value *V0, *V1;
+  UndefValue *U0, *U1;
+  ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
+  if (!match(&I, m_Shuffle(m_OneUse(m_Shuffle(m_Value(V0), m_UndefValue(U0),
+                                              m_Mask(InnerMask0))),
+                           m_OneUse(m_Shuffle(m_Value(V1), m_UndefValue(U1),
+                                              m_Mask(InnerMask1))),
+                           m_Mask(OuterMask))))
+    return false;
+
+  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+  auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
+  auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
+  if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
+      V0->getType() != V1->getType())
+    return false;
+
+  unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
+  unsigned NumImmElts = ShuffleImmTy->getNumElements();
+
+  // Bail if either inner masks reference a RHS undef arg.
+  if ((!isa<PoisonValue>(U0) &&
+       any_of(InnerMask0, [&](int M) { return M >= (int)NumSrcElts; })) ||
+      (!isa<PoisonValue>(U1) &&
+       any_of(InnerMask1, [&](int M) { return M >= (int)NumSrcElts; })))
+    return false;
+
+  // Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem,
+  SmallVector<int, 16> NewMask(OuterMask.begin(), OuterMask.end());
+  for (int &M : NewMask) {
+    if (0 <= M && M < (int)NumImmElts) {
+      M = (InnerMask0[M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0[M];
+    } else if (M >= (int)NumImmElts) {
+      if (InnerMask1[M - NumImmElts] >= (int)NumSrcElts)
+        M = PoisonMaskElem;
+      else
+        M = InnerMask1[M - NumImmElts] + (V0 == V1 ? 0 : NumSrcElts);
+    }
+  }
+
+  // Have we folded to an Identity shuffle?
+  if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
+    replaceValue(I, *V0);
+    return true;
+  }
+
+  // Try to merge the shuffles if the new shuffle is not costly.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  InstructionCost OldCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
+                         InnerMask0, CostKind) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
+                         InnerMask1, CostKind) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
+                         OuterMask, CostKind, 0, nullptr, std::nullopt, &I);
+
+  InstructionCost NewCost = TTI.getShuffleCost(
+      TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, NewMask, CostKind);
+
+  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
+                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+  if (NewCost > OldCost)
+    return false;
+
+  // Clear unused sources to poison.
+  if (none_of(NewMask, [&](int M) { return 0 <= M && M < (int)NumSrcElts; }))
+    V0 = PoisonValue::get(ShuffleSrcTy);
+  if (none_of(NewMask, [&](int M) { return (int)NumSrcElts <= M; }))
+    V1 = PoisonValue::get(ShuffleSrcTy);
+
+  Value *Shuf = Builder.CreateShuffleVector(V0, V1, NewMask);
+  replaceValue(I, *Shuf);
+  return true;
+}
+
 /// Given a commutative reduction, the order of the input lanes does not alter
 /// the results. We can use this to remove certain shuffles feeding the
 /// reduction, removing the need to shuffle at all.
@@ -2107,6 +2188,7 @@ bool VectorCombine::run() {
       case Instruction::ShuffleVector:
         MadeChange |= foldShuffleOfBinops(I);
         MadeChange |= foldShuffleOfCastops(I);
+        MadeChange |= foldShuffleOfShuffles(I);
         MadeChange |= foldSelectShuffle(I);
         break;
       case Instruction::BitCast:
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
index b49c0ad9976654..f1d7c0e0c41233 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S -o - %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
@@ -23,22 +23,18 @@ define void @add4(ptr noalias noundef %x, ptr noalias noundef %y, i32 noundef %n
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> [[TMP12]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP13]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -156,22 +152,18 @@ define void @addsubs(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> [[TMP12]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP13]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -289,22 +281,18 @@ define void @add2sub2(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> [[TMP12]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP13]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -425,25 +413,21 @@ define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <32 x i16> [[TMP6]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP7]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <32 x i16> [[TMP9]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP13:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <32 x i16> [[TMP13]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i16> [[TMP14]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> [[TMP17]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[TMP5]], [[WIDE_VEC36]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <32 x i16> [[TMP7]], [[WIDE_VEC36]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <32 x i16> [[TMP10]], [[WIDE_VEC36]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP14]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -597,25 +581,21 @@ define void @addsubsmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z,
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP7]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <32 x i16> [[TMP9]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP13:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i16> [[TMP14]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> [[TMP17]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <32 x i16> [[TMP7]], [[WIDE_VEC36]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP10]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP14]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -769,25 +749,21 @@ define void @add2sub2mul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z,
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <32 x i16> [[TMP6]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP7]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP13:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i16> [[TMP14]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> [[TMP17]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[TMP5]], [[WIDE_VEC36]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP10]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP14]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
index b27c026dbccf0b..b49f3c9f3eeb27 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -741,18 +741,14 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -760,18 +756,14 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
 ; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <16 x i8> [[TMP41]], <16 x i8> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 56b42380a68bfb..4cc7d5f253289b 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -5,10 +5,7 @@ target triple = "aarch64"
 
 define <8 x i8> @trivial(<8 x i8> %a) {
 ; CHECK-LABEL: @trivial(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[AT]], <4 x i8> [[AB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <8 x i8> [[R]]
+; CHECK-NEXT:    ret <8 x i8> [[R:%.*]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %at = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
diff --git a/llvm/test/Transforms/VectorCombine/X86/pr67803.ll b/llvm/test/Transforms/VectorCombine/X86/pr67803.ll
index 69fd6f6a10e2a6..0277580d21fcb7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/pr67803.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/pr67803.ll
@@ -6,10 +6,7 @@ define <4 x i64> @PR67803(<8 x i32> %x, <8 x i32> %y, <8 x float> %a, <8 x float
 ; CHECK-LABEL: @PR67803(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[CMP_LO:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[CMP_HI:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i1> [[CMP_LO]], <4 x i1> [[CMP_HI]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
 ; CHECK-NEXT:    [[CONCAT:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[MASK:%.*]] = bitcast <4 x i64> [[CONCAT]] to <8 x float>
 ; CHECK-NEXT:    [[SEL:%.*]] = tail call noundef <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[MASK]])
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index b5b5bb997c6c7a..9079a4f693ab8b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -2,15 +2,12 @@
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
 
-; TODO: fold to identity
+; fold to identity
 
 define <8 x i32> @concat_extract_subvectors(<8 x i32> %x) {
 ; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors(
 ; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[CONCAT]]
+; CHECK-NEXT:    ret <8 x i32> [[X]]
 ;
   %lo = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -34,15 +31,12 @@ define <8 x i32> @concat_extract_subvectors_undef(<8 x i32> %x) {
   ret <8 x i32> %concat
 }
 
-; negative test - shuffle contains poision
+; shuffle contains poison
 
 define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
 ; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors_poison(
 ; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 8>
-; CHECK-NEXT:    [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 8>
-; CHECK-NEXT:    [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[CONCAT]]
+; CHECK-NEXT:    ret <8 x i32> [[X]]
 ;
   %lo = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 8>
   %hi = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 8>
    
    
More information about the llvm-commits
mailing list