[llvm] 0108a59 - [SLP]Fix a crash on an subvector size calculation for non-power-of-2 vector

Tue Jun 17 08:58:21 PDT 2025

Author: Alexey Bataev
Date: 2025-06-17T08:58:07-07:00
New Revision: 0108a5908cab5e418c683ef9b6e1810755344b5e

URL: https://github.com/llvm/llvm-project/commit/0108a5908cab5e418c683ef9b6e1810755344b5e
DIFF: https://github.com/llvm/llvm-project/commit/0108a5908cab5e418c683ef9b6e1810755344b5e.diff

LOG: [SLP]Fix a crash on an subvector size calculation for non-power-of-2 vector

This patch fixes the cost estimation for extractelements from
non-power-of-2 vectors that are modeled as subvector extracts. In this
case the subvector size might not be adjusted to a whole register size,
so we need to take the minimum of the per-register element count and the
number of elements remaining after the extract index (NumElts - Index)
to prevent a compiler crash.

Fixes #143513

Added: 
    llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
    llvm/test/Transforms/PhaseOrdering/X86/hsub.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d811e9d77d183..4551a365a6967 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12085,7 +12085,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     // FIXME: this must be moved to TTI for better estimation.
     unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
     auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
-                                        SmallVectorImpl<unsigned> &Indices)
+                                        SmallVectorImpl<unsigned> &Indices,
+                                        SmallVectorImpl<unsigned> &SubVecSizes)
         -> std::optional<TTI::ShuffleKind> {
       if (NumElts <= EltsPerVector)
         return std::nullopt;
@@ -12130,7 +12131,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                       return std::min(S, I);
                     }),
                 EltsPerVector);
-            Indices.push_back(OffsetReg1 % NumElts);
+            unsigned Index = OffsetReg1 % NumElts;
+            Indices.push_back(Index);
+            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
           }
           Idx = I - OffsetReg1;
         }
@@ -12152,8 +12155,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
       copy(MaskSlice, SubMask.begin());
       SmallVector<unsigned, 2> Indices;
+      SmallVector<unsigned, 2> SubVecSizes;
       std::optional<TTI::ShuffleKind> RegShuffleKind =
-          CheckPerRegistersShuffle(SubMask, Indices);
+          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
       if (!RegShuffleKind) {
         if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
             !ShuffleVectorInst::isIdentityMask(
@@ -12171,12 +12175,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       }
       const unsigned BaseVF = getFullVectorNumberOfElements(
           *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
-      for (unsigned Idx : Indices) {
-        assert((Idx + EltsPerVector) <= BaseVF &&
+      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
+        assert((Idx + SubVecSize) <= BaseVF &&
                "SK_ExtractSubvector index out of range");
         Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                  getWidenedType(ScalarTy, BaseVF), {}, CostKind,
-                                 Idx, getWidenedType(ScalarTy, EltsPerVector));
+                                 Idx, getWidenedType(ScalarTy, SubVecSize));
       }
       // Second attempt to check, if just a permute is better estimated than
       // subvector extract.

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 28b48bd3ce6d9..9bfd92ef35a46 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -272,24 +272,21 @@ define <16 x i16> @add_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
-; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE2-NEXT:    [[BCD:%.*]] = add i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[HADDD1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HADDE:%.*]] = insertelement <16 x i16> [[HADDD1]], i16 [[BCD]], i64 14
-; SSE2-NEXT:    [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:    [[HADD92:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADDB:%.*]] = insertelement <16 x i16> [[HADD92]], i16 [[BEF]], i64 11
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 0062527b678c9..13b4d7da97c9d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -272,24 +272,21 @@ define <16 x i16> @sub_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
-; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE2-NEXT:    [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[HSUBD1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBD1]], i16 [[BCD]], i64 14
-; SSE2-NEXT:    [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:    [[HSUB92:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUBB:%.*]] = insertelement <16 x i16> [[HSUB92]], i16 [[BEF]], i64 11
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(

diff  --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll
new file mode 100644
index 0000000000000..6006bf9cb262d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-ibm-linux -mcpu=z13 -slp-max-reg-size=256 -slp-vectorize-hor-store -slp-vectorize-non-power-of-2 < %s | FileCheck %s
+
+ at c = external global [1 x [10 x i32]]
+ at j.0 = external global i32
+
+define void @p() {
+; CHECK-LABEL: define void @p(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP0]], splat (i32 1)
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <7 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <7 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[OR_1_5_I_3:%.*]] = or i32 [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    store i32 [[OR_1_5_I_3]], ptr @j.0, align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <7 x i32> [[TMP4]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <7 x i32> [[TMP4]], splat (i32 1)
+; CHECK-NEXT:    store <7 x i32> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    [[OR_1_5_I_5:%.*]] = or i32 [[TMP10]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; CHECK-NEXT:    [[OR_1_6_I_5:%.*]] = or i32 [[OR_1_5_I_5]], [[TMP11]]
+; CHECK-NEXT:    store i32 [[OR_1_6_I_5]], ptr @j.0, align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP8]], splat (i32 1)
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx12.promoted.5.i = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+  %conv14.5.i = xor i32 %arrayidx12.promoted.5.i, 1
+  store i32 %conv14.5.i, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+  %arrayidx12.promoted.5.i.1 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4
+  %conv14.5.i.1 = xor i32 %arrayidx12.promoted.5.i.1, 1
+  store i32 %conv14.5.i.1, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4
+  %arrayidx12.promoted.5.i.2 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4
+  %conv14.5.i.2 = xor i32 %arrayidx12.promoted.5.i.2, 1
+  store i32 %conv14.5.i.2, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4
+  %arrayidx12.promoted.1.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+  %conv14.1.i.3 = xor i32 %arrayidx12.promoted.1.i.3, 1
+  store i32 %conv14.1.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+  %arrayidx12.promoted.5.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4
+  %conv14.5.i.3 = xor i32 %arrayidx12.promoted.5.i.3, 1
+  store i32 %conv14.5.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4
+  %arrayidx12.promoted.6.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+  %conv14.6.i.3 = xor i32 %arrayidx12.promoted.6.i.3, 1
+  %or.1.5.i.3 = or i32 %arrayidx12.promoted.1.i.3, %arrayidx12.promoted.5.i.3
+  store i32 %conv14.6.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+  store i32 %or.1.5.i.3, ptr @j.0, align 4
+  %arrayidx12.promoted.1.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4
+  %conv14.1.i.4 = xor i32 %arrayidx12.promoted.1.i.4, 1
+  store i32 %conv14.1.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4
+  %arrayidx12.promoted.5.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4
+  %conv14.5.i.4 = xor i32 %arrayidx12.promoted.5.i.4, 1
+  store i32 %conv14.5.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4
+  %arrayidx12.promoted.6.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4
+  %conv14.6.i.4 = xor i32 %arrayidx12.promoted.6.i.4, 1
+  store i32 %conv14.6.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4
+  %arrayidx12.promoted.1.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4
+  %conv14.1.i.5 = xor i32 %arrayidx12.promoted.1.i.5, 1
+  store i32 %conv14.1.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4
+  %arrayidx12.promoted.5.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4
+  %conv14.5.i.5 = xor i32 %arrayidx12.promoted.5.i.5, 1
+  store i32 %conv14.5.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4
+  %arrayidx12.promoted.6.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4
+  %conv14.6.i.5 = xor i32 %arrayidx12.promoted.6.i.5, 1
+  %0 = or i32 %arrayidx12.promoted.6.i.4, %arrayidx12.promoted.1.i.5
+  %or.1.5.i.5 = or i32 %0, %arrayidx12.promoted.5.i.5
+  %or.1.6.i.5 = or i32 %or.1.5.i.5, %arrayidx12.promoted.6.i.5
+  store i32 %conv14.6.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4
+  store i32 %or.1.6.i.5, ptr @j.0, align 4
+  %arrayidx12.promoted.1.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4
+  %conv14.1.i.6 = xor i32 %arrayidx12.promoted.1.i.6, 1
+  store i32 %conv14.1.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4
+  %arrayidx12.promoted.5.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4
+  %conv14.5.i.6 = xor i32 %arrayidx12.promoted.5.i.6, 1
+  store i32 %conv14.5.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4
+  %arrayidx12.promoted.6.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4
+  %conv14.6.i.6 = xor i32 %arrayidx12.promoted.6.i.6, 1
+  store i32 %conv14.6.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4
+  ret void
+}