[llvm] [SLP]Fix/improve potential masked gather loads analysis. (PR #83472)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 1 04:38:18 PST 2024


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/83472

>From 26746a7e79156e7f94cd3a447fa97d3da3386ed1 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 29 Feb 2024 20:02:03 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   6 +-
 .../SLPVectorizer/RISCV/complex-loads.ll      | 626 +++++++++---------
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  65 +-
 3 files changed, 354 insertions(+), 343 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 94b7c4952f055e..df48fc1f7cf8dd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4102,12 +4102,14 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     bool ProfitableGatherPointers =
         static_cast<unsigned>(count_if(
             PointerOps,
-            [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
+            [L](Value *V) { return L && L->isLoopInvariant(V); })) >= Sz / 2 &&
         Sz > 2;
     if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
           auto *GEP = dyn_cast<GetElementPtrInst>(P);
           return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
-                 (GEP && GEP->getNumOperands() == 2);
+                 (GEP && GEP->getNumOperands() == 2 &&
+                  (isa<Constant>(GEP->getOperand(1)) ||
+                   isa<Instruction>(GEP->getOperand(1))));
         })) {
       Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
       if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index e167b6a47af592..97008ac6785961 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -17,353 +17,359 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr i8, ptr [[PIX1]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2
 ; CHECK-NEXT:    [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
 ; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ADD_PTR644]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
-; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP12]] to i32
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR_1]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x ptr> [[TMP11]], ptr [[ARRAYIDX20_2]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP12]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP13]] to <2 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR64_1]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x ptr> [[TMP15]], ptr [[ARRAYIDX22_2]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP16]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP14]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x ptr> [[TMP12]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP21]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x ptr> [[TMP16]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP26:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP25]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = sub <2 x i32> [[TMP23]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP19]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> poison, i8 [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> poison, i8 [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i8> [[TMP20]], i8 [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP19]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR_1]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x ptr> [[TMP24]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT:    [[TMP27:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP26]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR64_1]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x ptr> [[TMP29]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> <i64 4, i64 6>
 ; CHECK-NEXT:    [[TMP32:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP31]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT:    [[TMP35:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP34]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP36:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
-; CHECK-NEXT:    [[TMP37:%.*]] = sub <2 x i32> [[TMP33]], [[TMP36]]
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP39:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP38]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32>
-; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP42:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP41]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP43:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
-; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP40]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = shl <2 x i32> [[TMP44]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP46:%.*]] = add <2 x i32> [[TMP45]], [[TMP37]]
-; CHECK-NEXT:    [[TMP47:%.*]] = sub <2 x i32> [[TMP30]], [[TMP46]]
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = sub <2 x i32> [[TMP28]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = shl <2 x i32> [[TMP34]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP36:%.*]] = add <2 x i32> [[TMP35]], [[TMP23]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP37]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP40]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT:    [[TMP43:%.*]] = sub <2 x i32> [[TMP39]], [[TMP42]]
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP45:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP44]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP46:%.*]] = zext <2 x i8> [[TMP45]] to <2 x i32>
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP48:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP47]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32>
+; CHECK-NEXT:    [[TMP50:%.*]] = sub <2 x i32> [[TMP46]], [[TMP49]]
+; CHECK-NEXT:    [[TMP51:%.*]] = shl <2 x i32> [[TMP50]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP52:%.*]] = add <2 x i32> [[TMP51]], [[TMP43]]
+; CHECK-NEXT:    [[TMP53:%.*]] = sub <2 x i32> [[TMP36]], [[TMP52]]
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <2 x i32> [[TMP53]], i32 0
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i32> [[TMP53]], i32 1
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
-; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
-; CHECK-NEXT:    [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT:    [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP57:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
+; CHECK-NEXT:    [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP58]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]]
-; CHECK-NEXT:    [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT:    [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; CHECK-NEXT:    [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]]
-; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0
-; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
+; CHECK-NEXT:    [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP61]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; CHECK-NEXT:    [[TMP64:%.*]] = sub <2 x i32> [[TMP60]], [[TMP63]]
+; CHECK-NEXT:    [[TMP65:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
+; CHECK-NEXT:    [[TMP72:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
-; CHECK-NEXT:    [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT:    [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]]
-; CHECK-NEXT:    [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]]
-; CHECK-NEXT:    [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]]
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]]
-; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
-; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
-; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
-; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0
-; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
-; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]]
-; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]]
-; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15
+; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <2 x i8> poison, i8 [[TMP57]], i32 0
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <2 x i8> [[TMP77]], i8 [[TMP56]], i32 1
+; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
+; CHECK-NEXT:    [[TMP80:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP81:%.*]] = zext <2 x i8> [[TMP80]] to <2 x i32>
+; CHECK-NEXT:    [[TMP82:%.*]] = sub <2 x i32> [[TMP79]], [[TMP81]]
+; CHECK-NEXT:    [[TMP83:%.*]] = shl <2 x i32> [[TMP82]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP84:%.*]] = add <2 x i32> [[TMP83]], [[TMP76]]
+; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP71]], [[TMP84]]
+; CHECK-NEXT:    [[TMP86:%.*]] = shufflevector <2 x i32> [[TMP84]], <2 x i32> [[TMP52]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP87:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP36]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP88:%.*]] = add <2 x i32> [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[TMP89:%.*]] = shufflevector <2 x i32> [[TMP84]], <2 x i32> [[TMP52]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP90:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP36]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP91:%.*]] = add <2 x i32> [[TMP89]], [[TMP90]]
+; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP91]], [[TMP88]]
+; CHECK-NEXT:    [[TMP93:%.*]] = sub <2 x i32> [[TMP88]], [[TMP91]]
+; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP95]], [[TMP94]]
+; CHECK-NEXT:    [[TMP96:%.*]] = extractelement <2 x i32> [[TMP92]], i32 0
+; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <2 x i32> [[TMP92]], i32 1
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[TMP96]], [[TMP97]]
+; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[TMP97]], [[TMP96]]
+; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP98]], 15
 ; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
 ; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
-; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15
+; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
+; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP99]], 15
 ; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
 ; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
-; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
-; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]]
-; CHECK-NEXT:    [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
-; CHECK-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
-; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
-; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT:    [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT:    [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
-; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT:    [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT:    [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP100:%.*]] = extractelement <2 x i32> [[TMP93]], i32 0
+; CHECK-NEXT:    [[TMP101:%.*]] = extractelement <2 x i32> [[TMP93]], i32 1
+; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP100]], [[TMP101]]
+; CHECK-NEXT:    [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
+; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
+; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
+; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP106:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
+; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP108:%.*]] = add <2 x i32> [[TMP105]], [[TMP107]]
+; CHECK-NEXT:    [[TMP109:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]]
+; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP109]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP111:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
 ; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT:    [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
-; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]]
-; CHECK-NEXT:    [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
+; CHECK-NEXT:    [[TMP115:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP114]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP116:%.*]] = zext <2 x i8> [[TMP115]] to <2 x i32>
+; CHECK-NEXT:    [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
-; CHECK-NEXT:    [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
-; CHECK-NEXT:    [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32>
-; CHECK-NEXT:    [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]]
-; CHECK-NEXT:    [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]]
-; CHECK-NEXT:    [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]]
-; CHECK-NEXT:    [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]]
-; CHECK-NEXT:    [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]]
-; CHECK-NEXT:    [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1
-; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1
-; CHECK-NEXT:    [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]]
-; CHECK-NEXT:    [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]]
-; CHECK-NEXT:    [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0
-; CHECK-NEXT:    [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0
-; CHECK-NEXT:    [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]]
-; CHECK-NEXT:    [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP137:%.*]] = and <2 x i32> [[TMP136]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
-; CHECK-NEXT:    [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
-; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]]
-; CHECK-NEXT:    [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT:    [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32>
-; CHECK-NEXT:    [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT:    [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT:    [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1
-; CHECK-NEXT:    [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
-; CHECK-NEXT:    [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT:    [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP121:%.*]] = sub <2 x i32> [[TMP118]], [[TMP120]]
+; CHECK-NEXT:    [[TMP122:%.*]] = shl <2 x i32> [[TMP121]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP123:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32>
+; CHECK-NEXT:    [[TMP125:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32>
+; CHECK-NEXT:    [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP129:%.*]] = sub <2 x i32> [[TMP126]], [[TMP128]]
+; CHECK-NEXT:    [[TMP130:%.*]] = shl <2 x i32> [[TMP129]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP132:%.*]] = sub <2 x i32> [[TMP131]], [[TMP116]]
+; CHECK-NEXT:    [[TMP133:%.*]] = add <2 x i32> [[TMP122]], [[TMP132]]
+; CHECK-NEXT:    [[TMP134:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP134]], [[TMP124]]
+; CHECK-NEXT:    [[TMP136:%.*]] = add <2 x i32> [[TMP130]], [[TMP135]]
+; CHECK-NEXT:    [[TMP137:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
+; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1
+; CHECK-NEXT:    [[ADD46:%.*]] = add i32 [[TMP138]], [[TMP137]]
+; CHECK-NEXT:    [[TMP139:%.*]] = sub <2 x i32> [[TMP133]], [[TMP136]]
+; CHECK-NEXT:    [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
+; CHECK-NEXT:    [[TMP141:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0
+; CHECK-NEXT:    [[ADD44:%.*]] = add i32 [[TMP141]], [[TMP140]]
+; CHECK-NEXT:    [[TMP142:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP143:%.*]] = and <2 x i32> [[TMP142]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP144:%.*]] = mul <2 x i32> [[TMP143]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP139]], i32 0
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP139]], i32 1
+; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP145]], [[TMP146]]
+; CHECK-NEXT:    [[TMP147:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32>
+; CHECK-NEXT:    [[TMP149:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP150:%.*]] = insertelement <2 x i8> [[TMP149]], i8 [[TMP11]], i32 1
 ; CHECK-NEXT:    [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32>
-; CHECK-NEXT:    [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
-; CHECK-NEXT:    [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]]
-; CHECK-NEXT:    [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT:    [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
-; CHECK-NEXT:    [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 x i32>
-; CHECK-NEXT:    [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
+; CHECK-NEXT:    [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP152]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT:    [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32>
+; CHECK-NEXT:    [[TMP157:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
+; CHECK-NEXT:    [[TMP158:%.*]] = shufflevector <2 x ptr> [[TMP157]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP158]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT:    [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
+; CHECK-NEXT:    [[TMP162:%.*]] = sub <2 x i32> [[TMP156]], [[TMP161]]
+; CHECK-NEXT:    [[TMP163:%.*]] = shl <2 x i32> [[TMP162]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP158]], <2 x i64> <i64 1, i64 3>
 ; CHECK-NEXT:    [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
-; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
-; CHECK-NEXT:    [[TMP168:%.*]] = shl <2 x i32> [[TMP167]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
-; CHECK-NEXT:    [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
-; CHECK-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
-; CHECK-NEXT:    [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
-; CHECK-NEXT:    [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
-; CHECK-NEXT:    [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
-; CHECK-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
-; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
-; CHECK-NEXT:    [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
-; CHECK-NEXT:    [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
-; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
-; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
+; CHECK-NEXT:    [[TMP167:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP168:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP167]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP169:%.*]] = zext <2 x i8> [[TMP168]] to <2 x i32>
+; CHECK-NEXT:    [[TMP170:%.*]] = getelementptr i8, <2 x ptr> [[TMP158]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT:    [[TMP171:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP170]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32>
+; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP169]], [[TMP172]]
+; CHECK-NEXT:    [[TMP174:%.*]] = shl <2 x i32> [[TMP173]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP175:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT:    [[TMP176:%.*]] = sub <2 x i32> [[TMP175]], [[TMP166]]
+; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP174]], [[TMP176]]
+; CHECK-NEXT:    [[TMP178:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP179:%.*]] = sub <2 x i32> [[TMP178]], [[TMP151]]
+; CHECK-NEXT:    [[TMP180:%.*]] = add <2 x i32> [[TMP163]], [[TMP179]]
+; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP177]], [[TMP180]]
+; CHECK-NEXT:    [[TMP182:%.*]] = sub <2 x i32> [[TMP180]], [[TMP177]]
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP183]], [[TMP184]]
+; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP182]], <2 x i32> [[TMP139]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP182]], <2 x i32> [[TMP139]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP185]], [[TMP186]]
+; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
+; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP188]], [[TMP189]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP184]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT:    [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
-; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
-; CHECK-NEXT:    [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP193:%.*]] = and <2 x i32> [[TMP192]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP194:%.*]] = mul <2 x i32> [[TMP193]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT:    [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT:    [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP201:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
-; CHECK-NEXT:    [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT:    [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT:    [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
-; CHECK-NEXT:    [[TMP206:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
-; CHECK-NEXT:    [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT:    [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT:    [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT:    [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT:    [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP217:%.*]] = and <2 x i32> [[TMP216]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP218:%.*]] = mul <2 x i32> [[TMP217]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
-; CHECK-NEXT:    [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
-; CHECK-NEXT:    [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT:    [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
-; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
-; CHECK-NEXT:    [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT:    [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
-; CHECK-NEXT:    [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT:    [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP234:%.*]] = and <2 x i32> [[TMP233]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP235:%.*]] = mul <2 x i32> [[TMP234]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT:    [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
-; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
-; CHECK-NEXT:    [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
+; CHECK-NEXT:    [[TMP190:%.*]] = lshr <2 x i32> [[TMP148]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP191:%.*]] = and <2 x i32> [[TMP190]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP192:%.*]] = mul <2 x i32> [[TMP191]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP193:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
+; CHECK-NEXT:    [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP193]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP195:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
+; CHECK-NEXT:    [[TMP196:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP195]], [[TMP196]]
+; CHECK-NEXT:    [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP198:%.*]] = lshr <2 x i32> [[TMP197]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP199:%.*]] = and <2 x i32> [[TMP198]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP200:%.*]] = mul <2 x i32> [[TMP199]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT:    [[TMP202:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP203:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT:    [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
+; CHECK-NEXT:    [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP207:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
+; CHECK-NEXT:    [[TMP208:%.*]] = lshr <2 x i32> [[TMP206]], [[TMP207]]
+; CHECK-NEXT:    [[TMP209:%.*]] = sub <2 x i32> [[TMP206]], [[TMP207]]
+; CHECK-NEXT:    [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP211:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP211]]
+; CHECK-NEXT:    [[TMP212:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
+; CHECK-NEXT:    [[TMP213:%.*]] = and <2 x i32> [[TMP210]], [[TMP212]]
+; CHECK-NEXT:    [[TMP214:%.*]] = sub <2 x i32> [[TMP210]], [[TMP212]]
+; CHECK-NEXT:    [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP216:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT:    [[TMP217:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP218:%.*]] = add <2 x i32> [[TMP217]], [[TMP204]]
+; CHECK-NEXT:    [[TMP219:%.*]] = sub <2 x i32> [[TMP217]], [[TMP204]]
+; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP218]], <2 x i32> [[TMP219]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> [[TMP139]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP222:%.*]] = lshr <2 x i32> [[TMP221]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP223:%.*]] = and <2 x i32> [[TMP222]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP224:%.*]] = mul <2 x i32> [[TMP223]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP93]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP226:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP187]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP93]], <2 x i32> [[TMP187]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP228:%.*]] = sub <2 x i32> [[TMP226]], [[TMP227]]
+; CHECK-NEXT:    [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP53]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP230:%.*]] = insertelement <2 x i32> [[TMP229]], i32 [[ADD46]], i32 1
+; CHECK-NEXT:    [[TMP231:%.*]] = insertelement <2 x i32> [[TMP53]], i32 [[ADD44]], i32 1
+; CHECK-NEXT:    [[TMP232:%.*]] = add <2 x i32> [[TMP230]], [[TMP231]]
+; CHECK-NEXT:    [[TMP233:%.*]] = shufflevector <2 x i32> [[TMP85]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP234:%.*]] = shufflevector <2 x i32> [[TMP85]], <2 x i32> [[TMP181]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP235:%.*]] = add <2 x i32> [[TMP233]], [[TMP234]]
+; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP232]], i32 0
+; CHECK-NEXT:    [[TMP237:%.*]] = extractelement <2 x i32> [[TMP235]], i32 0
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP237]], [[TMP236]]
+; CHECK-NEXT:    [[TMP238:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[ADD46]], i32 1
+; CHECK-NEXT:    [[TMP239:%.*]] = lshr <2 x i32> [[TMP238]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP240:%.*]] = and <2 x i32> [[TMP239]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP241:%.*]] = mul <2 x i32> [[TMP240]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP242:%.*]] = extractelement <2 x i32> [[TMP232]], i32 1
+; CHECK-NEXT:    [[TMP243:%.*]] = extractelement <2 x i32> [[TMP235]], i32 1
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[TMP243]], [[TMP242]]
+; CHECK-NEXT:    [[TMP244:%.*]] = sub <2 x i32> [[TMP232]], [[TMP235]]
 ; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT:    [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
-; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
+; CHECK-NEXT:    [[TMP245:%.*]] = extractelement <2 x i32> [[TMP244]], i32 1
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP245]]
 ; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP98]]
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP99]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP184]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT:    [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
-; CHECK-NEXT:    [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT:    [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT:    [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
-; CHECK-NEXT:    [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
-; CHECK-NEXT:    [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
-; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
-; CHECK-NEXT:    [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT:    [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT:    [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT:    [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
-; CHECK-NEXT:    [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
-; CHECK-NEXT:    [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
-; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
-; CHECK-NEXT:    [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
-; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
-; CHECK-NEXT:    [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
-; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
-; CHECK-NEXT:    [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT:    [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT:    [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
-; CHECK-NEXT:    [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
-; CHECK-NEXT:    [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
-; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
-; CHECK-NEXT:    [[TMP266:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP222]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT:    [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT:    [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
-; CHECK-NEXT:    [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
-; CHECK-NEXT:    [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
-; CHECK-NEXT:    [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
-; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
+; CHECK-NEXT:    [[TMP246:%.*]] = shufflevector <2 x i32> [[TMP228]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP247:%.*]] = insertelement <2 x i32> [[TMP246]], i32 [[SUB102]], i32 1
+; CHECK-NEXT:    [[TMP248:%.*]] = add <2 x i32> [[TMP244]], [[TMP247]]
+; CHECK-NEXT:    [[TMP249:%.*]] = sub <2 x i32> [[TMP244]], [[TMP247]]
+; CHECK-NEXT:    [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> [[TMP249]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP251:%.*]] = add <2 x i32> [[TMP241]], [[TMP250]]
+; CHECK-NEXT:    [[TMP252:%.*]] = xor <2 x i32> [[TMP251]], [[TMP238]]
+; CHECK-NEXT:    [[TMP253:%.*]] = extractelement <2 x i32> [[TMP252]], i32 1
+; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP253]]
+; CHECK-NEXT:    [[TMP254:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT:    [[TMP255:%.*]] = shufflevector <2 x i32> [[TMP254]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP256:%.*]] = add <2 x i32> [[TMP202]], [[TMP255]]
+; CHECK-NEXT:    [[TMP257:%.*]] = sub <2 x i32> [[TMP202]], [[TMP255]]
+; CHECK-NEXT:    [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP256]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP259:%.*]] = add <2 x i32> [[TMP200]], [[TMP258]]
+; CHECK-NEXT:    [[TMP260:%.*]] = xor <2 x i32> [[TMP259]], [[TMP197]]
+; CHECK-NEXT:    [[TMP261:%.*]] = extractelement <2 x i32> [[TMP252]], i32 0
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[TMP261]], [[ADD113]]
+; CHECK-NEXT:    [[TMP262:%.*]] = extractelement <2 x i32> [[TMP260]], i32 0
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP262]]
+; CHECK-NEXT:    [[TMP263:%.*]] = extractelement <2 x i32> [[TMP260]], i32 1
+; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP263]]
+; CHECK-NEXT:    [[TMP264:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP265:%.*]] = shufflevector <2 x i32> [[TMP264]], <2 x i32> [[TMP244]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP228]], [[TMP265]]
+; CHECK-NEXT:    [[TMP267:%.*]] = sub <2 x i32> [[TMP228]], [[TMP265]]
+; CHECK-NEXT:    [[TMP268:%.*]] = shufflevector <2 x i32> [[TMP266]], <2 x i32> [[TMP267]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP269:%.*]] = add <2 x i32> [[TMP224]], [[TMP268]]
+; CHECK-NEXT:    [[TMP270:%.*]] = xor <2 x i32> [[TMP269]], [[TMP221]]
+; CHECK-NEXT:    [[TMP271:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP271]]
+; CHECK-NEXT:    [[TMP272:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP228]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP273:%.*]] = mul <2 x i32> [[TMP215]], [[TMP272]]
+; CHECK-NEXT:    [[TMP274:%.*]] = sub <2 x i32> [[TMP215]], [[TMP272]]
+; CHECK-NEXT:    [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP273]], <2 x i32> [[TMP274]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP276:%.*]] = add <2 x i32> [[TMP192]], [[TMP220]]
+; CHECK-NEXT:    [[TMP277:%.*]] = xor <2 x i32> [[TMP276]], [[TMP148]]
+; CHECK-NEXT:    [[TMP278:%.*]] = extractelement <2 x i32> [[TMP275]], i32 0
+; CHECK-NEXT:    [[TMP279:%.*]] = extractelement <2 x i32> [[TMP275]], i32 1
+; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[TMP278]], [[TMP279]]
 ; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT:    [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
-; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
-; CHECK-NEXT:    [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
+; CHECK-NEXT:    [[TMP280:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[TMP280]], [[ADD113_1]]
+; CHECK-NEXT:    [[TMP281:%.*]] = extractelement <2 x i32> [[TMP277]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP281]]
+; CHECK-NEXT:    [[TMP282:%.*]] = extractelement <2 x i32> [[TMP277]], i32 1
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP282]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT:    [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT:    [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT:    [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
-; CHECK-NEXT:    [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
-; CHECK-NEXT:    [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
-; CHECK-NEXT:    [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
-; CHECK-NEXT:    [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP287:%.*]] = and <2 x i32> [[TMP286]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP288:%.*]] = mul <2 x i32> [[TMP287]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
-; CHECK-NEXT:    [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
-; CHECK-NEXT:    [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
-; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
-; CHECK-NEXT:    [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
-; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
-; CHECK-NEXT:    [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
-; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
+; CHECK-NEXT:    [[TMP283:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP284:%.*]] = shufflevector <2 x i32> [[TMP283]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP285:%.*]] = add <2 x i32> [[TMP284]], [[TMP194]]
+; CHECK-NEXT:    [[TMP286:%.*]] = sub <2 x i32> [[TMP284]], [[TMP194]]
+; CHECK-NEXT:    [[TMP287:%.*]] = shufflevector <2 x i32> [[TMP285]], <2 x i32> [[TMP286]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP288:%.*]] = add <2 x i32> [[TMP110]], [[TMP287]]
+; CHECK-NEXT:    [[TMP289:%.*]] = sub <2 x i32> [[TMP287]], [[TMP110]]
+; CHECK-NEXT:    [[TMP290:%.*]] = add <2 x i32> [[TMP144]], [[TMP288]]
+; CHECK-NEXT:    [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP113]]
+; CHECK-NEXT:    [[TMP292:%.*]] = lshr <2 x i32> [[TMP103]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP293:%.*]] = and <2 x i32> [[TMP292]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP294:%.*]] = mul <2 x i32> [[TMP293]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP295:%.*]] = add <2 x i32> [[TMP294]], [[TMP289]]
+; CHECK-NEXT:    [[TMP296:%.*]] = xor <2 x i32> [[TMP295]], [[TMP103]]
+; CHECK-NEXT:    [[TMP297:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[TMP297]], [[ADD113_2]]
+; CHECK-NEXT:    [[TMP298:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP298]]
+; CHECK-NEXT:    [[TMP299:%.*]] = extractelement <2 x i32> [[TMP296]], i32 0
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP299]]
+; CHECK-NEXT:    [[TMP300:%.*]] = extractelement <2 x i32> [[TMP296]], i32 1
+; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP300]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
index 63d13452bc96d1..6825f43b5a9eb4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -8,24 +8,27 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 15
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 7
+; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 6
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4
+; CHECK-NEXT:    [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12
+; CHECK-NEXT:    [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX6]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX13]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX20]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX27]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX34]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[ARRAYIDX41]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x ptr> [[TMP8]], ptr [[ARRAYIDX48]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP1]]
+; CHECK-NEXT:    [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]]
 ; CHECK-NEXT:    store <8 x i32> [[TMP11]], ptr [[P]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -101,13 +104,13 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add
 ; CHECK-NEXT:    [[P2:%.*]] = alloca [16 x i32], align 16
 ; CHECK-NEXT:    [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
 ; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[G10]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[G20]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
-; CHECK-NEXT:    store <8 x i32> [[SHUFFLE]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
+; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -158,18 +161,18 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr {
 ; CHECK-NEXT:    [[G12:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 6
 ; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P3]], i32 0, i64 12
 ; CHECK-NEXT:    [[G22:%.*]] = getelementptr inbounds [16 x i32], ptr [[P4]], i32 0, i64 14
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[G10]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[G12]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[G20]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[G22]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[G10]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    store <8 x i32> [[TMP14]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:



More information about the llvm-commits mailing list