[llvm] [SLP]Try to vectorize small graph with extractelements, used in (PR #83468)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 29 11:40:24 PST 2024
https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/83468
[SLP] Try to vectorize small graph with extractelements, used in buildvector.
If the graph includes only a single "gather" node with only
extractelements/undefs, which are used only in insertelement-based
buildvector sequences, it still might be profitable to vectorize it.
We need to rely on the cost model rather than throw this graph away immediately.
>From 124d775ffe803a35916ea4ec2a8421f0ccd123fa Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 29 Feb 2024 19:40:15 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 14 +
.../SLPVectorizer/RISCV/complex-loads.ll | 655 +++++++++---------
.../Transforms/SLPVectorizer/X86/PR39774.ll | 18 +-
...nsert-element-build-vector-inseltpoison.ll | 2 +
.../X86/insert-element-build-vector.ll | 2 +
.../SLPVectorizer/X86/reduction-transpose.ll | 115 +--
6 files changed, 418 insertions(+), 388 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 94b7c4952f055e..6937cae0d3e13b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8974,6 +8974,20 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
if (isFullyVectorizableTinyTree(ForReduction))
return false;
+ // Check if any of the gather nodes forms an insertelement buildvector
+ // somewhere.
+ if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ all_of(TE->Scalars, [](Value *V) {
+ return isa<ExtractElementInst, UndefValue>(V) ||
+ (!V->hasNUsesOrMore(8) &&
+ any_of(V->users(), [](User *U) {
+ return isa<InsertElementInst>(U);
+ }));
+ });
+ }))
+ return false;
+
assert(VectorizableTree.empty()
? ExternalUses.empty()
: true && "We shouldn't have any external users");
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index e167b6a47af592..ec431a56a8609f 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,369 +1,360 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP3]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr i8, ptr [[PIX1]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 5, i64 7>
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
+; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32
; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
-; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
+; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32
; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
-; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP12]] to i32
; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
-; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR_1]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x ptr> [[TMP11]], ptr [[ARRAYIDX20_2]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP12]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP13]] to <2 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = sub <2 x i32> [[TMP14]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = sub <2 x i32> [[TMP19]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = shl <2 x i32> [[TMP22]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP24:%.*]] = add <2 x i32> [[TMP23]], [[TMP17]]
+; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR64_1]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> [[TMP15]], ptr [[ARRAYIDX22_2]], i32 1
-; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP16]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP14]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x ptr> [[TMP12]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP21]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x ptr> [[TMP16]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP25]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP23]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP19]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP31]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT: [[TMP25:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = sub <2 x i32> [[TMP26]], [[TMP28]]
+; CHECK-NEXT: [[TMP30:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
+; CHECK-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP35:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP34]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP36:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
-; CHECK-NEXT: [[TMP37:%.*]] = sub <2 x i32> [[TMP33]], [[TMP36]]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP39:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP38]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32>
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP42:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP41]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP43:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
-; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP40]], [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = shl <2 x i32> [[TMP44]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP46:%.*]] = add <2 x i32> [[TMP45]], [[TMP37]]
-; CHECK-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP30]], [[TMP46]]
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
-; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP34:%.*]] = sub <2 x i32> [[TMP31]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = shl <2 x i32> [[TMP34]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], [[TMP29]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i32> [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i32> [[TMP24]], i32 1
+; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1
+; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
+; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
+; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
-; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
-; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
+; CHECK-NEXT: [[TMP44:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP43]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP45:%.*]] = zext <2 x i8> [[TMP44]] to <2 x i32>
+; CHECK-NEXT: [[TMP46:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
+; CHECK-NEXT: [[TMP47:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP46]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; CHECK-NEXT: [[TMP49:%.*]] = sub <2 x i32> [[TMP45]], [[TMP48]]
+; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; CHECK-NEXT: [[TMP52:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
+; CHECK-NEXT: [[TMP54:%.*]] = sub <2 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT: [[TMP55:%.*]] = shl <2 x i32> [[TMP54]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP56:%.*]] = add <2 x i32> [[TMP55]], [[TMP49]]
+; CHECK-NEXT: [[TMP57:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP57]] to <2 x i32>
+; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]]
-; CHECK-NEXT: [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; CHECK-NEXT: [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]]
-; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0
-; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1
-; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
-; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
-; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT: [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]]
-; CHECK-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]]
-; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]]
-; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]]
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]]
-; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
-; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
-; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0
-; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1
-; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]]
-; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]]
-; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15
+; CHECK-NEXT: [[TMP61:%.*]] = sub <2 x i32> [[TMP58]], [[TMP60]]
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <2 x i8> poison, i8 [[TMP42]], i32 0
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <2 x i8> [[TMP62]], i8 [[TMP41]], i32 1
+; CHECK-NEXT: [[TMP64:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT: [[TMP67:%.*]] = sub <2 x i32> [[TMP64]], [[TMP66]]
+; CHECK-NEXT: [[TMP68:%.*]] = shl <2 x i32> [[TMP67]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP69:%.*]] = add <2 x i32> [[TMP68]], [[TMP61]]
+; CHECK-NEXT: [[TMP70:%.*]] = add <2 x i32> [[TMP69]], [[TMP56]]
+; CHECK-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP56]], [[TMP69]]
+; CHECK-NEXT: [[TMP72:%.*]] = extractelement <2 x i32> [[TMP70]], i32 0
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP70]], i32 1
+; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP72]], [[TMP73]]
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP45]], i32 1
+; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP74]], 15
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
-; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15
+; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0
-; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1
-; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]]
-; CHECK-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1
-; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
-; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0
-; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0
-; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
-; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
-; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
-; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]]
-; CHECK-NEXT: [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
-; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
-; CHECK-NEXT: [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32>
-; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]]
-; CHECK-NEXT: [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]]
-; CHECK-NEXT: [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]]
-; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]]
-; CHECK-NEXT: [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]]
-; CHECK-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1
-; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1
-; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]]
-; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]]
-; CHECK-NEXT: [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0
-; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0
-; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]]
-; CHECK-NEXT: [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP137:%.*]] = and <2 x i32> [[TMP136]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0
-; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1
-; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]]
-; CHECK-NEXT: [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT: [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32>
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP75]], 15
+; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
+; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
+; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
+; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
+; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT: [[TMP76:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP76]] to <2 x i32>
+; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
+; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP78]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
+; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP3]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
+; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32>
+; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]]
+; CHECK-NEXT: [[TMP86:%.*]] = shl <2 x i32> [[TMP85]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP87:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP88:%.*]] = zext <2 x i8> [[TMP87]] to <2 x i32>
+; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
+; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP9]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP92:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP90]], [[TMP92]]
+; CHECK-NEXT: [[TMP94:%.*]] = shl <2 x i32> [[TMP93]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP95:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[CONV33]], i32 1
+; CHECK-NEXT: [[TMP96:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]]
+; CHECK-NEXT: [[TMP97:%.*]] = add <2 x i32> [[TMP94]], [[TMP96]]
+; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[CONV]], i32 0
+; CHECK-NEXT: [[TMP99:%.*]] = sub <2 x i32> [[TMP98]], [[TMP80]]
+; CHECK-NEXT: [[TMP100:%.*]] = add <2 x i32> [[TMP86]], [[TMP99]]
+; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP97]], <2 x i32> [[TMP100]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP97]], [[TMP100]]
+; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP97]]
+; CHECK-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP102]], i32 0
+; CHECK-NEXT: [[TMP105:%.*]] = extractelement <2 x i32> [[TMP102]], i32 1
+; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP105]], [[TMP104]]
+; CHECK-NEXT: [[TMP106:%.*]] = extractelement <2 x i32> [[TMP103]], i32 1
+; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP105]], 15
+; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP106]], 15
+; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
+; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT: [[TMP107:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32>
; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT: [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1
-; CHECK-NEXT: [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
-; CHECK-NEXT: [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32>
-; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
-; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]]
-; CHECK-NEXT: [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32>
-; CHECK-NEXT: [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 x i32>
-; CHECK-NEXT: [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
-; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
-; CHECK-NEXT: [[TMP168:%.*]] = shl <2 x i32> [[TMP167]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
-; CHECK-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
-; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
-; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
-; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
-; CHECK-NEXT: [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
-; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
-; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
-; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
-; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
-; CHECK-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
-; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
-; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
+; CHECK-NEXT: [[TMP110:%.*]] = insertelement <2 x ptr> [[TMP109]], ptr [[ARRAYIDX22_1]], i32 1
+; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP110]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
+; CHECK-NEXT: [[TMP113:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
+; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <2 x ptr> [[TMP113]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP115:%.*]] = getelementptr i8, <2 x ptr> [[TMP114]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP115]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <2 x ptr> [[TMP110]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP119:%.*]] = getelementptr i8, <2 x ptr> [[TMP118]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP119]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
+; CHECK-NEXT: [[TMP122:%.*]] = sub <2 x i32> [[TMP117]], [[TMP121]]
+; CHECK-NEXT: [[TMP123:%.*]] = shl <2 x i32> [[TMP122]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP124:%.*]] = getelementptr i8, <2 x ptr> [[TMP118]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT: [[TMP125:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP124]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32>
+; CHECK-NEXT: [[TMP127:%.*]] = getelementptr i8, <2 x ptr> [[TMP114]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP127]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP129:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
+; CHECK-NEXT: [[TMP130:%.*]] = getelementptr i8, <2 x ptr> [[TMP118]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP130]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
+; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP129]], [[TMP132]]
+; CHECK-NEXT: [[TMP134:%.*]] = shl <2 x i32> [[TMP133]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP135:%.*]] = insertelement <2 x i32> [[TMP108]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT: [[TMP136:%.*]] = sub <2 x i32> [[TMP135]], [[TMP126]]
+; CHECK-NEXT: [[TMP137:%.*]] = add <2 x i32> [[TMP134]], [[TMP136]]
+; CHECK-NEXT: [[TMP138:%.*]] = insertelement <2 x i32> [[TMP108]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP139:%.*]] = sub <2 x i32> [[TMP138]], [[TMP112]]
+; CHECK-NEXT: [[TMP140:%.*]] = add <2 x i32> [[TMP123]], [[TMP139]]
+; CHECK-NEXT: [[TMP141:%.*]] = add <2 x i32> [[TMP137]], [[TMP140]]
+; CHECK-NEXT: [[TMP142:%.*]] = sub <2 x i32> [[TMP140]], [[TMP137]]
+; CHECK-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP144]], [[TMP143]]
+; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP144]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT: [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP185:%.*]] = and <2 x i32> [[TMP184]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP185]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
-; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
-; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP192]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP194:%.*]] = mul <2 x i32> [[TMP193]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> <i32 15, i32 poison>, i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT: [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
-; CHECK-NEXT: [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
-; CHECK-NEXT: [[TMP206:%.*]] = insertelement <2 x i32> <i32 65537, i32 poison>, i32 [[SUB51_1]], i32 1
-; CHECK-NEXT: [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
-; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
-; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP217:%.*]] = and <2 x i32> [[TMP216]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP218:%.*]] = mul <2 x i32> [[TMP217]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
-; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT: [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
-; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
-; CHECK-NEXT: [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
-; CHECK-NEXT: [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP234:%.*]] = and <2 x i32> [[TMP233]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP235:%.*]] = mul <2 x i32> [[TMP234]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
-; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
-; CHECK-NEXT: [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
+; CHECK-NEXT: [[TMP145:%.*]] = lshr <2 x i32> [[TMP108]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP146:%.*]] = and <2 x i32> [[TMP145]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP147:%.*]] = mul <2 x i32> [[TMP146]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT: [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
-; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
+; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
+; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
+; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP74]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
+; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP144]]
+; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP105]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
-; CHECK-NEXT: [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT: [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
-; CHECK-NEXT: [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
-; CHECK-NEXT: [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
-; CHECK-NEXT: [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
-; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
-; CHECK-NEXT: [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT: [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT: [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
-; CHECK-NEXT: [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
-; CHECK-NEXT: [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
-; CHECK-NEXT: [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
-; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
-; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
-; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
-; CHECK-NEXT: [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT: [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
-; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
-; CHECK-NEXT: [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
-; CHECK-NEXT: [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
-; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
-; CHECK-NEXT: [[TMP266:%.*]] = shufflevector <2 x i32> <i32 65535, i32 poison>, <2 x i32> [[TMP222]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT: [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
-; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
-; CHECK-NEXT: [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
-; CHECK-NEXT: [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
-; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
-; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
-; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
-; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
-; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
+; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP149:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[SUB47_2]], i32 1
+; CHECK-NEXT: [[TMP150:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[SUB45_2]], i32 1
+; CHECK-NEXT: [[TMP151:%.*]] = add <2 x i32> [[TMP149]], [[TMP150]]
+; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x i32> [[TMP142]], <2 x i32> [[TMP71]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP142]], <2 x i32> [[TMP71]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP154:%.*]] = add <2 x i32> [[TMP152]], [[TMP153]]
+; CHECK-NEXT: [[TMP155:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1
+; CHECK-NEXT: [[TMP156:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
+; CHECK-NEXT: [[TMP157:%.*]] = shufflevector <2 x i32> [[TMP154]], <2 x i32> [[TMP151]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP156]], [[TMP155]]
+; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0
+; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
+; CHECK-NEXT: [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP154]], <2 x i32> [[TMP151]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP159]], [[TMP158]]
+; CHECK-NEXT: [[TMP161:%.*]] = sub <2 x i32> [[TMP151]], [[TMP154]]
+; CHECK-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP161]], i32 0
+; CHECK-NEXT: [[TMP163:%.*]] = extractelement <2 x i32> [[TMP161]], i32 1
+; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP163]], [[TMP162]]
+; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP162]], [[TMP163]]
+; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP75]]
+; CHECK-NEXT: [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP142]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP165:%.*]] = lshr <2 x i32> [[TMP164]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP166:%.*]] = and <2 x i32> [[TMP165]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP167:%.*]] = mul <2 x i32> [[TMP166]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP168:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT: [[TMP169:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT: [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP170]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]]
+; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP169]], [[TMP171]]
+; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> [[TMP173]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP167]], [[TMP174]]
+; CHECK-NEXT: [[TMP176:%.*]] = xor <2 x i32> [[TMP175]], [[TMP164]]
+; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
+; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP106]]
+; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP177]]
+; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
+; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP178]]
+; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
+; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP180:%.*]] = insertelement <2 x i32> [[TMP179]], i32 [[ADD44_2]], i32 0
+; CHECK-NEXT: [[TMP181:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[ADD46_2]], i32 0
+; CHECK-NEXT: [[TMP182:%.*]] = sub <2 x i32> [[TMP180]], [[TMP181]]
+; CHECK-NEXT: [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP70]], <2 x i32> [[TMP141]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP70]], <2 x i32> [[TMP141]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP185:%.*]] = sub <2 x i32> [[TMP183]], [[TMP184]]
+; CHECK-NEXT: [[TMP186:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
+; CHECK-NEXT: [[TMP187:%.*]] = extractelement <2 x i32> [[TMP185]], i32 0
+; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP185]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP187]], [[TMP186]]
+; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP185]], i32 1
+; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP185]], <2 x i32> [[TMP182]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP190]], [[TMP189]]
+; CHECK-NEXT: [[TMP192:%.*]] = sub <2 x i32> [[TMP182]], [[TMP185]]
+; CHECK-NEXT: [[TMP193:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP193]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP197:%.*]] = add <2 x i32> [[TMP194]], [[TMP196]]
+; CHECK-NEXT: [[TMP198:%.*]] = sub <2 x i32> [[TMP194]], [[TMP196]]
+; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> [[TMP198]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP200:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
+; CHECK-NEXT: [[TMP201:%.*]] = extractelement <2 x i32> [[TMP192]], i32 1
+; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP201]], [[TMP200]]
+; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
+; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
+; CHECK-NEXT: [[TMP202:%.*]] = add <2 x i32> [[TMP147]], [[TMP199]]
+; CHECK-NEXT: [[TMP203:%.*]] = xor <2 x i32> [[TMP202]], [[TMP108]]
+; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP104]], 15
+; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
+; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
+; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
+; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP104]]
+; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
+; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP203]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP204]]
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP203]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP205]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT: [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
-; CHECK-NEXT: [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
-; CHECK-NEXT: [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
-; CHECK-NEXT: [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
-; CHECK-NEXT: [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
-; CHECK-NEXT: [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP287:%.*]] = and <2 x i32> [[TMP286]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP288:%.*]] = mul <2 x i32> [[TMP287]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
-; CHECK-NEXT: [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
-; CHECK-NEXT: [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
-; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
-; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
-; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
-; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
+; CHECK-NEXT: [[TMP206:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[SUB45_2]], i32 0
+; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[SUB47_2]], i32 0
+; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP206]], [[TMP207]]
+; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP142]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP142]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP211:%.*]] = sub <2 x i32> [[TMP209]], [[TMP210]]
+; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP208]], i32 0
+; CHECK-NEXT: [[TMP213:%.*]] = extractelement <2 x i32> [[TMP211]], i32 0
+; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> [[TMP208]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP213]], [[TMP212]]
+; CHECK-NEXT: [[TMP215:%.*]] = extractelement <2 x i32> [[TMP208]], i32 1
+; CHECK-NEXT: [[TMP216:%.*]] = extractelement <2 x i32> [[TMP211]], i32 1
+; CHECK-NEXT: [[TMP217:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> [[TMP208]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP216]], [[TMP215]]
+; CHECK-NEXT: [[TMP218:%.*]] = sub <2 x i32> [[TMP208]], [[TMP211]]
+; CHECK-NEXT: [[TMP219:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP221:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP221]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP223:%.*]] = add <2 x i32> [[TMP220]], [[TMP222]]
+; CHECK-NEXT: [[TMP224:%.*]] = sub <2 x i32> [[TMP220]], [[TMP222]]
+; CHECK-NEXT: [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> [[TMP224]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP226:%.*]] = extractelement <2 x i32> [[TMP218]], i32 0
+; CHECK-NEXT: [[TMP227:%.*]] = extractelement <2 x i32> [[TMP218]], i32 1
+; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[TMP226]], [[TMP227]]
+; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[TMP227]], [[TMP226]]
+; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
+; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
+; CHECK-NEXT: [[TMP228:%.*]] = lshr <2 x i32> [[TMP77]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP229:%.*]] = and <2 x i32> [[TMP228]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP230:%.*]] = mul <2 x i32> [[TMP229]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP231:%.*]] = add <2 x i32> [[TMP230]], [[TMP225]]
+; CHECK-NEXT: [[TMP232:%.*]] = xor <2 x i32> [[TMP231]], [[TMP77]]
+; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
+; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
+; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
+; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
+; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
+; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
+; CHECK-NEXT: [[TMP233:%.*]] = extractelement <2 x i32> [[TMP232]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP233]]
+; CHECK-NEXT: [[TMP234:%.*]] = extractelement <2 x i32> [[TMP232]], i32 1
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP234]]
+; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index abf1d7abdc1226..5c261d69cd53e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -23,16 +23,16 @@ define void @Test(i32) {
; FORCE_REDUCTION-NEXT: entry:
; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]]
; FORCE_REDUCTION: loop:
-; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP4]])
-; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP5]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]]
-; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP3]], 14910
-; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[VAL_43]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
+; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
+; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]]
+; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
+; FORCE_REDUCTION-NEXT: [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> <i32 0, i32 3>
; FORCE_REDUCTION-NEXT: br label [[LOOP]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index 10369e3aa270eb..fd9528aa8df3ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -70,8 +70,10 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
; THRESHOLD-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
; THRESHOLD-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
; THRESHOLD-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
; THRESHOLD-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
+; THRESHOLD-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; THRESHOLD-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
; THRESHOLD-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
; THRESHOLD-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 9376bcd220a2c3..18d5b09001762b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -104,8 +104,10 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
; THRESHOLD-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
; THRESHOLD-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
; THRESHOLD-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
; THRESHOLD-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
+; THRESHOLD-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
; THRESHOLD-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
; THRESHOLD-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
; THRESHOLD-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index ec90ca9bc674df..2cdbd5cff4468c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 -S | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX512
; PR51746
; typedef int v4si __attribute__ ((vector_size (16)));
@@ -18,33 +18,44 @@
define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
; SSE2-LABEL: @reduce_and4(
; SSE2-NEXT: entry:
-; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
-; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; SSE2-NEXT: ret i32 [[OP_RDX1]]
+; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
+; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[ACC:%.*]]
+; SSE2-NEXT: ret i32 [[OP_RDX]]
;
; SSE42-LABEL: @reduce_and4(
; SSE42-NEXT: entry:
-; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
-; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; SSE42-NEXT: ret i32 [[OP_RDX1]]
+; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE42-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
+; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[ACC:%.*]]
+; SSE42-NEXT: ret i32 [[OP_RDX]]
;
-; AVX-LABEL: @reduce_and4(
-; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
-; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; AVX-NEXT: ret i32 [[OP_RDX1]]
+; AVX2-LABEL: @reduce_and4(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP2]])
+; AVX2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[ACC:%.*]]
+; AVX2-NEXT: ret i32 [[OP_RDX]]
+;
+; AVX512-LABEL: @reduce_and4(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX512-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX512-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
+; AVX512-NEXT: ret i32 [[OP_RDX1]]
;
entry:
%vecext = extractelement <4 x i32> %v1, i64 0
@@ -92,31 +103,41 @@ entry:
define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
; SSE2-LABEL: @reduce_and4_transpose(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
-; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; SSE2-NEXT: ret i32 [[OP_RDX1]]
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP5]])
+; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[ACC:%.*]]
+; SSE2-NEXT: ret i32 [[OP_RDX]]
;
; SSE42-LABEL: @reduce_and4_transpose(
-; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
-; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; SSE42-NEXT: ret i32 [[OP_RDX1]]
+; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE42-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP5]])
+; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[ACC:%.*]]
+; SSE42-NEXT: ret i32 [[OP_RDX]]
+;
+; AVX2-LABEL: @reduce_and4_transpose(
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
+; AVX2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[ACC:%.*]]
+; AVX2-NEXT: ret i32 [[OP_RDX]]
;
-; AVX-LABEL: @reduce_and4_transpose(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
-; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; AVX-NEXT: ret i32 [[OP_RDX1]]
+; AVX512-LABEL: @reduce_and4_transpose(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX512-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX512-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
+; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
+; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
+; AVX512-NEXT: ret i32 [[OP_RDX1]]
;
%vecext = extractelement <4 x i32> %v1, i64 0
%vecext1 = extractelement <4 x i32> %v2, i64 0
More information about the llvm-commits mailing list