[llvm] d9cbf39 - [SLP] Pass VecPred argument to getCmpSelInstrCost.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 3 02:17:11 PST 2020


Author: Florian Hahn
Date: 2020-11-03T10:16:43Z
New Revision: d9cbf39a377b4e2b1d41013d5f1b0c3e2c874640

URL: https://github.com/llvm/llvm-project/commit/d9cbf39a377b4e2b1d41013d5f1b0c3e2c874640
DIFF: https://github.com/llvm/llvm-project/commit/d9cbf39a377b4e2b1d41013d5f1b0c3e2c874640.diff

LOG: [SLP] Pass VecPred argument to getCmpSelInstrCost.

Check if all compares in VL have the same predicate and pass it to
getCmpSelInstrCost, to improve cost-modeling on targets that only
support compare/select combinations for certain uniform predicates.

This leads to additional vectorization in some cases.

```
Same hash: 217 (filtered out)
Remaining: 19
Metric: SLP.NumVectorInstructions

Program                                        base    slp2    diff
 test-suite...marks/SciMark2-C/scimark2.test    11.00   26.00  136.4%
 test-suite...T2006/445.gobmk/445.gobmk.test    79.00  135.00  70.9%
 test-suite...ediabench/gsm/toast/toast.test    54.00   71.00  31.5%
 test-suite...telecomm-gsm/telecomm-gsm.test    54.00   71.00  31.5%
 test-suite...CI_Purple/SMG2000/smg2000.test   426.00  542.00  27.2%
 test-suite...ch/g721/g721encode/encode.test    30.00   24.00  -20.0%
 test-suite...000/186.crafty/186.crafty.test   116.00  138.00  19.0%
 test-suite...ications/JM/ldecod/ldecod.test   697.00  765.00   9.8%
 test-suite...6/464.h264ref/464.h264ref.test   822.00  886.00   7.8%
 test-suite...chmarks/MallocBench/gs/gs.test   154.00  162.00   5.2%
 test-suite...nsumer-lame/consumer-lame.test   621.00  651.00   4.8%
 test-suite...lications/ClamAV/clamscan.test   223.00  231.00   3.6%
 test-suite...marks/7zip/7zip-benchmark.test   680.00  695.00   2.2%
 test-suite...CFP2000/177.mesa/177.mesa.test   2121.00 2129.00  0.4%
 test-suite...:: External/Povray/povray.test   2406.00 2412.00  0.2%
 test-suite...TimberWolfMC/timberwolfmc.test   634.00  634.00   0.0%
 test-suite...CFP2006/433.milc/433.milc.test   1036.00 1036.00  0.0%
 test-suite.../Benchmarks/nbench/nbench.test   321.00  321.00   0.0%
 test-suite...ctions-flt/Reductions-flt.test    NaN      5.00   nan%
```

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D90124

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 113ecd058ab5..09401996ef63 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3547,9 +3547,26 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       }
       auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
       int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecCost =
-          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
-                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
+
+      // Check if all entries in VL are either compares or selects with compares
+      // as condition that have the same predicates.
+      CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
+      bool First = true;
+      for (auto *V : VL) {
+        CmpInst::Predicate CurrentPred;
+        auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+        if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
+             !match(V, MatchCmp)) ||
+            (!First && VecPred != CurrentPred)) {
+          VecPred = CmpInst::BAD_ICMP_PREDICATE;
+          break;
+        }
+        First = false;
+        VecPred = CurrentPred;
+      }
+
+      int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
+                                            VecPred, CostKind, VL0);
       // Check if it is possible and profitable to use min/max for selects in
       // VL.
       //

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 1f801834add0..2666a9f3bd6d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -15,7 +15,7 @@ target triple = "aarch64--linux"
 ; YAML-NEXT: Function:        test_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-8'
+; YAML-NEXT:   - Cost:            '-20'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '8'
 
@@ -244,7 +244,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT: Function:        test_unrolled_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-31'
+; YAML-NEXT:   - Cost:            '-37'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '10'
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
index 17be1f760509..42dc58a98a5f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
@@ -165,19 +165,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of umin.
+; There is no <2 x i64> version of umin, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_umin_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_umin_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ult i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ult i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -305,19 +304,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of umin.
+; There is no <2 x i64> version of umin, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_umin_ule_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_umin_ule_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ule i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ule i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -444,19 +442,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of smin.
+; There is no <2 x i64> version of smin, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_smin_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_smin_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp slt i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp slt i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -583,19 +580,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of smin.
+; There is no <2 x i64> version of smin, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_smin_sle_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_smin_sle_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp sle i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp sle i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -721,19 +717,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of umax.
+; There is no <2 x i64> version of umax, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_umax_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_umax_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ugt i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ugt i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -860,19 +855,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of umax.
+; There is no <2 x i64> version of umax, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_umax_uge_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_umax_uge_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp uge i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp uge i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -999,19 +993,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of smax.
+; There is no <2 x i64> version of smax, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_smax_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_smax_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp sgt i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp sgt i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1139,19 +1132,18 @@ entry:
   ret void
 }
 
-; There is no <2 x i64> version of smax.
+; There is no <2 x i64> version of smax, but we can efficiently lower
+; compare/select pairs with uniform predicates.
 define void @select_smax_sge_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_smax_sge_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp sge i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 16383
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 4
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp sge i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 16383
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 4
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index 1b916f62d877..984163c7ab35 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -193,45 +193,27 @@ entry:
 define void @select_uniform_ugt_8xi8(i8* %ptr, i8 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_8xi8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i8, i8* [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ugt i8 [[L_0]], -1
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i8 [[L_0]], i8 [[X:%.*]]
-; CHECK-NEXT:    store i8 [[S_0]], i8* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i8, i8* [[GEP_1]], align 1
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ugt i8 [[L_1]], -1
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i8 [[L_1]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_1]], i8* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i8 1
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i8, i8* [[GEP_2]], align 1
-; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ugt i8 [[L_2]], -1
-; CHECK-NEXT:    [[S_2:%.*]] = select i1 [[CMP_2]], i8 [[L_2]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_2]], i8* [[GEP_2]], align 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i8, i8* [[GEP_3]], align 1
-; CHECK-NEXT:    [[CMP_3:%.*]] = icmp ugt i8 [[L_3]], -1
-; CHECK-NEXT:    [[S_3:%.*]] = select i1 [[CMP_3]], i8 [[L_3]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_3]], i8* [[GEP_3]], align 2
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 4
-; CHECK-NEXT:    [[L_4:%.*]] = load i8, i8* [[GEP_4]], align 1
-; CHECK-NEXT:    [[CMP_4:%.*]] = icmp ugt i8 [[L_4]], -1
-; CHECK-NEXT:    [[S_4:%.*]] = select i1 [[CMP_4]], i8 [[L_4]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_4]], i8* [[GEP_4]], align 2
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 5
-; CHECK-NEXT:    [[L_5:%.*]] = load i8, i8* [[GEP_5]], align 1
-; CHECK-NEXT:    [[CMP_5:%.*]] = icmp ugt i8 [[L_5]], -1
-; CHECK-NEXT:    [[S_5:%.*]] = select i1 [[CMP_5]], i8 [[L_5]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_5]], i8* [[GEP_5]], align 2
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 6
-; CHECK-NEXT:    [[L_6:%.*]] = load i8, i8* [[GEP_6]], align 1
-; CHECK-NEXT:    [[CMP_6:%.*]] = icmp ugt i8 [[L_6]], -1
-; CHECK-NEXT:    [[S_6:%.*]] = select i1 [[CMP_6]], i8 [[L_6]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_6]], i8* [[GEP_6]], align 2
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 7
-; CHECK-NEXT:    [[L_7:%.*]] = load i8, i8* [[GEP_7]], align 1
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp ugt i8 [[L_7]], -1
-; CHECK-NEXT:    [[S_7:%.*]] = select i1 [[CMP_7]], i8 [[L_7]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_7]], i8* [[GEP_7]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> undef, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[X]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[X]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[X]], i32 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[X]], i32 5
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[X]], i32 6
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[X]], i32 7
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP2]], <8 x i8> [[TMP1]], <8 x i8> [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[PTR]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -287,50 +269,34 @@ entry:
 define void @select_uniform_ugt_16xi8(i8* %ptr, i8 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_16xi8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i8, i8* [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ugt i8 [[L_0]], -1
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i8 [[L_0]], i8 [[X:%.*]]
-; CHECK-NEXT:    store i8 [[S_0]], i8* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i8, i8* [[GEP_1]], align 1
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ugt i8 [[L_1]], -1
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i8 [[L_1]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_1]], i8* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i8 1
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i8, i8* [[GEP_2]], align 1
-; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ugt i8 [[L_2]], -1
-; CHECK-NEXT:    [[S_2:%.*]] = select i1 [[CMP_2]], i8 [[L_2]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_2]], i8* [[GEP_2]], align 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i8, i8* [[GEP_3]], align 1
-; CHECK-NEXT:    [[CMP_3:%.*]] = icmp ugt i8 [[L_3]], -1
-; CHECK-NEXT:    [[S_3:%.*]] = select i1 [[CMP_3]], i8 [[L_3]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_3]], i8* [[GEP_3]], align 2
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 4
-; CHECK-NEXT:    [[L_4:%.*]] = load i8, i8* [[GEP_4]], align 1
-; CHECK-NEXT:    [[CMP_4:%.*]] = icmp ugt i8 [[L_4]], -1
-; CHECK-NEXT:    [[S_4:%.*]] = select i1 [[CMP_4]], i8 [[L_4]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_4]], i8* [[GEP_4]], align 2
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 5
-; CHECK-NEXT:    [[L_5:%.*]] = load i8, i8* [[GEP_5]], align 1
-; CHECK-NEXT:    [[CMP_5:%.*]] = icmp ugt i8 [[L_5]], -1
-; CHECK-NEXT:    [[S_5:%.*]] = select i1 [[CMP_5]], i8 [[L_5]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_5]], i8* [[GEP_5]], align 2
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 6
-; CHECK-NEXT:    [[L_6:%.*]] = load i8, i8* [[GEP_6]], align 1
-; CHECK-NEXT:    [[CMP_6:%.*]] = icmp ugt i8 [[L_6]], -1
-; CHECK-NEXT:    [[S_6:%.*]] = select i1 [[CMP_6]], i8 [[L_6]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_6]], i8* [[GEP_6]], align 2
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 7
-; CHECK-NEXT:    [[L_7:%.*]] = load i8, i8* [[GEP_7]], align 1
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp ugt i8 [[L_7]], -1
-; CHECK-NEXT:    [[S_7:%.*]] = select i1 [[CMP_7]], i8 [[L_7]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_7]], i8* [[GEP_7]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> undef, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[X]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[X]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[X]], i32 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[X]], i32 5
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[X]], i32 6
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[X]], i32 7
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP2]], <8 x i8> [[TMP1]], <8 x i8> [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[PTR]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 8
 ; CHECK-NEXT:    [[L_8:%.*]] = load i8, i8* [[GEP_8]], align 1
 ; CHECK-NEXT:    [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1
-; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[L_0]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_0]], i8* [[GEP_8]], align 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP13]], i8 [[X]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i8> [[TMP11]], i32 0
+; CHECK-NEXT:    store i8 [[TMP14]], i8* [[GEP_8]], align 2
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i8 9
 ; CHECK-NEXT:    [[L_9:%.*]] = load i8, i8* [[GEP_9]], align 1
 ; CHECK-NEXT:    [[CMP_9:%.*]] = icmp ugt i8 [[L_9]], -1
@@ -471,25 +437,19 @@ entry:
 define void @select_uniform_ugt_4xi16(i16* %ptr, i16 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_4xi16(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i16, i16* [[PTR:%.*]], align 2
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ugt i16 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i16 [[L_0]], i16 [[X:%.*]]
-; CHECK-NEXT:    store i16 [[S_0]], i16* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i16, i16* [[GEP_1]], align 2
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ugt i16 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i16 [[L_1]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_1]], i16* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i16 1
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i16, i16* [[GEP_2]], align 2
-; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ugt i16 [[L_2]], 16383
-; CHECK-NEXT:    [[S_2:%.*]] = select i1 [[CMP_2]], i16 [[L_2]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_2]], i16* [[GEP_2]], align 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i16, i16* [[GEP_3]], align 2
-; CHECK-NEXT:    [[CMP_3:%.*]] = icmp ugt i16 [[L_3]], 16383
-; CHECK-NEXT:    [[S_3:%.*]] = select i1 [[CMP_3]], i16 [[L_3]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_3]], i16* [[GEP_3]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[PTR]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <4 x i16> [[TMP1]], <i16 16383, i16 16383, i16 16383, i16 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[X]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[X]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP1]], <4 x i16> [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16* [[PTR]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[TMP7]], <4 x i16>* [[TMP8]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -522,45 +482,27 @@ entry:
 define void @select_uniform_ult_8xi16(i16* %ptr, i16 %x) {
 ; CHECK-LABEL: @select_uniform_ult_8xi16(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i16, i16* [[PTR:%.*]], align 2
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ult i16 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i16 [[L_0]], i16 [[X:%.*]]
-; CHECK-NEXT:    store i16 [[S_0]], i16* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i16, i16* [[GEP_1]], align 2
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ult i16 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i16 [[L_1]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_1]], i16* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i16 1
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i16, i16* [[GEP_2]], align 2
-; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ult i16 [[L_2]], 16383
-; CHECK-NEXT:    [[S_2:%.*]] = select i1 [[CMP_2]], i16 [[L_2]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_2]], i16* [[GEP_2]], align 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i16, i16* [[GEP_3]], align 2
-; CHECK-NEXT:    [[CMP_3:%.*]] = icmp ult i16 [[L_3]], 16383
-; CHECK-NEXT:    [[S_3:%.*]] = select i1 [[CMP_3]], i16 [[L_3]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_3]], i16* [[GEP_3]], align 2
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 4
-; CHECK-NEXT:    [[L_4:%.*]] = load i16, i16* [[GEP_4]], align 2
-; CHECK-NEXT:    [[CMP_4:%.*]] = icmp ult i16 [[L_4]], 16383
-; CHECK-NEXT:    [[S_4:%.*]] = select i1 [[CMP_4]], i16 [[L_4]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_4]], i16* [[GEP_4]], align 2
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 5
-; CHECK-NEXT:    [[L_5:%.*]] = load i16, i16* [[GEP_5]], align 2
-; CHECK-NEXT:    [[CMP_5:%.*]] = icmp ult i16 [[L_5]], 16383
-; CHECK-NEXT:    [[S_5:%.*]] = select i1 [[CMP_5]], i16 [[L_5]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_5]], i16* [[GEP_5]], align 2
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 6
-; CHECK-NEXT:    [[L_6:%.*]] = load i16, i16* [[GEP_6]], align 2
-; CHECK-NEXT:    [[CMP_6:%.*]] = icmp ult i16 [[L_6]], 16383
-; CHECK-NEXT:    [[S_6:%.*]] = select i1 [[CMP_6]], i16 [[L_6]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_6]], i16* [[GEP_6]], align 2
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i16 7
-; CHECK-NEXT:    [[L_7:%.*]] = load i16, i16* [[GEP_7]], align 2
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp ult i16 [[L_7]], 16383
-; CHECK-NEXT:    [[S_7:%.*]] = select i1 [[CMP_7]], i16 [[L_7]], i16 [[X]]
-; CHECK-NEXT:    store i16 [[S_7]], i16* [[GEP_7]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[PTR]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <8 x i16> [[TMP1]], <i16 16383, i16 16383, i16 16383, i16 16383, i16 16383, i16 16383, i16 16383, i16 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[X]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[X]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[X]], i32 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[X]], i32 5
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i16> [[TMP8]], i16 [[X]], i32 6
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i16> [[TMP9]], i16 [[X]], i32 7
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16* [[PTR]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* [[TMP12]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -616,15 +558,15 @@ entry:
 define void @select_uniform_eq_2xi32(i32* %ptr, i32 %x) {
 ; CHECK-LABEL: @select_uniform_eq_2xi32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp eq i32 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 [[X:%.*]]
-; CHECK-NEXT:    store i32 [[S_0]], i32* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i32 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 [[X]]
-; CHECK-NEXT:    store i32 [[S_1]], i32* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], <i32 16383, i32 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[PTR]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -645,25 +587,19 @@ entry:
 define void @select_uniform_eq_4xi32(i32* %ptr, i32 %x) {
 ; CHECK-LABEL: @select_uniform_eq_4xi32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp eq i32 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 [[X:%.*]]
-; CHECK-NEXT:    store i32 [[S_0]], i32* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i32 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 [[X]]
-; CHECK-NEXT:    store i32 [[S_1]], i32* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 1
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i32, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[CMP_2:%.*]] = icmp eq i32 [[L_2]], 16383
-; CHECK-NEXT:    [[S_2:%.*]] = select i1 [[CMP_2]], i32 [[L_2]], i32 [[X]]
-; CHECK-NEXT:    store i32 [[S_2]], i32* [[GEP_2]], align 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i32, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[CMP_3:%.*]] = icmp eq i32 [[L_3]], 16383
-; CHECK-NEXT:    [[S_3:%.*]] = select i1 [[CMP_3]], i32 [[L_3]], i32 [[X]]
-; CHECK-NEXT:    store i32 [[S_3]], i32* [[GEP_3]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], <i32 16383, i32 16383, i32 16383, i32 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[X]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -695,15 +631,15 @@ entry:
 define void @select_uniform_ne_2xi64(i64* %ptr, i64 %x) {
 ; CHECK-LABEL: @select_uniform_ne_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_0:%.*]] = load i64, i64* [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ne i64 [[L_0]], 16383
-; CHECK-NEXT:    [[S_0:%.*]] = select i1 [[CMP_0]], i64 [[L_0]], i64 [[X:%.*]]
-; CHECK-NEXT:    store i64 [[S_0]], i64* [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, i64* [[GEP_1]], align 8
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ne i64 [[L_1]], 16383
-; CHECK-NEXT:    [[S_1:%.*]] = select i1 [[CMP_1]], i64 [[L_1]], i64 [[X]]
-; CHECK-NEXT:    store i64 [[S_1]], i64* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i64, i64* [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], <i64 16383, i64 16383>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[X]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP1]], <2 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[PTR]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:


        


More information about the llvm-commits mailing list