[llvm] 8aa8006 - [AArch64][CostModel] Detects that {extract,insert}-element at lane 0 has the same cost as the other lane for vector instructions in the IR.

Fri Sep 9 10:05:37 PDT 2022

Author: Mingming Liu
Date: 2022-09-09T09:47:30-07:00
New Revision: 8aa800614bf65f8db7b8293d514719e6921abe16

URL: https://github.com/llvm/llvm-project/commit/8aa800614bf65f8db7b8293d514719e6921abe16
DIFF: https://github.com/llvm/llvm-project/commit/8aa800614bf65f8db7b8293d514719e6921abe16.diff

LOG: [AArch64][CostModel] Detects that {extract,insert}-element at lane 0 has the same cost as the other lane for vector instructions in the IR.

Currently, the cost model treats {extract,insert}-element at lane 0 as free [1]. However, moving a value from a SIMD register to a general-purpose register has a real cost (an fmov instruction [2], or an ext/ins instruction) when the element is used explicitly as an integer.

See https://godbolt.org/z/faPE1nTn8, where an fmov is generated for a d* register -> x* register move.

Implementation-wise, add a private helper method `AArch64TTIImpl::getVectorInstrCostHelper`. This way, the instruction-based overload of `getVectorInstrCost` can share the core logic with the opcode-based overload (e.g.,
returning zero cost if the type is legalized to a scalar).

[1] https://github.com/llvm/llvm-project/blob/2cf320d41ed708679e01eeeb93f58d6c5c88ba7a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp#L1853
[2] https://github.com/llvm/llvm-project/blob/2cf320d41ed708679e01eeeb93f58d6c5c88ba7a/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L8150-L8157

Differential Revision: https://reviews.llvm.org/D128302

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll
    llvm/test/Analysis/CostModel/AArch64/kryo.ll
    llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
    llvm/test/Transforms/LICM/AArch64/extract-element.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b94cd9de50d17..00cb4f3784fc5 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1968,8 +1968,9 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
   return 0;
 }
 
-InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                                   unsigned Index) {
+InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
+                                                         unsigned Index,
+                                                         bool HasRealUse) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   if (Index != -1U) {
@@ -1988,7 +1989,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     }
 
     // The element at index zero is already inside the vector.
-    if (Index == 0)
+    // - For a physical (HasRealUse==true) insert-element or extract-element
+    // instruction that extracts integers, an explicit FPR -> GPR move is
+    // needed. So it has non-zero cost.
+    // - For the rest of cases (virtual instruction or element type is float),
+    // consider the instruction free.
+    //
+    // FIXME:
+    // If the extract-element and insert-element instructions could be
+    // simplified away (e.g., could be combined into users by looking at use-def
+    // context), they have no cost. This is not done in the first place for
+    // compile-time considerations.
+    if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
       return 0;
   }
 
@@ -1996,6 +2008,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return ST->getVectorInsertExtractBaseCost();
 }
 
+InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   unsigned Index) {
+  return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
+}
+
+InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
+                                                   Type *Val, unsigned Index) {
+  return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
+}
+
 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index bf917b5b9d84b..473a00d5a9f57 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -59,6 +59,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   bool isWideningInstruction(Type *Ty, unsigned Opcode,
                              ArrayRef<const Value *> Args);
 
+  // A helper function called by 'getVectorInstrCost'.
+  //
+  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
+  // indicates whether the vector instruction is available in the input IR or
+  // just imaginary in vectorizer passes.
+  InstructionCost getVectorInstrCostHelper(Type *Val, unsigned Index,
+                                           bool HasRealUse);
+
 public:
   explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -173,9 +181,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
 
-  using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
+  InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     unsigned Index);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                          bool IsUnsigned,

diff  --git a/llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll b/llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll
index c9ee6c93e8693..ca1e6655170a1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll
@@ -6,18 +6,16 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-LABEL: vectorInstrCost
 define void @vectorInstrCost() {
 
-    ; Vector extracts - extracting the first element should have a zero cost;
-    ; all other elements should have a cost of two.
+    ; Vector extracts - extracting elements should have a cost of two.
     ;
-    ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
+    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
     ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
     %t1 = extractelement <2 x i64> undef, i32 0
     %t2 = extractelement <2 x i64> undef, i32 1
 
-    ; Vector inserts - inserting the first element should have a zero cost; all
-    ; other elements should have a cost of two.
+    ; Vector inserts - inserting elements should have a cost of two.
     ;
-    ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
+    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
     ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 1
     %t3 = insertelement <2 x i64> poison, i64 undef, i32 0
     %t4 = insertelement <2 x i64> poison, i64 undef, i32 1

diff  --git a/llvm/test/Analysis/CostModel/AArch64/kryo.ll b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
index 8dca867264951..0ee72e10e9b3e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
@@ -6,45 +6,18 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-LABEL: vectorInstrCost
 define void @vectorInstrCost() {
 
-    ; Vector extracts - extracting the first element should have a zero cost;
-    ; all other elements should have a cost of two.
+    ; Vector extracts - extracting elements should have a cost of two.
     ;
-    ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
+    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
     ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
     %t1 = extractelement <2 x i64> undef, i32 0
     %t2 = extractelement <2 x i64> undef, i32 1
 
-    ; Vector inserts - inserting the first element should have a zero cost; all
-    ; other elements should have a cost of two.
+    ; Vector inserts - inserting elements should have a cost of two.
     ;
-    ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
+    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
     ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
     %t3 = insertelement <2 x i64> undef, i64 undef, i32 0
     %t4 = insertelement <2 x i64> undef, i64 undef, i32 1
-
     ret void
 }
-
-; CHECK-LABEL: vectorInstrExtractCost
-define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) {
-    
-    ; Vector extracts - extracting each element at index 0 is considered
-    ; free in the current implementation. When extracting element at index
-    ; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as 
-    ; well.
-    ;
-    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2
-    %t1 = extractelement <4 x i64> %vecreg, i32 1
-    %t2 = extractelement <4 x i64> %vecreg, i32 2
-    %ele = add i64 %t2, 1
-    %cond = icmp eq i64 %t1, %ele
-
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0
-    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3
-    %t0 = extractelement <4 x i64> %vecreg, i32 0
-    %t3 = extractelement <4 x i64> %vecreg, i32 3
-    %val = select i1 %cond, i64 %t0 , i64 %t3
-
-    ret i64 %val
-}

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
index 5a3da82c8c0c1..ad79609b94660 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
@@ -9,10 +9,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @ins_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ins_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -27,10 +27,10 @@ define void @ins_el0() #0 {
 ; CHECK-LOW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-HIGH-LABEL: 'ins_el0'
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -84,10 +84,10 @@ define void @ins_el1() #0 {
 
 define void @ext_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ext_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -102,10 +102,10 @@ define void @ext_el0() #0 {
 ; CHECK-LOW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-HIGH-LABEL: 'ext_el0'
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void

diff  --git a/llvm/test/Transforms/LICM/AArch64/extract-element.ll b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
index b156b81d6708d..4d36edc75bea7 100644
--- a/llvm/test/Transforms/LICM/AArch64/extract-element.ll
+++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
@@ -18,24 +18,23 @@ define i1 @func(ptr %0, i64 %1) {
 ; CHECK-NEXT:    [[TMP12]] = add i64 [[TMP4]], 1
 ; CHECK-NEXT:    br label [[TMP3]]
 ; CHECK:       .split.loop.exit:
-; CHECK-NEXT:    [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]]
-; CHECK-NEXT:    br label [[TMP17:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]]
+; CHECK-NEXT:    br label [[TMP16:%.*]]
 ; CHECK:       .split.loop.exit2:
 ; CHECK-NEXT:    [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
 ; CHECK-NEXT:    [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ]
-; CHECK-NEXT:    br label [[TMP17]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = xor i1 [[TMP18]], true
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]]
-; CHECK-NEXT:    ret i1 [[TMP21]]
+; CHECK-NEXT:    br label [[TMP16]]
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]]
+; CHECK-NEXT:    ret i1 [[TMP20]]
 ;
   br label %3