[llvm] [InstCombine] Optimistically allow multiple shufflevector uses in foldOpPhi (PR #114278)

Mon Nov 11 11:18:49 PST 2024

https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/114278

>From 42aba4bbb4503bbe0b54eafb6cef695f7df91c6c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Tue, 29 Oct 2024 16:38:30 -0700
Subject: [PATCH 1/3] Optimistically allow multiple uses in foldOpPhi for
 shufflevector

We would like to optimize situations of the form:
```
loop:
    %phi = phi zeroinitializer, %interleaved

    %deinterleave_a = shufflevector %phi, poison ; pick half of the lanes
    %deinterleave_b = shufflevector %phi, poison ; pick remaining lanes

    ...

    %interleaved = shufflevector %a, %b ; interleave lanes of a+b
```
where the interleave and de-interleave patterns cancel each other out.
This could be handled by `foldOpPhi`, but that requires proceeding even
though the `Phi` operation has two uses.

This extends `foldOpPhi` to proceed with more than one use if all
uses are `shufflevector` instructions that are guaranteed to simplify
for all `Phi` predecessors.
---
 .../InstCombine/InstructionCombining.cpp      | 55 +++++++++++++-----
 .../InstCombine/vec_shuffle-phi-multiuse.ll   | 57 +++++++++++++++++++
 2 files changed, 97 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 2a54390c0f1882..9b2074e481f54f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1773,17 +1773,33 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
   if (NumPHIValues == 0)
     return nullptr;
 
-  // We normally only transform phis with a single use.  However, if a PHI has
-  // multiple uses and they are all the same operation, we can fold *all* of the
-  // uses into the PHI.
+  // We normally only transform phis with a single use.
+  bool AllUsesIdentical = false;
+  bool MultipleUses = false;
   if (!PN->hasOneUse()) {
-    // Walk the use list for the instruction, comparing them to I.
+    // Exceptions:
+    //   - All uses are identical.
+    //   - All uses are shufflevector instructions that fully simplify; this
+    //     helps interleave -> phi -> 2x de-interleave+de patterns.
+    if (isa<ShuffleVectorInst>(I)) {
+      MultipleUses = true;
+    }
+    AllUsesIdentical = true;
+    unsigned NumUses = 0;
     for (User *U : PN->users()) {
+      ++NumUses;
       Instruction *UI = cast<Instruction>(U);
-      if (UI != &I && !I.isIdenticalTo(UI))
+      if (UI == &I)
+        continue;
+
+      if (!I.isIdenticalTo(UI))
+        AllUsesIdentical = false;
+      // Only inspect first 4 uses to avoid quadratic complexity.
+      if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
+        MultipleUses = false;
+      if (!AllUsesIdentical && !MultipleUses)
         return nullptr;
     }
-    // Otherwise, we can replace *all* users with the new PHI we form.
   }
 
   // Check that all operands are phi-translatable.
@@ -1834,6 +1850,11 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
       continue;
     }
 
+    // Be conservative in MultipleUses case and do not allow non-simplified
+    // vals.
+    if (MultipleUses)
+      return nullptr;
+
     if (SeenNonSimplifiedInVal)
       return nullptr; // More than one non-simplified value.
     SeenNonSimplifiedInVal = true;
@@ -1895,17 +1916,21 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
   for (unsigned i = 0; i != NumPHIValues; ++i)
     NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i));
 
-  for (User *U : make_early_inc_range(PN->users())) {
-    Instruction *User = cast<Instruction>(U);
-    if (User == &I)
-      continue;
-    replaceInstUsesWith(*User, NewPN);
-    eraseInstFromFunction(*User);
+  if (AllUsesIdentical) {
+    for (User *U : make_early_inc_range(PN->users())) {
+      Instruction *User = cast<Instruction>(U);
+      if (User == &I)
+        continue;
+      replaceInstUsesWith(*User, NewPN);
+      eraseInstFromFunction(*User);
+    }
   }
 
-  replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
-                        const_cast<PHINode &>(*NewPN),
-                        const_cast<PHINode &>(*PN), DT);
+  if (!MultipleUses || AllUsesIdentical) {
+    replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
+                          const_cast<PHINode &>(*NewPN),
+                          const_cast<PHINode &>(*PN), DT);
+  }
   return replaceInstUsesWith(I, NewPN);
 }
 
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
new file mode 100644
index 00000000000000..597eae101e9f2d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+define void @f(ptr %p_begin, ptr %p_end, ptr %out) {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[ACC:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_LOWS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ODDS:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_HIGHS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[P_BEGIN]], %[[ENTRY]] ], [ [[P_INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = load <4 x i8>, ptr [[P]], align 4
+; CHECK-NEXT:    [[HIGHS:%.*]] = ashr <4 x i8> [[VAL]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[LOWS:%.*]] = and <4 x i8> [[VAL]], <i8 15, i8 15, i8 15, i8 15>
+; CHECK-NEXT:    [[HIGHS_F:%.*]] = sitofp <4 x i8> [[HIGHS]] to <4 x float>
+; CHECK-NEXT:    [[LOWS_F:%.*]] = uitofp nneg <4 x i8> [[LOWS]] to <4 x float>
+; CHECK-NEXT:    [[SUM_LOWS]] = fadd <4 x float> [[ACC]], [[LOWS_F]]
+; CHECK-NEXT:    [[SUM_HIGHS]] = fadd <4 x float> [[ODDS]], [[HIGHS_F]]
+; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds i8, ptr [[P]], i64 4
+; CHECK-NEXT:    [[C:%.*]] = icmp eq ptr [[P_INC]], [[P_END]]
+; CHECK-NEXT:    br i1 [[C]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[INTERLEAVE:%.*]] = shufflevector <4 x float> [[SUM_LOWS]], <4 x float> [[SUM_HIGHS]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x float> [[INTERLEAVE]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %interleave, %loop ]
+  %p = phi ptr [%p_begin, %entry ], [%p_inc, %loop]
+
+  %evens = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %odds = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  %val = load <4 x i8>, ptr %p, align 4
+  %highs = ashr <4 x i8> %val, <i8 4, i8 4, i8 4, i8 4>
+  %lows = and <4 x i8> %val, <i8 15, i8 15, i8 15, i8 15>
+
+  %highs_f = sitofp <4 x i8> %highs to <4 x float>
+  %lows_f = sitofp <4 x i8> %lows to <4 x float>
+
+  %sum_lows = fadd <4 x float> %evens, %lows_f
+  %sum_highs = fadd <4 x float> %odds, %highs_f
+
+  %interleave = shufflevector <4 x float> %sum_lows, <4 x float> %sum_highs, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+
+  %p_inc = getelementptr inbounds i8, ptr %p, i32 4
+  %c = icmp eq ptr %p_inc, %p_end
+  br i1 %c, label %exit, label %loop
+
+exit:
+  store <8 x float> %interleave, ptr %out, align 4
+  ret void
+}

>From 394d2df634fbb1f2d22d85308abb411d5d16c67f Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Thu, 7 Nov 2024 14:57:37 -0800
Subject: [PATCH 2/3] Address review feedback

---
 .../InstCombine/InstructionCombining.cpp       | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9b2074e481f54f..714d13d4986066 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1775,15 +1775,13 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
 
   // We normally only transform phis with a single use.
   bool AllUsesIdentical = false;
-  bool MultipleUses = false;
+  bool MultipleShuffleVectorUses = false;
   if (!PN->hasOneUse()) {
     // Exceptions:
     //   - All uses are identical.
     //   - All uses are shufflevector instructions that fully simplify; this
     //     helps interleave -> phi -> 2x de-interleave+de patterns.
-    if (isa<ShuffleVectorInst>(I)) {
-      MultipleUses = true;
-    }
+    MultipleShuffleVectorUses = isa<ShuffleVectorInst>(I);
     AllUsesIdentical = true;
     unsigned NumUses = 0;
     for (User *U : PN->users()) {
@@ -1796,8 +1794,8 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
         AllUsesIdentical = false;
       // Only inspect first 4 uses to avoid quadratic complexity.
       if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
-        MultipleUses = false;
-      if (!AllUsesIdentical && !MultipleUses)
+        MultipleShuffleVectorUses = false;
+      if (!AllUsesIdentical && !MultipleShuffleVectorUses)
         return nullptr;
     }
   }
@@ -1850,9 +1848,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
       continue;
     }
 
-    // Be conservative in MultipleUses case and do not allow non-simplified
-    // vals.
-    if (MultipleUses)
+    // Be conservative in cases with multiple uses and require all inputs to
+    // simplify.
+    if (MultipleShuffleVectorUses)
       return nullptr;
 
     if (SeenNonSimplifiedInVal)
@@ -1926,7 +1924,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
     }
   }
 
-  if (!MultipleUses || AllUsesIdentical) {
+  if (!MultipleShuffleVectorUses || AllUsesIdentical) {
     replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
                           const_cast<PHINode &>(*NewPN),
                           const_cast<PHINode &>(*PN), DT);

>From 745fac7275e3f8ccece6381f6883b42c40f90c31 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Nov 2024 09:55:03 -0800
Subject: [PATCH 3/3] Ensure that all users simplify

Also add more tests as suggested
---
 .../InstCombine/InstCombineInternal.h         |   7 +
 .../InstCombine/InstructionCombining.cpp      | 142 ++++++++++--------
 .../InstCombine/vec_shuffle-phi-multiuse.ll   | 112 +++++++++++++-
 3 files changed, 198 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7a060cdab2d37d..1562fc4441b844 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -454,6 +454,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 
   Instruction *hoistFNegAboveFMulFDiv(Value *FNegOp, Instruction &FMFSource);
 
+  /// Helper for `foldOpIntoPhi`. Tests if a given user of the phi node would
+  /// simplify when folded.
+  bool canFoldUserIntoPhi(Instruction &User, PHINode &PN,
+                          SmallVectorImpl<Value *> &NewPhiValues,
+                          SmallVectorImpl<unsigned> &OpsToMoveUseToIncomingBB,
+                          bool AllowOneNonSimplifiedValue);
+
 public:
   /// Create and insert the idiom we use to indicate a block is unreachable
   /// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 714d13d4986066..f2bc19f966a491 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1768,71 +1768,41 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN,
   return nullptr;
 }
 
-Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
-  unsigned NumPHIValues = PN->getNumIncomingValues();
-  if (NumPHIValues == 0)
-    return nullptr;
-
-  // We normally only transform phis with a single use.
-  bool AllUsesIdentical = false;
-  bool MultipleShuffleVectorUses = false;
-  if (!PN->hasOneUse()) {
-    // Exceptions:
-    //   - All uses are identical.
-    //   - All uses are shufflevector instructions that fully simplify; this
-    //     helps interleave -> phi -> 2x de-interleave+de patterns.
-    MultipleShuffleVectorUses = isa<ShuffleVectorInst>(I);
-    AllUsesIdentical = true;
-    unsigned NumUses = 0;
-    for (User *U : PN->users()) {
-      ++NumUses;
-      Instruction *UI = cast<Instruction>(U);
-      if (UI == &I)
-        continue;
-
-      if (!I.isIdenticalTo(UI))
-        AllUsesIdentical = false;
-      // Only inspect first 4 uses to avoid quadratic complexity.
-      if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
-        MultipleShuffleVectorUses = false;
-      if (!AllUsesIdentical && !MultipleShuffleVectorUses)
-        return nullptr;
-    }
-  }
-
+bool InstCombinerImpl::canFoldUserIntoPhi(
+    Instruction &User, PHINode &PN, SmallVectorImpl<Value *> &NewPhiValues,
+    SmallVectorImpl<unsigned> &OpsToMoveUseToIncomingBB,
+    bool AllowOneNonSimplifiedValue) {
   // Check that all operands are phi-translatable.
-  for (Value *Op : I.operands()) {
-    if (Op == PN)
+  for (Value *Op : User.operands()) {
+    if (Op == &PN)
       continue;
 
     // Non-instructions never require phi-translation.
-    auto *I = dyn_cast<Instruction>(Op);
-    if (!I)
+    auto *OpInst = dyn_cast<Instruction>(Op);
+    if (!OpInst)
       continue;
 
     // Phi-translate can handle phi nodes in the same block.
-    if (isa<PHINode>(I))
-      if (I->getParent() == PN->getParent())
-        continue;
+    if (isa<PHINode>(OpInst) && OpInst->getParent() == PN.getParent())
+      continue;
 
     // Operand dominates the block, no phi-translation necessary.
-    if (DT.dominates(I, PN->getParent()))
+    if (DT.dominates(OpInst, PN.getParent()))
       continue;
 
     // Not phi-translatable, bail out.
-    return nullptr;
+    return false;
   }
 
   // Check to see whether the instruction can be folded into each phi operand.
   // If there is one operand that does not fold, remember the BB it is in.
-  SmallVector<Value *> NewPhiValues;
-  SmallVector<unsigned int> OpsToMoveUseToIncomingBB;
   bool SeenNonSimplifiedInVal = false;
-  for (unsigned i = 0; i != NumPHIValues; ++i) {
-    Value *InVal = PN->getIncomingValue(i);
-    BasicBlock *InBB = PN->getIncomingBlock(i);
+  for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) {
+    Value *InVal = PN.getIncomingValue(I);
+    BasicBlock *InBB = PN.getIncomingBlock(I);
 
-    if (auto *NewVal = simplifyInstructionWithPHI(I, PN, InVal, InBB, DL, SQ)) {
+    if (auto *NewVal =
+            simplifyInstructionWithPHI(User, &PN, InVal, InBB, DL, SQ)) {
       NewPhiValues.push_back(NewVal);
       continue;
     }
@@ -1842,19 +1812,14 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
     // because we know that it will simplify to a single icmp.
     const APInt *Ignored;
     if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() &&
-        match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) {
-      OpsToMoveUseToIncomingBB.push_back(i);
+        match(&User, m_ICmp(m_Specific(&PN), m_APInt(Ignored)))) {
+      OpsToMoveUseToIncomingBB.push_back(I);
       NewPhiValues.push_back(nullptr);
       continue;
     }
 
-    // Be conservative in cases with multiple uses and require all inputs to
-    // simplify.
-    if (MultipleShuffleVectorUses)
-      return nullptr;
-
-    if (SeenNonSimplifiedInVal)
-      return nullptr; // More than one non-simplified value.
+    if (!AllowOneNonSimplifiedValue || SeenNonSimplifiedInVal)
+      return false; // More than one non-simplified value.
     SeenNonSimplifiedInVal = true;
 
     // If there is exactly one non-simplified value, we can insert a copy of the
@@ -1864,23 +1829,78 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
     // block. Also, make sure that the pred block is not dead code.
     BranchInst *BI = dyn_cast<BranchInst>(InBB->getTerminator());
     if (!BI || !BI->isUnconditional() || !DT.isReachableFromEntry(InBB))
-      return nullptr;
+      return false;
 
     NewPhiValues.push_back(nullptr);
-    OpsToMoveUseToIncomingBB.push_back(i);
+    OpsToMoveUseToIncomingBB.push_back(I);
 
     // If the InVal is an invoke at the end of the pred block, then we can't
     // insert a computation after it without breaking the edge.
     if (isa<InvokeInst>(InVal))
       if (cast<Instruction>(InVal)->getParent() == InBB)
-        return nullptr;
+        return false;
 
     // Do not push the operation across a loop backedge. This could result in
     // an infinite combine loop, and is generally non-profitable (especially
     // if the operation was originally outside the loop).
-    if (isBackEdge(InBB, PN->getParent()))
-      return nullptr;
+    if (isBackEdge(InBB, PN.getParent()))
+      return false;
   }
+  return true;
+}
+
+Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
+  unsigned NumPHIValues = PN->getNumIncomingValues();
+  if (NumPHIValues == 0)
+    return nullptr;
+
+  // We normally only transform phis with a single use.
+  bool AllUsesIdentical = false;
+  bool MultipleShuffleVectorUses = false;
+  if (!PN->hasOneUse()) {
+    // Exceptions:
+    //   - All uses are identical.
+    //   - All uses are shufflevector instructions that fully simplify; this
+    //     helps interleave -> phi -> 2x de-interleave+de patterns.
+    MultipleShuffleVectorUses = isa<ShuffleVectorInst>(I);
+    AllUsesIdentical = true;
+    unsigned NumUses = 0;
+    for (User *U : PN->users()) {
+      ++NumUses;
+      Instruction *UI = cast<Instruction>(U);
+      if (UI == &I)
+        continue;
+
+      if (!I.isIdenticalTo(UI))
+        AllUsesIdentical = false;
+      // Only inspect first 4 uses to avoid quadratic complexity.
+      if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
+        MultipleShuffleVectorUses = false;
+      if (!AllUsesIdentical && !MultipleShuffleVectorUses)
+        return nullptr;
+    }
+
+    // Check that other uses will simplify as well.
+    if (MultipleShuffleVectorUses) {
+      for (User *U : PN->users()) {
+        if (U == &I)
+          continue;
+        SmallVector<Value *, 4> dummy_vals;
+        SmallVector<unsigned, 4> dummy_ints;
+        if (!canFoldUserIntoPhi(*cast<Instruction>(U), *PN, dummy_vals,
+                                dummy_ints,
+                                /*AllowOneNonSimplifiedValue=*/false))
+          return nullptr;
+      }
+    }
+  }
+
+  SmallVector<Value *> NewPhiValues;
+  SmallVector<unsigned int> OpsToMoveUseToIncomingBB;
+  if (!canFoldUserIntoPhi(
+          I, *PN, NewPhiValues, OpsToMoveUseToIncomingBB,
+          /*AllowOneNonSimplifiedValue=*/!MultipleShuffleVectorUses))
+    return nullptr;
 
   // Clone the instruction that uses the phi node and move it into the incoming
   // BB because we know that the next iteration of InstCombine will simplify it.
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
index 597eae101e9f2d..91ce6aab5af7f6 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
@@ -1,8 +1,39 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -S -passes=instcombine | FileCheck %s
 
-define void @f(ptr %p_begin, ptr %p_end, ptr %out) {
-; CHECK-LABEL: define void @f(
+define <4 x i16> @f0(i1 %c, ptr %p0, ptr %p1) {
+; CHECK-LABEL: define <4 x i16> @f0(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[LOAD0:%.*]] = load <4 x i16>, ptr [[P0]], align 16
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <4 x i16>, ptr [[P1]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = sub <4 x i16> [[LOAD0]], [[LOAD1]]
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[SUB:%.*]] = phi <4 x i16> [ <i16 -87, i16 327, i16 51, i16 755>, %[[ENTRY]] ], [ [[TMP0]], %[[THEN]] ]
+; CHECK-NEXT:    ret <4 x i16> [[SUB]]
+;
+entry:
+  br i1 %c, label %then, label %merge
+
+then:
+  %load0 = load <4 x i16>, ptr %p0, align 16
+  %load1 = load <4 x i16>, ptr %p1, align 16
+  %interleave = shufflevector <4 x i16> %load0, <4 x i16> %load1, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+  br label %merge
+
+merge:
+  %phi = phi <8 x i16> [<i16 1, i16 22, i16 333, i16 4, i16 55, i16 6, i16 777, i16 88>, %entry], [%interleave, %then]
+  %shuf0 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuf1 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+  %sub = sub <4 x i16> %shuf0, %shuf1
+  ret <4 x i16> %sub
+}
+
+define void @deinterleave_interleave(ptr %p_begin, ptr %p_end, ptr %out) {
+; CHECK-LABEL: define void @deinterleave_interleave(
 ; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
@@ -55,3 +86,80 @@ exit:
   store <8 x float> %interleave, ptr %out, align 4
   ret void
 }
+
+; Currently we only optimize if all uses are shufflevectors.
+define <4 x i16> @noopt_only_shuffles(i1 %c, ptr %p) {
+; CHECK-LABEL: define <4 x i16> @noopt_only_shuffles(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    store i32 42, ptr [[P]], align 4
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <4 x i16> [ <i16 1, i16 22, i16 333, i16 4>, %[[ENTRY]] ], [ <i16 555, i16 6, i16 77, i16 8>, %[[THEN]] ]
+; CHECK-NEXT:    [[SHUF0:%.*]] = shufflevector <4 x i16> [[PHI]], <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw <4 x i16> [[PHI]], <i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[XOR:%.*]] = xor <4 x i16> [[SHUF0]], [[ADD1]]
+; CHECK-NEXT:    ret <4 x i16> [[XOR]]
+;
+entry:
+  br i1 %c, label %then, label %merge
+
+then:
+  store i32 42, ptr %p, align 4
+  br label %merge
+
+merge:
+  %phi = phi <4 x i16> [<i16 1, i16 22, i16 333, i16 4>, %entry], [<i16 555, i16 6, i16 77, i16 8>, %then]
+  %shuf0 = shufflevector <4 x i16> %phi, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %add1 = add <4 x i16> %phi, <i16 1, i16 1, i16 1, i16 1>
+  %xor = xor <4 x i16> %shuf0, %add1
+  ret <4 x i16> %xor
+}
+
+; Currently we don't optimize if one of the phi inputs does not simplify.
+define <4 x i16> @noopt_must_fully_simplify(i1 %c, ptr %p0, ptr %p1, ptr %p2, ptr %p3) {
+; CHECK-LABEL: define <4 x i16> @noopt_must_fully_simplify(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]], ptr [[P3:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[LOAD_T0:%.*]] = load <4 x i16>, ptr [[P0]], align 16
+; CHECK-NEXT:    [[LOAD_T1:%.*]] = load <4 x i16>, ptr [[P1]], align 16
+; CHECK-NEXT:    [[INTERLEAVE_THEN:%.*]] = shufflevector <4 x i16> [[LOAD_T0]], <4 x i16> [[LOAD_T1]], <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+; CHECK-NEXT:    br label %[[MERGE:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[LOAD_E0:%.*]] = load <4 x i16>, ptr [[P2]], align 16
+; CHECK-NEXT:    [[LOAD_E1:%.*]] = load <4 x i16>, ptr [[P3]], align 16
+; CHECK-NEXT:    [[INTERLEAVE_ELSE:%.*]] = shufflevector <4 x i16> [[LOAD_E0]], <4 x i16> [[LOAD_E1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <8 x i16> [ [[INTERLEAVE_ELSE]], %[[ELSE]] ], [ [[INTERLEAVE_THEN]], %[[THEN]] ]
+; CHECK-NEXT:    [[SHUF0:%.*]] = shufflevector <8 x i16> [[PHI]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[SHUF1:%.*]] = shufflevector <8 x i16> [[PHI]], <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+; CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[SHUF0]], [[SHUF1]]
+; CHECK-NEXT:    ret <4 x i16> [[SUB]]
+;
+entry:
+  br i1 %c, label %then, label %else
+
+then:
+  %load_t0 = load <4 x i16>, ptr %p0, align 16
+  %load_t1 = load <4 x i16>, ptr %p1, align 16
+  %interleave_then = shufflevector <4 x i16> %load_t0, <4 x i16> %load_t1, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+  br label %merge
+
+else:
+  %load_e0 = load <4 x i16>, ptr %p2, align 16
+  %load_e1 = load <4 x i16>, ptr %p3, align 16
+  %interleave_else = shufflevector <4 x i16> %load_e0, <4 x i16> %load_e1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  br label %merge
+
+merge:
+  %phi = phi <8 x i16> [%interleave_else, %else], [%interleave_then, %then]
+  %shuf0 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuf1 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+  %sub = sub <4 x i16> %shuf0, %shuf1
+  ret <4 x i16> %sub
+}