[llvm] 7687548 - [InstCombine] Optimistically allow multiple shufflevector uses in foldOpPhi (#114278)

Thu Dec 12 17:20:52 PST 2024

Author: Matthias Braun
Date: 2024-12-12T17:20:48-08:00
New Revision: 768754807f17754fb450ec672779b827ad5df4b4

URL: https://github.com/llvm/llvm-project/commit/768754807f17754fb450ec672779b827ad5df4b4
DIFF: https://github.com/llvm/llvm-project/commit/768754807f17754fb450ec672779b827ad5df4b4.diff

LOG: [InstCombine] Optimistically allow multiple shufflevector uses in foldOpPhi (#114278)

We would like to optimize situations of the form that happen after loop
vectorization+SROA:
```
loop:
    %phi = phi zeroinitializer, %interleaved

    %deinterleave_a = shufflevector %phi, poison ; pick half of the lanes
    %deinterleave_b = shufflevector %phi, posion ; pick remaining lanes

    ... %a = ... %b = ...

    %interleaved = shufflevector %a, %b ; interleave lanes of a+b
```
where the interleave and de-interleave shuffle operations cancel each
other out.
This could be handled by `foldOpPhi` but does not currently work because
it does
not proceed when there are multiple uses of the `Phi` operation.

This extends `foldOpPhi` to allow multiple `shufflevector` uses when
they are
shown to simplify for all `Phi` input values.

Added: 
    llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineInternal.h
    llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
    llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 28474fec8238ee..3a074ee70dc487 100644

--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -600,7 +600,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   /// Given a binary operator, cast instruction, or select which has a PHI node
   /// as operand #0, see if we can fold the instruction into the PHI (which is
   /// only possible if all operands to the PHI are constants).
-  Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+  Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN,
+                             bool AllowMultipleUses = false);
 
   /// For a binary operator with 2 phi operands, try to hoist the binary
   /// operation before the phi. This can result in fewer instructions in

diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 09eafd09451b24..302968f88cc2f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2978,7 +2978,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
       }
     }
     if (auto *PN = dyn_cast<PHINode>(LHS)) {
-      if (Instruction *I = foldOpIntoPhi(SVI, PN))
+      if (Instruction *I = foldOpIntoPhi(SVI, PN, /*AllowMultipleUses=*/true))
         return I;
     }
   }

diff  --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 8f55e5b3cc28a2..af537da8fa2747 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1763,7 +1763,8 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN,
   return nullptr;
 }
 
-Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
+Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
+                                             bool AllowMultipleUses) {
   unsigned NumPHIValues = PN->getNumIncomingValues();
   if (NumPHIValues == 0)
     return nullptr;
@@ -1771,7 +1772,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
   // We normally only transform phis with a single use.  However, if a PHI has
   // multiple uses and they are all the same operation, we can fold *all* of the
   // uses into the PHI.
-  if (!PN->hasOneUse()) {
+  bool OneUse = PN->hasOneUse();
+  bool IdenticalUsers = false;
+  if (!AllowMultipleUses && !OneUse) {
     // Walk the use list for the instruction, comparing them to I.
     for (User *U : PN->users()) {
       Instruction *UI = cast<Instruction>(U);
@@ -1779,6 +1782,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
         return nullptr;
     }
     // Otherwise, we can replace *all* users with the new PHI we form.
+    IdenticalUsers = true;
   }
 
   // Check that all operands are phi-translatable.
@@ -1829,6 +1833,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
       continue;
     }
 
+    if (!OneUse && !IdenticalUsers)
+      return nullptr;
+
     if (SeenNonSimplifiedInVal)
       return nullptr; // More than one non-simplified value.
     SeenNonSimplifiedInVal = true;
@@ -1890,17 +1897,22 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
   for (unsigned i = 0; i != NumPHIValues; ++i)
     NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i));
 
-  for (User *U : make_early_inc_range(PN->users())) {
-    Instruction *User = cast<Instruction>(U);
-    if (User == &I)
-      continue;
-    replaceInstUsesWith(*User, NewPN);
-    eraseInstFromFunction(*User);
+  if (IdenticalUsers) {
+    for (User *U : make_early_inc_range(PN->users())) {
+      Instruction *User = cast<Instruction>(U);
+      if (User == &I)
+        continue;
+      replaceInstUsesWith(*User, NewPN);
+      eraseInstFromFunction(*User);
+    }
+    OneUse = true;
   }
 
-  replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
-                        const_cast<PHINode &>(*NewPN),
-                        const_cast<PHINode &>(*PN), DT);
+  if (OneUse) {
+    replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
+                          const_cast<PHINode &>(*NewPN),
+                          const_cast<PHINode &>(*PN), DT);
+  }
   return replaceInstUsesWith(I, NewPN);
 }
 

diff  --git a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
new file mode 100644
index 00000000000000..44b25bb6fd727a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+define <4 x i16> @f0(i1 %c, ptr %p0, ptr %p1) {
+; CHECK-LABEL: define <4 x i16> @f0(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[LOAD0:%.*]] = load <4 x i16>, ptr [[P0]], align 16
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <4 x i16>, ptr [[P1]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = sub <4 x i16> [[LOAD0]], [[LOAD1]]
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[SUB:%.*]] = phi <4 x i16> [ <i16 -87, i16 327, i16 51, i16 755>, %[[ENTRY]] ], [ [[TMP0]], %[[THEN]] ]
+; CHECK-NEXT:    ret <4 x i16> [[SUB]]
+;
+entry:
+  br i1 %c, label %then, label %merge
+
+then:
+  %load0 = load <4 x i16>, ptr %p0, align 16
+  %load1 = load <4 x i16>, ptr %p1, align 16
+  %interleave = shufflevector <4 x i16> %load0, <4 x i16> %load1, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+  br label %merge
+
+merge:
+  %phi = phi <8 x i16> [<i16 1, i16 22, i16 333, i16 4, i16 55, i16 6, i16 777, i16 88>, %entry], [%interleave, %then]
+  %shuf0 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuf1 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+  %sub = sub <4 x i16> %shuf0, %shuf1
+  ret <4 x i16> %sub
+}
+
+define void @deinterleave_interleave(ptr %p_begin, ptr %p_end, ptr %out) {
+; CHECK-LABEL: define void @deinterleave_interleave(
+; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[ACC:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_LOWS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ODDS:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_HIGHS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[P_BEGIN]], %[[ENTRY]] ], [ [[P_INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = load <4 x i8>, ptr [[P]], align 4
+; CHECK-NEXT:    [[HIGHS:%.*]] = ashr <4 x i8> [[VAL]], splat (i8 4)
+; CHECK-NEXT:    [[LOWS:%.*]] = and <4 x i8> [[VAL]], splat (i8 15)
+; CHECK-NEXT:    [[HIGHS_F:%.*]] = sitofp <4 x i8> [[HIGHS]] to <4 x float>
+; CHECK-NEXT:    [[LOWS_F:%.*]] = uitofp nneg <4 x i8> [[LOWS]] to <4 x float>
+; CHECK-NEXT:    [[SUM_LOWS]] = fadd <4 x float> [[ACC]], [[LOWS_F]]
+; CHECK-NEXT:    [[SUM_HIGHS]] = fadd <4 x float> [[ODDS]], [[HIGHS_F]]
+; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT:    [[C:%.*]] = icmp eq ptr [[P_INC]], [[P_END]]
+; CHECK-NEXT:    br i1 [[C]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[INTERLEAVE:%.*]] = shufflevector <4 x float> [[SUM_LOWS]], <4 x float> [[SUM_HIGHS]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x float> [[INTERLEAVE]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %interleave, %loop ]
+  %p = phi ptr [%p_begin, %entry ], [%p_inc, %loop]
+
+  %evens = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %odds = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  %val = load <4 x i8>, ptr %p, align 4
+  %highs = ashr <4 x i8> %val, <i8 4, i8 4, i8 4, i8 4>
+  %lows = and <4 x i8> %val, <i8 15, i8 15, i8 15, i8 15>
+
+  %highs_f = sitofp <4 x i8> %highs to <4 x float>
+  %lows_f = sitofp <4 x i8> %lows to <4 x float>
+
+  %sum_lows = fadd <4 x float> %evens, %lows_f
+  %sum_highs = fadd <4 x float> %odds, %highs_f
+
+  %interleave = shufflevector <4 x float> %sum_lows, <4 x float> %sum_highs, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+
+  %p_inc = getelementptr inbounds i8, ptr %p, i32 4
+  %c = icmp eq ptr %p_inc, %p_end
+  br i1 %c, label %exit, label %loop
+
+exit:
+  store <8 x float> %interleave, ptr %out, align 4
+  ret void
+}
+
+define <4 x i16> @f1(i1 %c, ptr %p) {
+; CHECK-LABEL: define <4 x i16> @f1(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    store i32 42, ptr [[P]], align 4
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[XOR:%.*]] = phi <4 x i16> [ <i16 3, i16 346, i16 undef, i16 undef>, %[[ENTRY]] ], [ <i16 7, i16 74, i16 undef, i16 undef>, %[[THEN]] ]
+; CHECK-NEXT:    ret <4 x i16> [[XOR]]
+;
+entry:
+  br i1 %c, label %then, label %merge
+
+then:
+  store i32 42, ptr %p, align 4
+  br label %merge
+
+merge:
+  %phi = phi <4 x i16> [<i16 1, i16 22, i16 333, i16 4>, %entry], [<i16 555, i16 6, i16 77, i16 8>, %then]
+  %shuf0 = shufflevector <4 x i16> %phi, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %add1 = add <4 x i16> %phi, <i16 1, i16 1, i16 1, i16 1>
+  %xor = xor <4 x i16> %shuf0, %add1
+  ret <4 x i16> %xor
+}