[llvm] [InstCombine] Optimistically allow multiple shufflevector uses in foldOpPhi (PR #114278)
Matthias Braun via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 11 11:18:49 PST 2024
https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/114278
>From 42aba4bbb4503bbe0b54eafb6cef695f7df91c6c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Tue, 29 Oct 2024 16:38:30 -0700
Subject: [PATCH 1/3] Optimistically allow multiple uses in foldOpPhi for
shufflevector
We would like to optimize situations of the form:
```
loop:
%phi = phi zeroinitializer, %interleaved
%deinterleave_a = shufflevector %phi, poison ; pick half of the lanes
%deinterleave_b = shufflevector %phi, poison ; pick remaining lanes
...
%interleaved = shufflevector %a, %b ; interleave lanes of a+b
```
where the interleave and de-interleave patterns cancel each other out.
This could be handled by `foldOpPhi`, but that requires proceeding even
though the `Phi` operation has two uses.
This extends `foldOpPhi` to proceed with more than one use if all
uses are `shufflevector` and are guaranteed to simplify with all
`Phi` predecessors.
---
.../InstCombine/InstructionCombining.cpp | 55 +++++++++++++-----
.../InstCombine/vec_shuffle-phi-multiuse.ll | 57 +++++++++++++++++++
2 files changed, 97 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 2a54390c0f1882..9b2074e481f54f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1773,17 +1773,33 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
if (NumPHIValues == 0)
return nullptr;
- // We normally only transform phis with a single use. However, if a PHI has
- // multiple uses and they are all the same operation, we can fold *all* of the
- // uses into the PHI.
+ // We normally only transform phis with a single use.
+ bool AllUsesIdentical = false;
+ bool MultipleUses = false;
if (!PN->hasOneUse()) {
- // Walk the use list for the instruction, comparing them to I.
+ // Exceptions:
+ // - All uses are identical.
+ // - All uses are shufflevector instructions that fully simplify; this
+ // helps interleave -> phi -> 2x de-interleave+de patterns.
+ if (isa<ShuffleVectorInst>(I)) {
+ MultipleUses = true;
+ }
+ AllUsesIdentical = true;
+ unsigned NumUses = 0;
for (User *U : PN->users()) {
+ ++NumUses;
Instruction *UI = cast<Instruction>(U);
- if (UI != &I && !I.isIdenticalTo(UI))
+ if (UI == &I)
+ continue;
+
+ if (!I.isIdenticalTo(UI))
+ AllUsesIdentical = false;
+ // Only inspect first 4 uses to avoid quadratic complexity.
+ if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
+ MultipleUses = false;
+ if (!AllUsesIdentical && !MultipleUses)
return nullptr;
}
- // Otherwise, we can replace *all* users with the new PHI we form.
}
// Check that all operands are phi-translatable.
@@ -1834,6 +1850,11 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
continue;
}
+ // Be conservative in MultipleUses case and do not allow non-simplified
+ // vals.
+ if (MultipleUses)
+ return nullptr;
+
if (SeenNonSimplifiedInVal)
return nullptr; // More than one non-simplified value.
SeenNonSimplifiedInVal = true;
@@ -1895,17 +1916,21 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
for (unsigned i = 0; i != NumPHIValues; ++i)
NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i));
- for (User *U : make_early_inc_range(PN->users())) {
- Instruction *User = cast<Instruction>(U);
- if (User == &I)
- continue;
- replaceInstUsesWith(*User, NewPN);
- eraseInstFromFunction(*User);
+ if (AllUsesIdentical) {
+ for (User *U : make_early_inc_range(PN->users())) {
+ Instruction *User = cast<Instruction>(U);
+ if (User == &I)
+ continue;
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
+ }
}
- replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
- const_cast<PHINode &>(*NewPN),
- const_cast<PHINode &>(*PN), DT);
+ if (!MultipleUses || AllUsesIdentical) {
+ replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
+ const_cast<PHINode &>(*NewPN),
+ const_cast<PHINode &>(*PN), DT);
+ }
return replaceInstUsesWith(I, NewPN);
}
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
new file mode 100644
index 00000000000000..597eae101e9f2d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+define void @f(ptr %p_begin, ptr %p_end, ptr %out) {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[ACC:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_LOWS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ODDS:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_HIGHS:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[P_BEGIN]], %[[ENTRY]] ], [ [[P_INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load <4 x i8>, ptr [[P]], align 4
+; CHECK-NEXT: [[HIGHS:%.*]] = ashr <4 x i8> [[VAL]], <i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT: [[LOWS:%.*]] = and <4 x i8> [[VAL]], <i8 15, i8 15, i8 15, i8 15>
+; CHECK-NEXT: [[HIGHS_F:%.*]] = sitofp <4 x i8> [[HIGHS]] to <4 x float>
+; CHECK-NEXT: [[LOWS_F:%.*]] = uitofp nneg <4 x i8> [[LOWS]] to <4 x float>
+; CHECK-NEXT: [[SUM_LOWS]] = fadd <4 x float> [[ACC]], [[LOWS_F]]
+; CHECK-NEXT: [[SUM_HIGHS]] = fadd <4 x float> [[ODDS]], [[HIGHS_F]]
+; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P_INC]], [[P_END]]
+; CHECK-NEXT: br i1 [[C]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[INTERLEAVE:%.*]] = shufflevector <4 x float> [[SUM_LOWS]], <4 x float> [[SUM_HIGHS]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVE]], ptr [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %interleave, %loop ]
+ %p = phi ptr [%p_begin, %entry ], [%p_inc, %loop]
+
+ %evens = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %odds = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+ %val = load <4 x i8>, ptr %p, align 4
+ %highs = ashr <4 x i8> %val, <i8 4, i8 4, i8 4, i8 4>
+ %lows = and <4 x i8> %val, <i8 15, i8 15, i8 15, i8 15>
+
+ %highs_f = sitofp <4 x i8> %highs to <4 x float>
+ %lows_f = sitofp <4 x i8> %lows to <4 x float>
+
+ %sum_lows = fadd <4 x float> %evens, %lows_f
+ %sum_highs = fadd <4 x float> %odds, %highs_f
+
+ %interleave = shufflevector <4 x float> %sum_lows, <4 x float> %sum_highs, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+
+ %p_inc = getelementptr inbounds i8, ptr %p, i32 4
+ %c = icmp eq ptr %p_inc, %p_end
+ br i1 %c, label %exit, label %loop
+
+exit:
+ store <8 x float> %interleave, ptr %out, align 4
+ ret void
+}
>From 394d2df634fbb1f2d22d85308abb411d5d16c67f Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Thu, 7 Nov 2024 14:57:37 -0800
Subject: [PATCH 2/3] Address review feedback
---
.../InstCombine/InstructionCombining.cpp | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9b2074e481f54f..714d13d4986066 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1775,15 +1775,13 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
// We normally only transform phis with a single use.
bool AllUsesIdentical = false;
- bool MultipleUses = false;
+ bool MultipleShuffleVectorUses = false;
if (!PN->hasOneUse()) {
// Exceptions:
// - All uses are identical.
// - All uses are shufflevector instructions that fully simplify; this
// helps interleave -> phi -> 2x de-interleave+de patterns.
- if (isa<ShuffleVectorInst>(I)) {
- MultipleUses = true;
- }
+ MultipleShuffleVectorUses = isa<ShuffleVectorInst>(I);
AllUsesIdentical = true;
unsigned NumUses = 0;
for (User *U : PN->users()) {
@@ -1796,8 +1794,8 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
AllUsesIdentical = false;
// Only inspect first 4 uses to avoid quadratic complexity.
if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
- MultipleUses = false;
- if (!AllUsesIdentical && !MultipleUses)
+ MultipleShuffleVectorUses = false;
+ if (!AllUsesIdentical && !MultipleShuffleVectorUses)
return nullptr;
}
}
@@ -1850,9 +1848,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
continue;
}
- // Be conservative in MultipleUses case and do not allow non-simplified
- // vals.
- if (MultipleUses)
+ // Be conservative in cases with multiple uses and require all inputs to
+ // simplify.
+ if (MultipleShuffleVectorUses)
return nullptr;
if (SeenNonSimplifiedInVal)
@@ -1926,7 +1924,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
}
}
- if (!MultipleUses || AllUsesIdentical) {
+ if (!MultipleShuffleVectorUses || AllUsesIdentical) {
replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
const_cast<PHINode &>(*NewPN),
const_cast<PHINode &>(*PN), DT);
>From 745fac7275e3f8ccece6381f6883b42c40f90c31 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Nov 2024 09:55:03 -0800
Subject: [PATCH 3/3] Ensure that all users simplify
Also add more tests as suggested
---
.../InstCombine/InstCombineInternal.h | 7 +
.../InstCombine/InstructionCombining.cpp | 142 ++++++++++--------
.../InstCombine/vec_shuffle-phi-multiuse.ll | 112 +++++++++++++-
3 files changed, 198 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7a060cdab2d37d..1562fc4441b844 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -454,6 +454,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
Instruction *hoistFNegAboveFMulFDiv(Value *FNegOp, Instruction &FMFSource);
+ /// Helper for `foldOpIntoPhi`. Tests if a given user of the phi node would
+ /// simplify when folded.
+ bool canFoldUserIntoPhi(Instruction &User, PHINode &PN,
+ SmallVectorImpl<Value *> &NewPhiValues,
+ SmallVectorImpl<unsigned> &OpsToMoveUseToIncomingBB,
+ bool AllowOneNonSimplifiedValue);
+
public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 714d13d4986066..f2bc19f966a491 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1768,71 +1768,41 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN,
return nullptr;
}
-Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
- unsigned NumPHIValues = PN->getNumIncomingValues();
- if (NumPHIValues == 0)
- return nullptr;
-
- // We normally only transform phis with a single use.
- bool AllUsesIdentical = false;
- bool MultipleShuffleVectorUses = false;
- if (!PN->hasOneUse()) {
- // Exceptions:
- // - All uses are identical.
- // - All uses are shufflevector instructions that fully simplify; this
- // helps interleave -> phi -> 2x de-interleave+de patterns.
- MultipleShuffleVectorUses = isa<ShuffleVectorInst>(I);
- AllUsesIdentical = true;
- unsigned NumUses = 0;
- for (User *U : PN->users()) {
- ++NumUses;
- Instruction *UI = cast<Instruction>(U);
- if (UI == &I)
- continue;
-
- if (!I.isIdenticalTo(UI))
- AllUsesIdentical = false;
- // Only inspect first 4 uses to avoid quadratic complexity.
- if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
- MultipleShuffleVectorUses = false;
- if (!AllUsesIdentical && !MultipleShuffleVectorUses)
- return nullptr;
- }
- }
-
+bool InstCombinerImpl::canFoldUserIntoPhi(
+ Instruction &User, PHINode &PN, SmallVectorImpl<Value *> &NewPhiValues,
+ SmallVectorImpl<unsigned> &OpsToMoveUseToIncomingBB,
+ bool AllowOneNonSimplifiedValue) {
// Check that all operands are phi-translatable.
- for (Value *Op : I.operands()) {
- if (Op == PN)
+ for (Value *Op : User.operands()) {
+ if (Op == &PN)
continue;
// Non-instructions never require phi-translation.
- auto *I = dyn_cast<Instruction>(Op);
- if (!I)
+ auto *OpInst = dyn_cast<Instruction>(Op);
+ if (!OpInst)
continue;
// Phi-translate can handle phi nodes in the same block.
- if (isa<PHINode>(I))
- if (I->getParent() == PN->getParent())
- continue;
+ if (isa<PHINode>(OpInst) && OpInst->getParent() == PN.getParent())
+ continue;
// Operand dominates the block, no phi-translation necessary.
- if (DT.dominates(I, PN->getParent()))
+ if (DT.dominates(OpInst, PN.getParent()))
continue;
// Not phi-translatable, bail out.
- return nullptr;
+ return false;
}
// Check to see whether the instruction can be folded into each phi operand.
// If there is one operand that does not fold, remember the BB it is in.
- SmallVector<Value *> NewPhiValues;
- SmallVector<unsigned int> OpsToMoveUseToIncomingBB;
bool SeenNonSimplifiedInVal = false;
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InVal = PN->getIncomingValue(i);
- BasicBlock *InBB = PN->getIncomingBlock(i);
+ for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) {
+ Value *InVal = PN.getIncomingValue(I);
+ BasicBlock *InBB = PN.getIncomingBlock(I);
- if (auto *NewVal = simplifyInstructionWithPHI(I, PN, InVal, InBB, DL, SQ)) {
+ if (auto *NewVal =
+ simplifyInstructionWithPHI(User, &PN, InVal, InBB, DL, SQ)) {
NewPhiValues.push_back(NewVal);
continue;
}
@@ -1842,19 +1812,14 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
// because we know that it will simplify to a single icmp.
const APInt *Ignored;
if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() &&
- match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) {
- OpsToMoveUseToIncomingBB.push_back(i);
+ match(&User, m_ICmp(m_Specific(&PN), m_APInt(Ignored)))) {
+ OpsToMoveUseToIncomingBB.push_back(I);
NewPhiValues.push_back(nullptr);
continue;
}
- // Be conservative in cases with multiple uses and require all inputs to
- // simplify.
- if (MultipleShuffleVectorUses)
- return nullptr;
-
- if (SeenNonSimplifiedInVal)
- return nullptr; // More than one non-simplified value.
+ if (!AllowOneNonSimplifiedValue || SeenNonSimplifiedInVal)
+ return false; // More than one non-simplified value.
SeenNonSimplifiedInVal = true;
// If there is exactly one non-simplified value, we can insert a copy of the
@@ -1864,23 +1829,78 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
// block. Also, make sure that the pred block is not dead code.
BranchInst *BI = dyn_cast<BranchInst>(InBB->getTerminator());
if (!BI || !BI->isUnconditional() || !DT.isReachableFromEntry(InBB))
- return nullptr;
+ return false;
NewPhiValues.push_back(nullptr);
- OpsToMoveUseToIncomingBB.push_back(i);
+ OpsToMoveUseToIncomingBB.push_back(I);
// If the InVal is an invoke at the end of the pred block, then we can't
// insert a computation after it without breaking the edge.
if (isa<InvokeInst>(InVal))
if (cast<Instruction>(InVal)->getParent() == InBB)
- return nullptr;
+ return false;
// Do not push the operation across a loop backedge. This could result in
// an infinite combine loop, and is generally non-profitable (especially
// if the operation was originally outside the loop).
- if (isBackEdge(InBB, PN->getParent()))
- return nullptr;
+ if (isBackEdge(InBB, PN.getParent()))
+ return false;
}
+ return true;
+}
+
+Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ if (NumPHIValues == 0)
+ return nullptr;
+
+ // We normally only transform phis with a single use.
+ bool AllUsesIdentical = false;
+ bool MultipleShuffleVectorUses = false;
+ if (!PN->hasOneUse()) {
+ // Exceptions:
+ // - All uses are identical.
+ // - All uses are shufflevector instructions that fully simplify; this
+ // helps interleave -> phi -> 2x de-interleave+de patterns.
+ MultipleShuffleVectorUses = isa<ShuffleVectorInst>(I);
+ AllUsesIdentical = true;
+ unsigned NumUses = 0;
+ for (User *U : PN->users()) {
+ ++NumUses;
+ Instruction *UI = cast<Instruction>(U);
+ if (UI == &I)
+ continue;
+
+ if (!I.isIdenticalTo(UI))
+ AllUsesIdentical = false;
+ // Only inspect first 4 uses to avoid quadratic complexity.
+ if (!isa<ShuffleVectorInst>(UI) || NumUses > 4)
+ MultipleShuffleVectorUses = false;
+ if (!AllUsesIdentical && !MultipleShuffleVectorUses)
+ return nullptr;
+ }
+
+ // Check that other uses will simplify as well.
+ if (MultipleShuffleVectorUses) {
+ for (User *U : PN->users()) {
+ if (U == &I)
+ continue;
+ SmallVector<Value *, 4> dummy_vals;
+ SmallVector<unsigned, 4> dummy_ints;
+ if (!canFoldUserIntoPhi(*cast<Instruction>(U), *PN, dummy_vals,
+ dummy_ints,
+ /*AllowOneNonSimplifiedValue=*/false))
+ return nullptr;
+ }
+ }
+ }
+
+ SmallVector<Value *> NewPhiValues;
+ SmallVector<unsigned int> OpsToMoveUseToIncomingBB;
+ if (!canFoldUserIntoPhi(
+ I, *PN, NewPhiValues, OpsToMoveUseToIncomingBB,
+ /*AllowOneNonSimplifiedValue=*/!MultipleShuffleVectorUses))
+ return nullptr;
// Clone the instruction that uses the phi node and move it into the incoming
// BB because we know that the next iteration of InstCombine will simplify it.
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
index 597eae101e9f2d..91ce6aab5af7f6 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-phi-multiuse.ll
@@ -1,8 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -passes=instcombine | FileCheck %s
-define void @f(ptr %p_begin, ptr %p_end, ptr %out) {
-; CHECK-LABEL: define void @f(
+define <4 x i16> @f0(i1 %c, ptr %p0, ptr %p1) {
+; CHECK-LABEL: define <4 x i16> @f0(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[LOAD0:%.*]] = load <4 x i16>, ptr [[P0]], align 16
+; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i16>, ptr [[P1]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = sub <4 x i16> [[LOAD0]], [[LOAD1]]
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: [[SUB:%.*]] = phi <4 x i16> [ <i16 -87, i16 327, i16 51, i16 755>, %[[ENTRY]] ], [ [[TMP0]], %[[THEN]] ]
+; CHECK-NEXT: ret <4 x i16> [[SUB]]
+;
+entry:
+ br i1 %c, label %then, label %merge
+
+then:
+ %load0 = load <4 x i16>, ptr %p0, align 16
+ %load1 = load <4 x i16>, ptr %p1, align 16
+ %interleave = shufflevector <4 x i16> %load0, <4 x i16> %load1, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+ br label %merge
+
+merge:
+ %phi = phi <8 x i16> [<i16 1, i16 22, i16 333, i16 4, i16 55, i16 6, i16 777, i16 88>, %entry], [%interleave, %then]
+ %shuf0 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuf1 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+ %sub = sub <4 x i16> %shuf0, %shuf1
+ ret <4 x i16> %sub
+}
+
+define void @deinterleave_interleave(ptr %p_begin, ptr %p_end, ptr %out) {
+; CHECK-LABEL: define void @deinterleave_interleave(
; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
@@ -55,3 +86,80 @@ exit:
store <8 x float> %interleave, ptr %out, align 4
ret void
}
+
+; Currently we only optimize if all uses are shufflevectors.
+define <4 x i16> @noopt_only_shuffles(i1 %c, ptr %p) {
+; CHECK-LABEL: define <4 x i16> @noopt_only_shuffles(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: store i32 42, ptr [[P]], align 4
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi <4 x i16> [ <i16 1, i16 22, i16 333, i16 4>, %[[ENTRY]] ], [ <i16 555, i16 6, i16 77, i16 8>, %[[THEN]] ]
+; CHECK-NEXT: [[SHUF0:%.*]] = shufflevector <4 x i16> [[PHI]], <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw <4 x i16> [[PHI]], <i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT: [[XOR:%.*]] = xor <4 x i16> [[SHUF0]], [[ADD1]]
+; CHECK-NEXT: ret <4 x i16> [[XOR]]
+;
+entry:
+ br i1 %c, label %then, label %merge
+
+then:
+ store i32 42, ptr %p, align 4
+ br label %merge
+
+merge:
+ %phi = phi <4 x i16> [<i16 1, i16 22, i16 333, i16 4>, %entry], [<i16 555, i16 6, i16 77, i16 8>, %then]
+ %shuf0 = shufflevector <4 x i16> %phi, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %add1 = add <4 x i16> %phi, <i16 1, i16 1, i16 1, i16 1>
+ %xor = xor <4 x i16> %shuf0, %add1
+ ret <4 x i16> %xor
+}
+
+; Currently we don't optimize if one of the phi inputs does not simplify.
+define <4 x i16> @noopt_must_fully_simplify(i1 %c, ptr %p0, ptr %p1, ptr %p2, ptr %p3) {
+; CHECK-LABEL: define <4 x i16> @noopt_must_fully_simplify(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]], ptr [[P3:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[LOAD_T0:%.*]] = load <4 x i16>, ptr [[P0]], align 16
+; CHECK-NEXT: [[LOAD_T1:%.*]] = load <4 x i16>, ptr [[P1]], align 16
+; CHECK-NEXT: [[INTERLEAVE_THEN:%.*]] = shufflevector <4 x i16> [[LOAD_T0]], <4 x i16> [[LOAD_T1]], <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+; CHECK-NEXT: br label %[[MERGE:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[LOAD_E0:%.*]] = load <4 x i16>, ptr [[P2]], align 16
+; CHECK-NEXT: [[LOAD_E1:%.*]] = load <4 x i16>, ptr [[P3]], align 16
+; CHECK-NEXT: [[INTERLEAVE_ELSE:%.*]] = shufflevector <4 x i16> [[LOAD_E0]], <4 x i16> [[LOAD_E1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi <8 x i16> [ [[INTERLEAVE_ELSE]], %[[ELSE]] ], [ [[INTERLEAVE_THEN]], %[[THEN]] ]
+; CHECK-NEXT: [[SHUF0:%.*]] = shufflevector <8 x i16> [[PHI]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[SHUF1:%.*]] = shufflevector <8 x i16> [[PHI]], <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+; CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[SHUF0]], [[SHUF1]]
+; CHECK-NEXT: ret <4 x i16> [[SUB]]
+;
+entry:
+ br i1 %c, label %then, label %else
+
+then:
+ %load_t0 = load <4 x i16>, ptr %p0, align 16
+ %load_t1 = load <4 x i16>, ptr %p1, align 16
+ %interleave_then = shufflevector <4 x i16> %load_t0, <4 x i16> %load_t1, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
+ br label %merge
+
+else:
+ %load_e0 = load <4 x i16>, ptr %p2, align 16
+ %load_e1 = load <4 x i16>, ptr %p3, align 16
+ %interleave_else = shufflevector <4 x i16> %load_e0, <4 x i16> %load_e1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ br label %merge
+
+merge:
+ %phi = phi <8 x i16> [%interleave_else, %else], [%interleave_then, %then]
+ %shuf0 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuf1 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+ %sub = sub <4 x i16> %shuf0, %shuf1
+ ret <4 x i16> %sub
+}
More information about the llvm-commits
mailing list