[llvm] baab4aa - [VectorCombine] convert scalar fneg with insert/extract to vector fneg
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 10 12:00:10 PDT 2022
Author: Sanjay Patel
Date: 2022-10-10T14:59:56-04:00
New Revision: baab4aa1ba5f68634b4936375e19c8686b1b474a
URL: https://github.com/llvm/llvm-project/commit/baab4aa1ba5f68634b4936375e19c8686b1b474a
DIFF: https://github.com/llvm/llvm-project/commit/baab4aa1ba5f68634b4936375e19c8686b1b474a.diff
LOG: [VectorCombine] convert scalar fneg with insert/extract to vector fneg
insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> shuffle DestVec, (fneg SrcVec), Mask

This is a specialized form of what could be a more general fold for a binop.
It's also possible that fneg is overlooked by SLP in this kind of
insert/extract pattern since it's a unary op.

This shows up in the motivating example from issue #58139, but it won't solve
it (that probably requires some x86-specific backend changes). There are also
some small enhancements (see TODO comments) that can be done as follow-up
patches.
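
As a concrete illustration (this is the updated ext2_v4f32 test from this
patch), the fold rewrites:

  %e = extractelement <4 x float> %x, i32 2
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 2

into a full-width fneg plus a select-shuffle whose mask is the identity
except that the inserted lane (index 2) is taken from the negated vector:

  %1 = fneg <4 x float> %x
  %r = shufflevector <4 x float> %y, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 3>

Fast-math flags on the scalar fneg carry over to the vector fneg (see the
nsz case in ext1_v2f64 below).
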
Differential Revision: https://reviews.llvm.org/D135278
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 8a2eaafacf491..7046859c26d61 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -30,6 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
+#include <numeric>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -97,6 +98,7 @@ class VectorCombine {
void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
Instruction &I);
bool foldExtractExtract(Instruction &I);
+ bool foldInsExtFNeg(Instruction &I);
bool foldBitcastShuf(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);
bool foldExtractedCmps(Instruction &I);
@@ -533,6 +535,67 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
return true;
}
+/// Try to replace an extract + scalar fneg + insert with a vector fneg +
+/// shuffle.
+bool VectorCombine::foldInsExtFNeg(Instruction &I) {
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ // Match an insert (op (extract)) pattern.
+ Value *DestVec;
+ uint64_t Index;
+ Instruction *FNeg;
+ if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
+ m_ConstantInt(Index))))
+ return false;
+
+ Value *SrcVec;
+ if (!match(FNeg, m_FNeg(m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))
+ return false;
+
+ if (SrcVec->getType() != VecTy)
+ return false;
+
+ // Ignore bogus insert/extract index.
+ unsigned NumElts = VecTy->getNumElements();
+ if (Index >= NumElts)
+ return false;
+
+ // We are inserting the negated element into the same lane that we extracted
+ // from. This is equivalent to a select-shuffle that chooses all but the
+ // negated element from the destination vector.
+ SmallVector<int> Mask(NumElts);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ Mask[Index] = Index + NumElts;
+
+ Type *ScalarTy = VecTy->getScalarType();
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
+ TTI.getVectorInstrCost(I, VecTy, Index);
+
+ // If the extract has one use, it will be eliminated, so count it in the
+ // original cost. If it has more than one use, ignore the cost because it will
+ // be the same before/after.
+ Instruction *Extract = cast<Instruction>(FNeg->getOperand(0));
+ if (Extract->hasOneUse())
+ OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index);
+
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+ if (NewCost > OldCost)
+ return false;
+
+ // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
+ // shuffle DestVec, (fneg SrcVec), Mask
+ Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
+ Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+ replaceValue(I, *Shuf);
+ return true;
+}
+
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -1571,6 +1634,7 @@ bool VectorCombine::run() {
if (!ScalarizationOnly) {
MadeChange |= vectorizeLoadInsert(I);
MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldInsExtFNeg(I);
MadeChange |= foldBitcastShuf(I);
MadeChange |= foldExtractedCmps(I);
MadeChange |= foldShuffleOfBinops(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 11c82d86d582b..0abccb128b734 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -1,9 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
declare void @use(float)
+; TODO: The insert is costed as free, so creating a shuffle appears to be a loss.
+
define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext0_v4f32(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
@@ -21,9 +23,8 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext2_v4f32(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 2
@@ -36,9 +37,8 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v2f64(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
@@ -47,12 +47,20 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %r
}
+; The vector fneg would cost twice as much as the scalar op with SSE,
+; so we don't transform there (the shuffle would also be more expensive).
+
define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v8f32(
-; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
-; CHECK-NEXT: ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v8f32(
+; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; SSE-NEXT: [[N:%.*]] = fneg float [[E]]
+; SSE-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
+; SSE-NEXT: ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; AVX-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <8 x float> %x, i32 7
%n = fneg float %e
@@ -60,13 +68,22 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}
+; Same as above with an extra use of the extracted element.
+
define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v8f32_use1(
-; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
-; CHECK-NEXT: call void @use(float [[E]])
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
-; CHECK-NEXT: ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v8f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[N:%.*]] = fneg float [[E]]
+; SSE-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
+; SSE-NEXT: ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
+; AVX-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 6, i32 7>
+; AVX-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <8 x float> %x, i32 5
call void @use(float %e)
@@ -75,6 +92,8 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}
+; Negative test - the transform is likely not profitable if the fneg has another use.
+
define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v8f32_use2(
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 3
@@ -90,6 +109,8 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}
+; Negative test - can't convert variable index to a shuffle.
+
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
; CHECK-LABEL: @ext_index_var_v2f64(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
@@ -103,6 +124,9 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
ret <2 x double> %r
}
+; Negative test - require same extract/insert index for simple shuffle.
+; TODO: We could handle this by adjusting the cost calculation.
+
define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v2f64_ins0(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
@@ -116,6 +140,8 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %r
}
+; Negative test - avoid changing poison ops
+
define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext12_v4f32(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 12