[llvm] 0d2a0b4 - [VectorCombine] scalarize binop of inserted elements into vector constants
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri May 8 13:36:17 PDT 2020
Author: Sanjay Patel
Date: 2020-05-08T16:31:12-04:00
New Revision: 0d2a0b44c812e5aafa418e4806171867f3181cbe
URL: https://github.com/llvm/llvm-project/commit/0d2a0b44c812e5aafa418e4806171867f3181cbe
DIFF: https://github.com/llvm/llvm-project/commit/0d2a0b44c812e5aafa418e4806171867f3181cbe.diff
LOG: [VectorCombine] scalarize binop of inserted elements into vector constants
As with the extractelement patterns that are currently in vector-combine,
there are going to be several possible variations on this theme. This
should be the clearest, simplest example.
Scalarization is the right direction for target-independent canonicalization,
and InstCombine has some of those folds already, but it doesn't do this.
I proposed a similar transform in D50992. Here in vector-combine, we can
check the cost model to be sure it's profitable, so there should be less risk.
Differential Revision: https://reviews.llvm.org/D79452
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 02f42a1b4fe6..d72d0baf37f4 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -34,6 +34,7 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumScalarBO, "Number of scalar binops formed");
static cl::opt<bool> DisableVectorCombine(
"disable-vector-combine", cl::init(false), cl::Hidden,
@@ -308,6 +309,64 @@ static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
return true;
}
+/// Match a vector binop instruction with inserted scalar operands and convert
+/// to scalar binop followed by insertelement.
+static bool scalarizeBinop(Instruction &I, const TargetTransformInfo &TTI) {
+ Instruction *Ins0, *Ins1;
+ if (!match(&I, m_BinOp(m_Instruction(Ins0), m_Instruction(Ins1))))
+ return false;
+
+ // TODO: Loosen restriction for one-use by adjusting cost equation.
+ // TODO: Deal with mismatched index constants and variable indexes?
+ Constant *VecC0, *VecC1;
+ Value *V0, *V1;
+ uint64_t Index;
+ if (!match(Ins0, m_OneUse(m_InsertElement(m_Constant(VecC0), m_Value(V0),
+ m_ConstantInt(Index)))) ||
+ !match(Ins1, m_OneUse(m_InsertElement(m_Constant(VecC1), m_Value(V1),
+ m_SpecificInt(Index)))))
+ return false;
+
+ Type *ScalarTy = V0->getType();
+ Type *VecTy = I.getType();
+ assert(VecTy->isVectorTy() && ScalarTy == V1->getType() &&
+ (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy()) &&
+ "Unexpected types for insert into binop");
+
+ Instruction::BinaryOps Opcode = cast<BinaryOperator>(&I)->getOpcode();
+ int ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ int VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+
+ // Get cost estimate for the insert element. This cost will factor into
+ // both sequences.
+ int InsertCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ int OldCost = InsertCost + InsertCost + VectorOpCost;
+ int NewCost = ScalarOpCost + InsertCost;
+
+ // We want to scalarize unless the vector variant actually has lower cost.
+ if (OldCost < NewCost)
+ return false;
+
+ // vec_bo (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
+ // inselt NewVecC, (scalar_bo V0, V1), Index
+ ++NumScalarBO;
+ IRBuilder<> Builder(&I);
+ Value *Scalar = Builder.CreateBinOp(Opcode, V0, V1, I.getName() + ".scalar");
+
+ // All IR flags are safe to back-propagate. There is no potential for extra
+ // poison to be created by the scalar instruction.
+ if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
+ ScalarInst->copyIRFlags(&I);
+
+ // Fold the vector constants in the original vectors into a new base vector.
+ Constant *NewVecC = ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+ I.replaceAllUsesWith(Insert);
+ Insert->takeName(&I);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -330,6 +389,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
continue;
MadeChange |= foldExtractExtract(I, TTI);
MadeChange |= foldBitcastShuf(I, TTI);
+ MadeChange |= scalarizeBinop(I, TTI);
}
}
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
index 8136de8b3037..52b2b8cee6a9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
@@ -8,9 +8,8 @@ declare void @use(<4 x i32>)
define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
; CHECK-LABEL: @ins0_ins0_add(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 0
-; CHECK-NEXT: [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
-; CHECK-NEXT: [[R:%.*]] = add <16 x i8> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <16 x i8> undef, i8 [[R_SCALAR]], i64 0
; CHECK-NEXT: ret <16 x i8> [[R]]
;
%i0 = insertelement <16 x i8> undef, i8 %x, i32 0
@@ -23,9 +22,8 @@ define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
define <8 x i16> @ins0_ins0_sub_flags(i16 %x, i16 %y) {
; CHECK-LABEL: @ins0_ins0_sub_flags(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <8 x i16> undef, i16 [[X:%.*]], i8 5
-; CHECK-NEXT: [[I1:%.*]] = insertelement <8 x i16> undef, i16 [[Y:%.*]], i32 5
-; CHECK-NEXT: [[R:%.*]] = sub nuw nsw <8 x i16> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = sub nuw nsw i16 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[R_SCALAR]], i64 5
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%i0 = insertelement <8 x i16> undef, i16 %x, i8 5
@@ -34,11 +32,13 @@ define <8 x i16> @ins0_ins0_sub_flags(i16 %x, i16 %y) {
ret <8 x i16> %r
}
+; The new vector constant is calculated by constant folding.
+; This is conservatively created as zero rather than undef for 'undef ^ undef'.
+
define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_xor(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i64 1
-; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> undef, i64 [[Y:%.*]], i32 1
-; CHECK-NEXT: [[R:%.*]] = xor <2 x i64> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = xor i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%i0 = insertelement <2 x i64> undef, i64 %x, i64 1
@@ -51,9 +51,8 @@ define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) {
define <2 x double> @ins0_ins0_fadd(double %x, double %y) {
; CHECK-LABEL: @ins0_ins0_fadd(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
-; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x double> undef, double [[Y:%.*]], i32 0
-; CHECK-NEXT: [[R:%.*]] = fadd reassoc nsz <2 x double> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = fadd reassoc nsz double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> undef, double [[R_SCALAR]], i64 0
; CHECK-NEXT: ret <2 x double> [[R]]
;
%i0 = insertelement <2 x double> undef, double %x, i32 0
@@ -62,6 +61,8 @@ define <2 x double> @ins0_ins0_fadd(double %x, double %y) {
ret <2 x double> %r
}
+; Negative test - mismatched indexes (but could fold this).
+
define <16 x i8> @ins1_ins0_add(i8 %x, i8 %y) {
; CHECK-LABEL: @ins1_ins0_add(
; CHECK-NEXT: [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 1
@@ -75,11 +76,12 @@ define <16 x i8> @ins1_ins0_add(i8 %x, i8 %y) {
ret <16 x i8> %r
}
+; Base vector does not have to be undef.
+
define <4 x i32> @ins0_ins0_mul(i32 %x, i32 %y) {
; CHECK-LABEL: @ins0_ins0_mul(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[X:%.*]], i32 0
-; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
-; CHECK-NEXT: [[R:%.*]] = mul <4 x i32> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%i0 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
@@ -88,11 +90,12 @@ define <4 x i32> @ins0_ins0_mul(i32 %x, i32 %y) {
ret <4 x i32> %r
}
+; It is safe to scalarize any binop (no extra UB/poison danger).
+
define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_sdiv(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> <i64 42, i64 -42>, i64 [[X:%.*]], i64 1
-; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> <i64 -7, i64 128>, i64 [[Y:%.*]], i32 1
-; CHECK-NEXT: [[R:%.*]] = sdiv <2 x i64> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = sdiv i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> <i64 -6, i64 0>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%i0 = insertelement <2 x i64> <i64 42, i64 -42>, i64 %x, i64 1
@@ -101,11 +104,12 @@ define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) {
ret <2 x i64> %r
}
+; Constant folding deals with undef per element - the entire value does not become undef.
+
define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_udiv(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> <i64 42, i64 undef>, i64 [[X:%.*]], i32 1
-; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> <i64 7, i64 undef>, i64 [[Y:%.*]], i32 1
-; CHECK-NEXT: [[R:%.*]] = udiv <2 x i64> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = udiv i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 undef>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i32 1
@@ -114,11 +118,13 @@ define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
ret <2 x i64> %r
}
+; This could be simplified -- creates immediate UB without the transform because
+; divisor has an undef element -- but that is hidden after the transform.
+
define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_urem(
-; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i64> <i64 42, i64 undef>, i64 [[X:%.*]], i64 1
-; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i64> <i64 undef, i64 128>, i64 [[Y:%.*]], i32 1
-; CHECK-NEXT: [[R:%.*]] = urem <2 x i64> [[I0]], [[I1]]
+; CHECK-NEXT: [[R_SCALAR:%.*]] = urem i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i64 1
@@ -127,6 +133,9 @@ define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
ret <2 x i64> %r
}
+; Negative test
+; TODO: extra use can be accounted for in cost calculation.
+
define <4 x i32> @ins0_ins0_xor(i32 %x, i32 %y) {
; CHECK-LABEL: @ins0_ins0_xor(
; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
More information about the llvm-commits mailing list