[llvm] 433897d - [InstCombine][X86] simplifyX86immShift - convert variable in-range vector shift by immediate amounts to generic shifts (PR40391)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 19 08:48:05 PDT 2020
Author: Simon Pilgrim
Date: 2020-03-19T15:44:24Z
New Revision: 433897da4abd103a3b72f6c505a2807ab148665a
URL: https://github.com/llvm/llvm-project/commit/433897da4abd103a3b72f6c505a2807ab148665a
DIFF: https://github.com/llvm/llvm-project/commit/433897da4abd103a3b72f6c505a2807ab148665a.diff
LOG: [InstCombine][X86] simplifyX86immShift - convert variable in-range vector shift by immediate amounts to generic shifts (PR40391)
The slli/srli/srai 'immediate' vector shifts (although the shift amount is no longer required to be an immediate, to match gcc) can be replaced with generic shifts if the shift amount is known to be in range.
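For example (a rough sketch mirroring the updated sse2_psrai_w_128_masked test below), a shift whose amount has been masked into range:

  %1 = and i32 %a, 15
  %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 %1)

can now be folded into a generic vector shift by a splatted amount:

  %1 = trunc i32 %a to i16
  %2 = and i16 %1, 15
  %3 = insertelement <8 x i16> undef, i16 %2, i32 0
  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
  %5 = ashr <8 x i16> %v, %4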
Added:
Modified:
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 2138017606b7..d655cbbc3b0b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -296,77 +296,109 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
bool LogicalShift = false;
bool ShiftLeft = false;
+ bool IsImm = false;
switch (II.getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx512_psra_q_128:
case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx512_psra_q_256:
case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
case Intrinsic::x86_avx512_psrai_d_512:
case Intrinsic::x86_avx512_psrai_q_512:
case Intrinsic::x86_avx512_psrai_w_512:
- LogicalShift = false; ShiftLeft = false;
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
break;
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
case Intrinsic::x86_avx512_psrli_d_512:
case Intrinsic::x86_avx512_psrli_q_512:
case Intrinsic::x86_avx512_psrli_w_512:
- LogicalShift = true; ShiftLeft = false;
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
break;
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_psll_w_512:
case Intrinsic::x86_avx512_pslli_d_512:
case Intrinsic::x86_avx512_pslli_q_512:
case Intrinsic::x86_avx512_pslli_w_512:
- LogicalShift = true; ShiftLeft = true;
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
break;
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<VectorType>(Vec->getType());
+ auto SVT = VT->getElementType();
+ unsigned VWidth = VT->getNumElements();
+ unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift.
+ if (IsImm) {
+ assert(Amt->getType()->isIntegerTy(32) &&
+ "Unexpected shift-by-immediate type");
+ KnownBits KnownAmtBits =
+ llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
+ if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
+ Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
+ Amt = Builder.CreateVectorSplat(VWidth, Amt);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ }
+
// Simplify if count is constant.
- auto Arg1 = II.getArgOperand(1);
- auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
- auto CDV = dyn_cast<ConstantDataVector>(Arg1);
- auto CInt = dyn_cast<ConstantInt>(Arg1);
+ auto CAZ = dyn_cast<ConstantAggregateZero>(Amt);
+ auto CDV = dyn_cast<ConstantDataVector>(Amt);
+ auto CInt = dyn_cast<ConstantInt>(Amt);
if (!CAZ && !CDV && !CInt)
return nullptr;
@@ -390,12 +422,6 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
else if (CInt)
Count = CInt->getValue();
- auto Vec = II.getArgOperand(0);
- auto VT = cast<VectorType>(Vec->getType());
- auto SVT = VT->getElementType();
- unsigned VWidth = VT->getNumElements();
- unsigned BitWidth = SVT->getPrimitiveSizeInBits();
-
// If shift-by-zero then just return the original value.
if (Count.isNullValue())
return Vec;
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index 270cc37b7fd9..e8219810dcaf 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -2680,9 +2680,12 @@ define <32 x i16> @avx512_psllv_w_512_undef(<32 x i16> %v) {
define <8 x i16> @sse2_psrai_w_128_masked(<8 x i16> %v, i32 %a) {
; CHECK-LABEL: @sse2_psrai_w_128_masked(
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 15
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> [[V:%.*]], i32 [[TMP1]])
-; CHECK-NEXT: ret <8 x i16> [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
;
%1 = and i32 %a, 15
%2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 %1)
@@ -2692,7 +2695,9 @@ define <8 x i16> @sse2_psrai_w_128_masked(<8 x i16> %v, i32 %a) {
define <8 x i32> @avx2_psrai_d_256_masked(<8 x i32> %v, i32 %a) {
; CHECK-LABEL: @avx2_psrai_d_256_masked(
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> [[V:%.*]], i32 [[TMP1]])
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[V:%.*]], [[DOTSPLAT]]
; CHECK-NEXT: ret <8 x i32> [[TMP2]]
;
%1 = and i32 %a, 31
@@ -2703,8 +2708,11 @@ define <8 x i32> @avx2_psrai_d_256_masked(<8 x i32> %v, i32 %a) {
define <8 x i64> @avx512_psrai_q_512_masked(<8 x i64> %v, i32 %a) {
; CHECK-LABEL: @avx512_psrai_q_512_masked(
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[V:%.*]], i32 [[TMP1]])
-; CHECK-NEXT: ret <8 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
;
%1 = and i32 %a, 63
%2 = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %v, i32 %1)
@@ -2714,7 +2722,9 @@ define <8 x i64> @avx512_psrai_q_512_masked(<8 x i64> %v, i32 %a) {
define <4 x i32> @sse2_psrli_d_128_masked(<4 x i32> %v, i32 %a) {
; CHECK-LABEL: @sse2_psrli_d_128_masked(
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> [[V:%.*]], i32 [[TMP1]])
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[V:%.*]], [[DOTSPLAT]]
; CHECK-NEXT: ret <4 x i32> [[TMP2]]
;
%1 = and i32 %a, 31
@@ -2725,8 +2735,11 @@ define <4 x i32> @sse2_psrli_d_128_masked(<4 x i32> %v, i32 %a) {
define <4 x i64> @avx2_psrli_q_256_masked(<4 x i64> %v, i32 %a) {
; CHECK-LABEL: @avx2_psrli_q_256_masked(
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> [[V:%.*]], i32 [[TMP1]])
-; CHECK-NEXT: ret <4 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <4 x i64> [[TMP3]]
;
%1 = and i32 %a, 63
%2 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 %1)
@@ -2735,9 +2748,12 @@ define <4 x i64> @avx2_psrli_q_256_masked(<4 x i64> %v, i32 %a) {
define <32 x i16> @avx512_psrli_w_512_masked(<32 x i16> %v, i32 %a) {
; CHECK-LABEL: @avx512_psrli_w_512_masked(
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 15
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> [[V:%.*]], i32 [[TMP1]])
-; CHECK-NEXT: ret <32 x i16> [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <32 x i16> [[TMP3]]
;
%1 = and i32 %a, 15
%2 = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %v, i32 %1)
@@ -2747,8 +2763,11 @@ define <32 x i16> @avx512_psrli_w_512_masked(<32 x i16> %v, i32 %a) {
define <2 x i64> @sse2_pslli_q_128_masked(<2 x i64> %v, i32 %a) {
; CHECK-LABEL: @sse2_pslli_q_128_masked(
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> [[V:%.*]], i32 [[TMP1]])
-; CHECK-NEXT: ret <2 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
;
%1 = and i32 %a, 63
%2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 %1)
@@ -2757,9 +2776,12 @@ define <2 x i64> @sse2_pslli_q_128_masked(<2 x i64> %v, i32 %a) {
define <16 x i16> @avx2_pslli_w_256_masked(<16 x i16> %v, i32 %a) {
; CHECK-LABEL: @avx2_pslli_w_256_masked(
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 15
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> [[V:%.*]], i32 [[TMP1]])
-; CHECK-NEXT: ret <16 x i16> [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <16 x i16> [[TMP3]]
;
%1 = and i32 %a, 15
%2 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 %1)
@@ -2769,7 +2791,9 @@ define <16 x i16> @avx2_pslli_w_256_masked(<16 x i16> %v, i32 %a) {
define <16 x i32> @avx512_pslli_d_512_masked(<16 x i32> %v, i32 %a) {
; CHECK-LABEL: @avx512_pslli_d_512_masked(
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[V:%.*]], i32 [[TMP1]])
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i32> [[V:%.*]], [[DOTSPLAT]]
; CHECK-NEXT: ret <16 x i32> [[TMP2]]
;
%1 = and i32 %a, 31