[llvm] 18127cc - [InstCombine][X86] Try to convert BLENDV(X, Y, SHL()) -> SELECT(ICMP_SGT(0,SHL()),Y,X) (#173389)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 24 04:05:26 PST 2025


Author: Simon Pilgrim
Date: 2025-12-24T12:05:22Z
New Revision: 18127cc079777d2ddd42b50eaa3fc772a10fe7b3

URL: https://github.com/llvm/llvm-project/commit/18127cc079777d2ddd42b50eaa3fc772a10fe7b3
DIFF: https://github.com/llvm/llvm-project/commit/18127cc079777d2ddd42b50eaa3fc772a10fe7b3.diff

LOG: [InstCombine][X86] Try to convert BLENDV(X,Y,SHL()) -> SELECT(ICMP_SGT(0,SHL()),Y,X) (#173389)

We are cautious about converting BLENDV intrinsics to generic selects
because the mask is usually bitcast from another type, often of an
entirely different width (especially for PBLENDVB, which is often used
for all integer element types). Incorrect handling can leave us with
select ops working on the wrong element width, which makes it difficult
for other passes (VectorCombine in particular) to make use of them.

Currently, BLENDV intrinsics are only folded to generic selects when we
know the mask comes from a SEXT(vXi1) bool vector.
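
For reference, a minimal sketch of that already-handled pattern
(illustrative IR only; the function and value names are placeholders):

  define <4 x float> @sext_mask_blendv(<4 x float> %a0, <4 x float> %a1, <4 x i1> %cond) {
    %sext = sext <4 x i1> %cond to <4 x i32>
    %mask = bitcast <4 x i32> %sext to <4 x float>
    %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %mask)
    ret <4 x float> %r
  }
  declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)

Every lane of %mask is either all-ones or all-zeros, so the call can be
rewritten as a plain select on the original bool vector:

  %r = select <4 x i1> %cond, <4 x float> %a1, <4 x float> %a0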

But a second common use is to shift a specific bit up to the MSB of
each blend mask element. This is common in fp mathlib code when working
with bounds etc., and the backend is pretty good at folding the
resulting select back to a VSELECT/BLENDV pattern (often better than
using the shift directly, especially when the shift amount is
non-uniform).
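
For illustration, a minimal sketch of the new fold (value names are
placeholders; as the updated tests show, later InstCombine folds may
simplify the compare further):

  %s = shl <4 x i32> %a2, splat (i32 31)
  %m = bitcast <4 x i32> %s to <4 x float>
  %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %m)

now becomes

  %s = shl <4 x i32> %a2, splat (i32 31)
  %b = icmp sgt <4 x i32> zeroinitializer, %s
  %r = select <4 x i1> %b, <4 x float> %a1, <4 x float> %a0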

I've been looking for other common arithmetic ops that would benefit
from this fold, but haven't found many yet - the remaining cases are
usually signbit/copysign/logic manipulations where it is difficult to
guarantee that we have the correct element types.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
    llvm/test/Transforms/InstCombine/X86/blend_x86.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index c4d349044fe80..ffb8f2ef3643b 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2871,6 +2871,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   case Intrinsic::x86_avx_blendv_pd_256:
   case Intrinsic::x86_avx2_pblendvb: {
     // fold (blend A, A, Mask) -> A
+    auto *OpTy = cast<FixedVectorType>(II.getType());
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
     Value *Mask = II.getArgOperand(2);
@@ -2892,6 +2893,19 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     Mask = InstCombiner::peekThroughBitcast(Mask);
 
+    // Bitshift up to the signbit can always be converted to an efficient
+    // test+select pattern.
+    if (match(Mask, m_Shl(m_Value(), m_Value()))) {
+      if (auto *MaskTy = dyn_cast<FixedVectorType>(Mask->getType())) {
+        if (MaskTy->getScalarSizeInBits() == OpTy->getScalarSizeInBits()) {
+          Value *BoolVec = IC.Builder.CreateICmpSGT(
+              ConstantAggregateZero::get(MaskTy), Mask);
+          Value *Sel = IC.Builder.CreateSelect(BoolVec, Op1, Op0);
+          return new BitCastInst(Sel, II.getType());
+        }
+      }
+    }
+
     // Peek through a one-use shuffle - VectorCombine should have simplified
     // this for cases where we're splitting wider vectors to use blendv
     // intrinsics.
@@ -2915,7 +2929,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         BoolVec->getType()->isVectorTy() &&
         BoolVec->getType()->getScalarSizeInBits() == 1) {
       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
-      auto *OpTy = cast<FixedVectorType>(II.getType());
       unsigned NumMaskElts = MaskTy->getNumElements();
       unsigned NumOperandElts = OpTy->getNumElements();
 

diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
index fb6bd7cdca83a..0916cf7e708ae 100644
--- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
+++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -311,8 +311,9 @@ define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x f
 
 define <16 x i8> @shl_pblendvb_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: @shl_pblendvb_v16i8(
-; CHECK-NEXT:    [[S:%.*]] = shl <16 x i8> [[A2:%.*]], splat (i8 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]], <16 x i8> [[S]])
+; CHECK-NEXT:    [[S_MASK:%.*]] = and <16 x i8> [[A2:%.*]], splat (i8 64)
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq <16 x i8> [[S_MASK]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = select <16 x i1> [[DOTNOT]], <16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]]
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %s = shl <16 x i8> %a2, splat (i8 1)
@@ -322,8 +323,8 @@ define <16 x i8> @shl_pblendvb_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2
 
 define <32 x i8> @shl_pblendvb_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; CHECK-LABEL: @shl_pblendvb_v32i8(
-; CHECK-NEXT:    [[S:%.*]] = shl nuw <32 x i8> splat (i8 1), [[A2:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]], <32 x i8> [[S]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <32 x i8> [[A2:%.*]], splat (i8 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A1:%.*]], <32 x i8> [[A0:%.*]]
 ; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
 ;
   %s = shl <32 x i8> splat (i8 1), %a2
@@ -333,9 +334,9 @@ define <32 x i8> @shl_pblendvb_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2
 
 define <4 x float> @shl_blendvps_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
 ; CHECK-LABEL: @shl_blendvps_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = shl <4 x i32> [[A2:%.*]], splat (i32 31)
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[S]] to <4 x float>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[B]])
+; CHECK-NEXT:    [[S_MASK:%.*]] = and <4 x i32> [[A2:%.*]], splat (i32 1)
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq <4 x i32> [[S_MASK]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[DOTNOT]], <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = shl <4 x i32> %a2, splat (i32 31)
@@ -346,9 +347,8 @@ define <4 x float> @shl_blendvps_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i3
 
 define <4 x double> @shl_blendvpd_v4f64(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: @shl_blendvpd_v4f64(
-; CHECK-NEXT:    [[S:%.*]] = shl nuw <4 x i64> splat (i64 1), [[A2:%.*]]
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i64> [[S]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[A2:%.*]], splat (i64 63)
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[A1:%.*]], <4 x double> [[A0:%.*]]
 ; CHECK-NEXT:    ret <4 x double> [[TMP2]]
 ;
   %s = shl <4 x i64> splat (i64 1), %a2


        

