[llvm] [InstCombine][X86] Try to convert BLENDV(X,Y,SHL()) -> SELECT(ICMP_SGT(0,SHL()),Y,X) (PR #173389)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 24 03:28:48 PST 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/173389

From 3ce4b3e0d5afb0de3d3f3f75df3a4f68d65349c2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 23 Dec 2025 15:41:33 +0000
Subject: [PATCH] [InstCombine][X86] Try to convert BLENDV(X,Y,SHL()) ->
 SELECT(ICMP_SGT(0,SHL()),Y,X)

We are cautious about converting BLENDV intrinsics to generic selects because the mask is often bitcast from another type, frequently of an entirely different element width (especially for PBLENDVB, which is commonly used for blends of all the integer types) - handling this incorrectly can leave us with select ops working on the wrong element width, which makes it difficult for other passes (VectorCombine in particular) to make use of them.
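
For illustration only (hand-written IR, not taken from the patch or its tests), this is the kind of mask the caution is about - a <16 x i8> PBLENDVB mask that is really a bitcast <4 x i32> compare, so the natural IR form is a <4 x i1> select over i32-sized elements, not a <16 x i1> select over bytes:

  %cmp   = icmp slt <4 x i32> %c, zeroinitializer
  %sext  = sext <4 x i1> %cmp to <4 x i32>
  %mask  = bitcast <4 x i32> %sext to <16 x i8>
  %blend = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask)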

Currently, BLENDV intrinsics are only folded to generic selects when we know the mask comes from a SEXT(vXi1) bool vector.
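
In IR terms, a sketch of the already-handled case (hand-written, not a test from this patch):

  %sext = sext <4 x i1> %cond to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %r    = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x, <4 x float> %y, <4 x float> %mask)
  ; folds to: %r = select <4 x i1> %cond, <4 x float> %y, <4 x float> %x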

But a second common use is to shift a specific bit up to the MSB of the blend mask - this is common in fp mathlib code when working with bounds etc., and the backend is good at folding the resulting select back into a VSELECT/BLENDV pattern (often producing better code than using the shift directly, especially when the shift amount is non-uniform).
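
As a sketch of what the new fold produces (the updated tests below show the final form, after later InstCombine folds simplify the shift+compare further):

  %s = shl <4 x i32> %a2, splat (i32 31)
  %m = bitcast <4 x i32> %s to <4 x float>
  %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x, <4 x float> %y, <4 x float> %m)
  ; becomes:
  ;   %b = icmp sgt <4 x i32> zeroinitializer, %s   ; is the signbit of %s set?
  ;   %r = select <4 x i1> %b, <4 x float> %y, <4 x float> %x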

I've been looking for other common arithmetic ops that would benefit from this fold, but haven't found much yet - the remainder are usually signbit/copysign/logic manipulations for which it is difficult to guarantee that we have the correct element types.
---
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 15 +++++++++++++-
 .../Transforms/InstCombine/X86/blend_x86.ll   | 20 +++++++++----------
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index c4d349044fe80..ffb8f2ef3643b 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2871,6 +2871,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   case Intrinsic::x86_avx_blendv_pd_256:
   case Intrinsic::x86_avx2_pblendvb: {
     // fold (blend A, A, Mask) -> A
+    auto *OpTy = cast<FixedVectorType>(II.getType());
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
     Value *Mask = II.getArgOperand(2);
@@ -2892,6 +2893,19 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     Mask = InstCombiner::peekThroughBitcast(Mask);
 
+    // A bitshift up to the signbit can always be converted to an efficient
+    // test+select pattern.
+    if (match(Mask, m_Shl(m_Value(), m_Value()))) {
+      if (auto *MaskTy = dyn_cast<FixedVectorType>(Mask->getType())) {
+        if (MaskTy->getScalarSizeInBits() == OpTy->getScalarSizeInBits()) {
+          Value *BoolVec = IC.Builder.CreateICmpSGT(
+              ConstantAggregateZero::get(MaskTy), Mask);
+          Value *Sel = IC.Builder.CreateSelect(BoolVec, Op1, Op0);
+          return new BitCastInst(Sel, II.getType());
+        }
+      }
+    }
+
     // Peek through a one-use shuffle - VectorCombine should have simplified
     // this for cases where we're splitting wider vectors to use blendv
     // intrinsics.
@@ -2915,7 +2929,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         BoolVec->getType()->isVectorTy() &&
         BoolVec->getType()->getScalarSizeInBits() == 1) {
       auto *MaskTy = cast<FixedVectorType>(Mask->getType());
-      auto *OpTy = cast<FixedVectorType>(II.getType());
       unsigned NumMaskElts = MaskTy->getNumElements();
       unsigned NumOperandElts = OpTy->getNumElements();
 
diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
index fb6bd7cdca83a..0916cf7e708ae 100644
--- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
+++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -311,8 +311,9 @@ define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x f
 
 define <16 x i8> @shl_pblendvb_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: @shl_pblendvb_v16i8(
-; CHECK-NEXT:    [[S:%.*]] = shl <16 x i8> [[A2:%.*]], splat (i8 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]], <16 x i8> [[S]])
+; CHECK-NEXT:    [[S_MASK:%.*]] = and <16 x i8> [[A2:%.*]], splat (i8 64)
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq <16 x i8> [[S_MASK]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = select <16 x i1> [[DOTNOT]], <16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]]
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %s = shl <16 x i8> %a2, splat (i8 1)
@@ -322,8 +323,8 @@ define <16 x i8> @shl_pblendvb_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2
 
 define <32 x i8> @shl_pblendvb_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; CHECK-LABEL: @shl_pblendvb_v32i8(
-; CHECK-NEXT:    [[S:%.*]] = shl nuw <32 x i8> splat (i8 1), [[A2:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]], <32 x i8> [[S]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <32 x i8> [[A2:%.*]], splat (i8 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A1:%.*]], <32 x i8> [[A0:%.*]]
 ; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
 ;
   %s = shl <32 x i8> splat (i8 1), %a2
@@ -333,9 +334,9 @@ define <32 x i8> @shl_pblendvb_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2
 
 define <4 x float> @shl_blendvps_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
 ; CHECK-LABEL: @shl_blendvps_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = shl <4 x i32> [[A2:%.*]], splat (i32 31)
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[S]] to <4 x float>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[B]])
+; CHECK-NEXT:    [[S_MASK:%.*]] = and <4 x i32> [[A2:%.*]], splat (i32 1)
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq <4 x i32> [[S_MASK]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[DOTNOT]], <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %s = shl <4 x i32> %a2, splat (i32 31)
@@ -346,9 +347,8 @@ define <4 x float> @shl_blendvps_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i3
 
 define <4 x double> @shl_blendvpd_v4f64(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: @shl_blendvpd_v4f64(
-; CHECK-NEXT:    [[S:%.*]] = shl nuw <4 x i64> splat (i64 1), [[A2:%.*]]
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i64> [[S]] to <4 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[A2:%.*]], splat (i64 63)
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[A1:%.*]], <4 x double> [[A0:%.*]]
 ; CHECK-NEXT:    ret <4 x double> [[TMP2]]
 ;
   %s = shl <4 x i64> splat (i64 1), %a2


