[llvm] [InstCombine] Move extends across identity shuffles. (PR #146901)

Thu Jul 3 07:22:53 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

Add a new fold to instcombine to move SExt/ZExt across identity
shuffles, applying the cast after the shuffle. This sinks extends and
can enable more general additional folding of both shuffles (and
related instructions) and extends. If backends prefer splitting up doing
casts first, the extends can be hoisted again in VectorCombine for
example.

A larger example is included in the load_i32_zext_to_v4i32. The wider
extend is easier to compute an accurate cost for and targets (like
AArch64) can lower a single wider extend more efficiently than multiple
separate extends.

This is a generalization of a VectorCombine version
(https://github.com/llvm/llvm-project/pull/141109) as suggested by
@preames.

---
Full diff: https://github.com/llvm/llvm-project/pull/146901.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (+19-5) 
- (modified) llvm/test/Transforms/InstCombine/X86/blend_x86.ll (+1-1) 
- (added) llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll (+104) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll (+32-103) 


``````````diff

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index a746a5229fb9a..a31fd68dc7165 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2559,17 +2559,16 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf,
 /// Canonicalize casts after shuffle.
 static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
                                     InstCombiner::BuilderTy &Builder) {
-  // Do we have 2 matching cast operands?
   auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0));
-  auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
-  if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
-      Cast0->getSrcTy() != Cast1->getSrcTy())
+  if (!Cast0)
     return nullptr;
 
   // TODO: Allow other opcodes? That would require easing the type restrictions
   //       below here.
   CastInst::CastOps CastOpcode = Cast0->getOpcode();
   switch (CastOpcode) {
+  case Instruction::SExt:
+  case Instruction::ZExt:
   case Instruction::FPToSI:
   case Instruction::FPToUI:
   case Instruction::SIToFP:
@@ -2579,15 +2578,30 @@ static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
     return nullptr;
   }
 
+  VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
   VectorType *ShufTy = Shuf.getType();
   VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType());
-  VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
 
   // TODO: Allow length-increasing shuffles?
   if (ShufTy->getElementCount().getKnownMinValue() >
       ShufOpTy->getElementCount().getKnownMinValue())
     return nullptr;
 
+  // shuffle (cast X), Y, identity-with-extract-mask -->
+  // cast (shuffle X, Y, identity-with-extract-mask).
+  if (Cast0->hasOneUse() && Shuf.isIdentityWithExtract()) {
+    auto *NewIns = Builder.CreateShuffleVector(Cast0->getOperand(0),
+                                               PoisonValue::get(CastSrcTy),
+                                               Shuf.getShuffleMask());
+    return CastInst::Create(Cast0->getOpcode(), NewIns, Shuf.getType());
+  }
+
+  auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
+  // Do we have 2 matching cast operands?
+  if (!Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
+      Cast0->getSrcTy() != Cast1->getSrcTy())
+    return nullptr;
+
   // TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)?
   assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) &&
          "Expected fixed vector operands for casts and binary shuffle");
diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
index aa49f493c9fa1..5c3b0beefbb66 100644
--- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
+++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -287,9 +287,9 @@ define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x f
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[A_BC:%.*]] = bitcast <8 x float> [[A]] to <8 x i32>
 ; CHECK-NEXT:    [[B_BC:%.*]] = bitcast <8 x float> [[B]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[A_LO:%.*]] = shufflevector <8 x i32> [[A_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[B_LO:%.*]] = shufflevector <8 x i32> [[B_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[B_LO]], <4 x i32> [[A_LO]]
 ; CHECK-NEXT:    [[RES:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
 ; CHECK-NEXT:    ret <4 x float> [[RES]]
diff --git a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
new file mode 100644
index 0000000000000..84ce83d40bee9
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p instcombine -S %s | FileCheck %s
+
+define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(<8 x i8> %x) {
+; CHECK-LABEL: define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(
+; CHECK-SAME: <8 x i8> [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+  %e.1 = zext <8 x i8> %x to <8 x i16>
+  %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(<4 x i16> %x) {
+; CHECK-LABEL: define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[X]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = zext <3 x i16> [[TMP0]] to <3 x i32>
+; CHECK-NEXT:    ret <3 x i32> [[SHUFFLE]]
+;
+entry:
+  %e.1 = zext <4 x i16> %x to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %e.1, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i32> %shuffle
+}
+
+define <4 x i16> @ext_no_identity_mask1(<8 x i8> %in) {
+; CHECK-LABEL: define <4 x i16> @ext_no_identity_mask1(
+; CHECK-SAME: <8 x i8> [[IN:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[IN]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+  %e.1 = zext <8 x i8> %in to <8 x i16>
+  %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @ext_no_identity_mask2(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <4 x i16> @ext_no_identity_mask2(
+; CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+  %e.1 = zext <8 x i8> %x to <8 x i16>
+  %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %shuffle
+}
+
+define <5 x i32> @ext_identity_mask_first_vector_first_half_5xi32(<4 x i16> %x) {
+; CHECK-LABEL: define <5 x i32> @ext_identity_mask_first_vector_first_half_5xi32(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[E_1:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[E_1]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    ret <5 x i32> [[SHUFFLE]]
+;
+entry:
+  %e.1 = zext <4 x i16> %x to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %e.1, <4 x i32> %e.1, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <5 x i32> %shuffle
+}
+
+define <4 x i16> @ext_no_identity_mask_first_vector_second_half(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: define <4 x i16> @ext_no_identity_mask_first_vector_second_half(
+; CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i16> [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> [[Y]], <4 x i32> <i32 0, i32 9, i32 1, i32 10>
+; CHECK-NEXT:    ret <4 x i16> [[SHUFFLE]]
+;
+entry:
+  %e.1 = zext <8 x i8> %x to <8 x i16>
+  %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> %y, <4 x i32> <i32 0, i32 9, i32 1, i32 10>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
+; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
+; CHECK-SAME: ptr [[DI:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L1:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
+;
+entry:
+  %l = load i32, ptr %di
+  %vec.ins = insertelement <2 x i32> <i32 poison, i32 0>, i32 %l, i64 0
+  %vec.bc = bitcast <2 x i32> %vec.ins to <8 x i8>
+  %e.1 = zext <8 x i8> %vec.bc to <8 x i16>
+  %vec.shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %ext.2 = zext nneg <4 x i16> %vec.shuffle to <4 x i32>
+  ret <4 x i32> %ext.2
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
index daf4a7b799dd4..dbce77698eb07 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
@@ -12,20 +12,10 @@
 ;
 
 define <4 x double> @x86_pblendvb_v4f64_v2f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
-; SSE-LABEL: @x86_pblendvb_v4f64_v2f64(
-; SSE-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; SSE-NEXT:    ret <4 x double> [[DOTV]]
-;
-; AVX2-LABEL: @x86_pblendvb_v4f64_v2f64(
-; AVX2-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; AVX2-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; AVX2-NEXT:    ret <4 x double> [[DOTV]]
-;
-; AVX512-LABEL: @x86_pblendvb_v4f64_v2f64(
-; AVX512-NEXT:    [[CMP:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[CMP]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; AVX512-NEXT:    ret <4 x double> [[DOTV]]
+; CHECK-LABEL: @x86_pblendvb_v4f64_v2f64(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[CMP]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[DOTV]]
 ;
   %a.bc = bitcast <4 x double> %a to <32 x i8>
   %b.bc = bitcast <4 x double> %b to <32 x i8>
@@ -46,20 +36,10 @@ define <4 x double> @x86_pblendvb_v4f64_v2f64(<4 x double> %a, <4 x double> %b,
 }
 
 define <8 x float> @x86_pblendvb_v8f32_v4f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
-; SSE-LABEL: @x86_pblendvb_v8f32_v4f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; SSE-NEXT:    ret <8 x float> [[DOTV]]
-;
-; AVX2-LABEL: @x86_pblendvb_v8f32_v4f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; AVX2-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; AVX2-NEXT:    ret <8 x float> [[DOTV]]
-;
-; AVX512-LABEL: @x86_pblendvb_v8f32_v4f32(
-; AVX512-NEXT:    [[CMP:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; AVX512-NEXT:    ret <8 x float> [[DOTV]]
+; CHECK-LABEL: @x86_pblendvb_v8f32_v4f32(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[DOTV]]
 ;
   %a.bc = bitcast <8 x float> %a to <32 x i8>
   %b.bc = bitcast <8 x float> %b to <32 x i8>
@@ -80,20 +60,10 @@ define <8 x float> @x86_pblendvb_v8f32_v4f32(<8 x float> %a, <8 x float> %b, <8
 }
 
 define <4 x i64> @x86_pblendvb_v4i64_v2i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; SSE-LABEL: @x86_pblendvb_v4i64_v2i64(
-; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; SSE-NEXT:    ret <4 x i64> [[TMP2]]
-;
-; AVX2-LABEL: @x86_pblendvb_v4i64_v2i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; AVX2-NEXT:    ret <4 x i64> [[TMP2]]
-;
-; AVX512-LABEL: @x86_pblendvb_v4i64_v2i64(
-; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; AVX512-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-LABEL: @x86_pblendvb_v4i64_v2i64(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %a.bc = bitcast <4 x i64> %a to <32 x i8>
   %b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -216,35 +186,15 @@ define <4 x i64> @x86_pblendvb_v16i16_v8i16(<4 x i64> %a, <4 x i64> %b, <4 x i64
 }
 
 define <4 x i64> @x86_pblendvb_v32i8_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; SSE-LABEL: @x86_pblendvb_v32i8_v16i8(
-; SSE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; SSE-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; SSE-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; SSE-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; SSE-NEXT:    [[TMP3:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; SSE-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[TMP3]], <32 x i8> [[TMP2]], <32 x i8> [[TMP1]]
-; SSE-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; SSE-NEXT:    ret <4 x i64> [[RES]]
-;
-; AVX2-LABEL: @x86_pblendvb_v32i8_v16i8(
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; AVX2-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; AVX2-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; AVX2-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; AVX2-NEXT:    [[TMP3:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; AVX2-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[TMP3]], <32 x i8> [[TMP2]], <32 x i8> [[TMP1]]
-; AVX2-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; AVX2-NEXT:    ret <4 x i64> [[RES]]
-;
-; AVX512-LABEL: @x86_pblendvb_v32i8_v16i8(
-; AVX512-NEXT:    [[A_BC:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; AVX512-NEXT:    [[B_BC:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; AVX512-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; AVX512-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; AVX512-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[B_BC]], <32 x i8> [[A_BC]]
-; AVX512-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; AVX512-NEXT:    ret <4 x i64> [[RES]]
+; CHECK-LABEL: @x86_pblendvb_v32i8_v16i8(
+; CHECK-NEXT:    [[A_BC:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
+; CHECK-NEXT:    [[B_BC:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
+; CHECK-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
+; CHECK-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
+; CHECK-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[B_BC]], <32 x i8> [[A_BC]]
+; CHECK-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
 ;
   %a.bc = bitcast <4 x i64> %a to <32 x i8>
   %b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -424,35 +374,15 @@ define <8 x i64> @x86_pblendvb_v32i16_v16i16(<8 x i64> %a, <8 x i64> %b, <8 x i6
 }
 
 define <8 x i64> @x86_pblendvb_v64i8_v32i8(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
-; SSE-LABEL: @x86_pblendvb_v64i8_v32i8(
-; SSE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
-; SSE-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
-; SSE-NEXT:    [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
-; SSE-NEXT:    [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
-; SSE-NEXT:    [[TMP3:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
-; SSE-NEXT:    [[CONCAT:%.*]] = select <64 x i1> [[TMP3]], <64 x i8> [[TMP2]], <64 x i8> [[TMP1]]
-; SSE-NEXT:    [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
-; SSE-NEXT:    ret <8 x i64> [[RES]]
-;
-; AVX2-LABEL: @x86_pblendvb_v64i8_v32i8(
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
-; AVX2-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
-; AVX2-NEXT:    [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
-; AVX2-NEXT:    [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
-; AVX2-NEXT:    [[TMP3:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
-; AVX2-NEXT:    [[CONCAT:%.*]] = select <64 x i1> [[TMP3]], <64 x i8> [[TMP2]], <64 x i8> [[TMP1]]
-; AVX2-NEXT:    [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
-; AVX2-NEXT:    ret <8 x i64> [[RES]]
-;
-; AVX512-LABEL: @x86_pblendvb_v64i8_v32i8(
-; AVX512-NEXT:    [[A_BC:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
-; AVX512-NEXT:    [[B_BC:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
-; AVX512-NEXT:    [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
-; AVX512-NEXT:    [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
-; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
-; AVX512-NEXT:    [[CONCAT:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[B_BC]], <64 x i8> [[A_BC]]
-; AVX512-NEXT:    [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
-; AVX512-NEXT:    ret <8 x i64> [[RES]]
+; CHECK-LABEL: @x86_pblendvb_v64i8_v32i8(
+; CHECK-NEXT:    [[A_BC:%.*]] = bitcast <8 x i64> [[A:%.*]] to <64 x i8>
+; CHECK-NEXT:    [[B_BC:%.*]] = bitcast <8 x i64> [[B:%.*]] to <64 x i8>
+; CHECK-NEXT:    [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <64 x i8>
+; CHECK-NEXT:    [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <64 x i8>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <64 x i8> [[C_BC]], [[D_BC]]
+; CHECK-NEXT:    [[CONCAT:%.*]] = select <64 x i1> [[CMP]], <64 x i8> [[B_BC]], <64 x i8> [[A_BC]]
+; CHECK-NEXT:    [[RES:%.*]] = bitcast <64 x i8> [[CONCAT]] to <8 x i64>
+; CHECK-NEXT:    ret <8 x i64> [[RES]]
 ;
   %a.bc = bitcast <8 x i64> %a to <64 x i8>
   %b.bc = bitcast <8 x i64> %b to <64 x i8>
@@ -566,9 +496,8 @@ define <2 x i64> @x86_pblendvb_v32i8_v16i8_undefs(<4 x i64> %a, <4 x i64> %b, <4
 ; CHECK-NEXT:    [[A_LO:%.*]] = shufflevector <32 x i8> [[A_BC]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[B_LO:%.*]] = shufflevector <32 x i8> [[B_BC]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <32 x i1> [[CMP]] to <32 x i8>
-; CHECK-NEXT:    [[SEXT_LO:%.*]] = shufflevector <32 x i8> [[SEXT]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[SEL_LO:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A_LO]], <16 x i8> [[B_LO]], <16 x i8> [[SEXT_LO]])
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i1> [[CMP]], <32 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SEL_LO:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[B_LO]], <16 x i8> [[A_LO]]
 ; CHECK-NEXT:    [[RES:%.*]] = bitcast <16 x i8> [[SEL_LO]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[RES]]
 ;

``````````

</details>


https://github.com/llvm/llvm-project/pull/146901