[llvm] [VectorCombine] Fix invalid shuffle cost argument of foldShuffleOfSelects (PR #130281)

Fri Mar 7 05:07:04 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: hanbeom (ParkHanbum)

<details>
<summary>Changes</summary>

In the previous code, it specified the destination vector as the getShuffleCost argument. Because the shuffle mask specifies the indices of the two vectors specified as elements, the maximum value is twice the size of the source vector. This causes a problem if the destination vector is smaller than the source vector and specify an index in the mask that exceeds the size of the destination vector.

Fix the problem by correcting the previous code, which was using wrong argument in the Cost calculation.

Fixed https://github.com/llvm/llvm-project/issues/130250

---

Patch is 55.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130281.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+11-10) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll (+261-81) 
- (modified) llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll (+4-2) 
- (modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll (+159-96) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4d4a1a6e04d32..776a733d86afc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2037,7 +2037,6 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
                      m_Mask(Mask))))
     return false;
 
-  auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
   auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
   auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
   if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
@@ -2051,24 +2050,26 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
        (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
     return false;
 
+  auto *SrcVecTy = dyn_cast<FixedVectorType>(T1->getType());
+  auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
   auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
   auto SelOp = Instruction::Select;
   InstructionCost OldCost = TTI.getCmpSelInstrCost(
-      SelOp, T1->getType(), C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
-  OldCost += TTI.getCmpSelInstrCost(SelOp, T2->getType(), C2VecTy,
+      SelOp, DstVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
+  OldCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C2VecTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
-  OldCost += TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind, 0, nullptr,
+  OldCost += TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr,
                                 {I.getOperand(0), I.getOperand(1)}, &I);
 
-  auto *C1C2VecTy = cast<FixedVectorType>(
-      toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
   InstructionCost NewCost =
-      TTI.getShuffleCost(SK, C1C2VecTy, Mask, CostKind, 0, nullptr, {C1, C2});
+      TTI.getShuffleCost(SK, C1VecTy, Mask, CostKind, 0, nullptr, {C1, C2});
   NewCost +=
-      TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind, 0, nullptr, {T1, T2});
+      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {T1, T2});
   NewCost +=
-      TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind, 0, nullptr, {F1, F2});
-  NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, DstVecTy,
+      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {F1, F2});
+  auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
+      toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
+  NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
index c2ed7b9c84523..84edc6e90a91d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
 
 ;
 ; PR58895 - replace shuffled _mm_blendv_epi8+icmp with select+icmp
@@ -12,10 +12,20 @@
 ;
 
 define <4 x double> @x86_pblendvb_v4f64_v2f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
-; CHECK-LABEL: @x86_pblendvb_v4f64_v2f64(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
-; CHECK-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[CMP]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
-; CHECK-NEXT:    ret <4 x double> [[DOTV]]
+; SSE-LABEL: @x86_pblendvb_v4f64_v2f64(
+; SSE-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
+; SSE-NEXT:    ret <4 x double> [[DOTV]]
+;
+; AVX2-LABEL: @x86_pblendvb_v4f64_v2f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
+; AVX2-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
+; AVX2-NEXT:    ret <4 x double> [[DOTV]]
+;
+; AVX512-LABEL: @x86_pblendvb_v4f64_v2f64(
+; AVX512-NEXT:    [[CMP:%.*]] = fcmp olt <4 x double> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[CMP]], <4 x double> [[B:%.*]], <4 x double> [[A:%.*]]
+; AVX512-NEXT:    ret <4 x double> [[DOTV]]
 ;
   %a.bc = bitcast <4 x double> %a to <32 x i8>
   %b.bc = bitcast <4 x double> %b to <32 x i8>
@@ -36,10 +46,20 @@ define <4 x double> @x86_pblendvb_v4f64_v2f64(<4 x double> %a, <4 x double> %b,
 }
 
 define <8 x float> @x86_pblendvb_v8f32_v4f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
-; CHECK-LABEL: @x86_pblendvb_v8f32_v4f32(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
-; CHECK-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
-; CHECK-NEXT:    ret <8 x float> [[DOTV]]
+; SSE-LABEL: @x86_pblendvb_v8f32_v4f32(
+; SSE-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
+; SSE-NEXT:    ret <8 x float> [[DOTV]]
+;
+; AVX2-LABEL: @x86_pblendvb_v8f32_v4f32(
+; AVX2-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
+; AVX2-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
+; AVX2-NEXT:    ret <8 x float> [[DOTV]]
+;
+; AVX512-LABEL: @x86_pblendvb_v8f32_v4f32(
+; AVX512-NEXT:    [[CMP:%.*]] = fcmp olt <8 x float> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x float> [[B:%.*]], <8 x float> [[A:%.*]]
+; AVX512-NEXT:    ret <8 x float> [[DOTV]]
 ;
   %a.bc = bitcast <8 x float> %a to <32 x i8>
   %b.bc = bitcast <8 x float> %b to <32 x i8>
@@ -60,10 +80,20 @@ define <8 x float> @x86_pblendvb_v8f32_v4f32(<8 x float> %a, <8 x float> %b, <8
 }
 
 define <4 x i64> @x86_pblendvb_v4i64_v2i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; CHECK-LABEL: @x86_pblendvb_v4i64_v2i64(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; SSE-LABEL: @x86_pblendvb_v4i64_v2i64(
+; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
+; SSE-NEXT:    ret <4 x i64> [[TMP2]]
+;
+; AVX2-LABEL: @x86_pblendvb_v4i64_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
+; AVX2-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
+; AVX2-NEXT:    ret <4 x i64> [[TMP2]]
+;
+; AVX512-LABEL: @x86_pblendvb_v4i64_v2i64(
+; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <4 x i64> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i64> [[B:%.*]], <4 x i64> [[A:%.*]]
+; AVX512-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %a.bc = bitcast <4 x i64> %a to <32 x i8>
   %b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -84,15 +114,35 @@ define <4 x i64> @x86_pblendvb_v4i64_v2i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>
 }
 
 define <4 x i64> @x86_pblendvb_v8i32_v4i32(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; CHECK-LABEL: @x86_pblendvb_v8i32_v4i32(
-; CHECK-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <8 x i32> [[C_BC]], [[D_BC]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[TMP2]], <8 x i32> [[TMP1]]
-; CHECK-NEXT:    [[RES:%.*]] = bitcast <8 x i32> [[TMP3]] to <4 x i64>
-; CHECK-NEXT:    ret <4 x i64> [[RES]]
+; SSE-LABEL: @x86_pblendvb_v8i32_v4i32(
+; SSE-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <8 x i32>
+; SSE-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <8 x i32>
+; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[C_BC]], [[D_BC]]
+; SSE-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]]
+; SSE-NEXT:    [[RES:%.*]] = bitcast <8 x i32> [[TMP4]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[RES]]
+;
+; AVX2-LABEL: @x86_pblendvb_v8i32_v4i32(
+; AVX2-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[C_BC]], [[D_BC]]
+; AVX2-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]]
+; AVX2-NEXT:    [[RES:%.*]] = bitcast <8 x i32> [[TMP4]] to <4 x i64>
+; AVX2-NEXT:    ret <4 x i64> [[RES]]
+;
+; AVX512-LABEL: @x86_pblendvb_v8i32_v4i32(
+; AVX512-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <8 x i32> [[C_BC]], [[D_BC]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[TMP2]], <8 x i32> [[TMP1]]
+; AVX512-NEXT:    [[RES:%.*]] = bitcast <8 x i32> [[TMP3]] to <4 x i64>
+; AVX512-NEXT:    ret <4 x i64> [[RES]]
 ;
   %a.bc = bitcast <4 x i64> %a to <32 x i8>
   %b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -115,15 +165,35 @@ define <4 x i64> @x86_pblendvb_v8i32_v4i32(<4 x i64> %a, <4 x i64> %b, <4 x i64>
 }
 
 define <4 x i64> @x86_pblendvb_v16i16_v8i16(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; CHECK-LABEL: @x86_pblendvb_v16i16_v8i16(
-; CHECK-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <16 x i16>
-; CHECK-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <16 x i16>
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <16 x i16> [[C_BC]], [[D_BC]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <16 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[TMP2]], <16 x i16> [[TMP1]]
-; CHECK-NEXT:    [[RES:%.*]] = bitcast <16 x i16> [[TMP3]] to <4 x i64>
-; CHECK-NEXT:    ret <4 x i64> [[RES]]
+; SSE-LABEL: @x86_pblendvb_v16i16_v8i16(
+; SSE-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <16 x i16>
+; SSE-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <16 x i16>
+; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <16 x i16> [[C_BC]], [[D_BC]]
+; SSE-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <16 x i16>
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16>
+; SSE-NEXT:    [[TMP4:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[TMP2]], <16 x i16> [[TMP3]]
+; SSE-NEXT:    [[RES:%.*]] = bitcast <16 x i16> [[TMP4]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[RES]]
+;
+; AVX2-LABEL: @x86_pblendvb_v16i16_v8i16(
+; AVX2-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <16 x i16>
+; AVX2-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <16 x i16>
+; AVX2-NEXT:    [[TMP1:%.*]] = icmp slt <16 x i16> [[C_BC]], [[D_BC]]
+; AVX2-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <16 x i16>
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16>
+; AVX2-NEXT:    [[TMP4:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[TMP2]], <16 x i16> [[TMP3]]
+; AVX2-NEXT:    [[RES:%.*]] = bitcast <16 x i16> [[TMP4]] to <4 x i64>
+; AVX2-NEXT:    ret <4 x i64> [[RES]]
+;
+; AVX512-LABEL: @x86_pblendvb_v16i16_v8i16(
+; AVX512-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <16 x i16>
+; AVX512-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <16 x i16>
+; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <16 x i16> [[C_BC]], [[D_BC]]
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16>
+; AVX512-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <16 x i16>
+; AVX512-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[TMP2]], <16 x i16> [[TMP1]]
+; AVX512-NEXT:    [[RES:%.*]] = bitcast <16 x i16> [[TMP3]] to <4 x i64>
+; AVX512-NEXT:    ret <4 x i64> [[RES]]
 ;
   %a.bc = bitcast <4 x i64> %a to <32 x i8>
   %b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -146,15 +216,35 @@ define <4 x i64> @x86_pblendvb_v16i16_v8i16(<4 x i64> %a, <4 x i64> %b, <4 x i64
 }
 
 define <4 x i64> @x86_pblendvb_v32i8_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
-; CHECK-LABEL: @x86_pblendvb_v32i8_v16i8(
-; CHECK-NEXT:    [[A_BC:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; CHECK-NEXT:    [[B_BC:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; CHECK-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
-; CHECK-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; CHECK-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[B_BC]], <32 x i8> [[A_BC]]
-; CHECK-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
-; CHECK-NEXT:    ret <4 x i64> [[RES]]
+; SSE-LABEL: @x86_pblendvb_v32i8_v16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
+; SSE-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
+; SSE-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
+; SSE-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
+; SSE-NEXT:    [[TMP3:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
+; SSE-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[TMP3]], <32 x i8> [[TMP2]], <32 x i8> [[TMP1]]
+; SSE-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[RES]]
+;
+; AVX2-LABEL: @x86_pblendvb_v32i8_v16i8(
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
+; AVX2-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
+; AVX2-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
+; AVX2-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
+; AVX2-NEXT:    [[TMP3:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
+; AVX2-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[TMP3]], <32 x i8> [[TMP2]], <32 x i8> [[TMP1]]
+; AVX2-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
+; AVX2-NEXT:    ret <4 x i64> [[RES]]
+;
+; AVX512-LABEL: @x86_pblendvb_v32i8_v16i8(
+; AVX512-NEXT:    [[A_BC:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
+; AVX512-NEXT:    [[B_BC:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
+; AVX512-NEXT:    [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <32 x i8>
+; AVX512-NEXT:    [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <32 x i8>
+; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
+; AVX512-NEXT:    [[CONCAT:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[B_BC]], <32 x i8> [[A_BC]]
+; AVX512-NEXT:    [[RES:%.*]] = bitcast <32 x i8> [[CONCAT]] to <4 x i64>
+; AVX512-NEXT:    ret <4 x i64> [[RES]]
 ;
   %a.bc = bitcast <4 x i64> %a to <32 x i8>
   %b.bc = bitcast <4 x i64> %b to <32 x i8>
@@ -180,10 +270,20 @@ define <4 x i64> @x86_pblendvb_v32i8_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64>
 ;
 
 define <8 x double> @x86_pblendvb_v8f64_v4f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d) {
-; CHECK-LABEL: @x86_pblendvb_v8f64_v4f64(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <8 x double> [[C:%.*]], [[D:%.*]]
-; CHECK-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x double> [[B:%.*]], <8 x double> [[A:%.*]]
-; CHECK-NEXT:    ret <8 x double> [[DOTV]]
+; SSE-LABEL: @x86_pblendvb_v8f64_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x double> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[B:%.*]], <8 x double> [[A:%.*]]
+; SSE-NEXT:    ret <8 x double> [[DOTV]]
+;
+; AVX2-LABEL: @x86_pblendvb_v8f64_v4f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX2-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[B:%.*]], <8 x double> [[A:%.*]]
+; AVX2-NEXT:    ret <8 x double> [[DOTV]]
+;
+; AVX512-LABEL: @x86_pblendvb_v8f64_v4f64(
+; AVX512-NEXT:    [[CMP:%.*]] = fcmp olt <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[DOTV:%.*]] = select <8 x i1> [[CMP]], <8 x double> [[B:%.*]], <8 x double> [[A:%.*]]
+; AVX512-NEXT:    ret <8 x double> [[DOTV]]
 ;
   %a.bc = bitcast <8 x double> %a to <64 x i8>
   %b.bc = bitcast <8 x double> %b to <64 x i8>
@@ -204,10 +304,20 @@ define <8 x double> @x86_pblendvb_v8f64_v4f64(<8 x double> %a, <8 x double> %b,
 }
 
 define <16 x float> @x86_pblendvb_v16f32_v8f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d) {
-; CHECK-LABEL: @x86_pblendvb_v16f32_v8f32(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <16 x float> [[C:%.*]], [[D:%.*]]
-; CHECK-NEXT:    [[DOTV:%.*]] = select <16 x i1> [[CMP]], <16 x float> [[B:%.*]], <16 x float> [[A:%.*]]
-; CHECK-NEXT:    ret <16 x float> [[DOTV]]
+; SSE-LABEL: @x86_pblendvb_v16f32_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = fcmp olt <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[DOTV:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[B:%.*]], <16 x float> [[A:%.*]]
+; SSE-NEXT:    ret <16 x float> [[DOTV]]
+;
+; AVX2-LABEL: @x86_pblendvb_v16f32_v8f32(
+; AVX2-NEXT:    [[TMP1:%.*]] = fcmp olt <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX2-NEXT:    [[DOTV:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[B:%.*]], <16 x float> [[A:%.*]]
+; AVX2-NEXT:    ret <16 x float> [[DOTV]]
+;
+; AVX512-LABEL: @x86_pblendvb_v16f32_v8f32(
+; AVX512-NEXT:    [[CMP:%.*]] = fcmp olt <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[DOTV:%.*]] = select <16 x i1> [[CMP]], <16 x float> [[B:%.*]], <16 x float> [[A:%.*]]
+; AVX512-NEXT:    ret <16 x float> [[DOTV]]
 ;
   %a.bc = bitcast <16 x float> %a to <64 x i8>
   %b.bc = bitcast <16 x float> %b to <64 x i8>
@@ -228,10 +338,20 @@ define <16 x float> @x86_pblendvb_v16f32_v8f32(<16 x float> %a, <16 x float> %b,
 }
 
 define <8 x i64> @x86_pblendvb_v8i64_v4i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
-; CHECK-LABEL: @x86_pblendvb_v8i64_v4i64(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <8 x i64> [[C:%.*]], [[D:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]]
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+; SSE-LABEL: @x86_pblendvb_v8i64_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i64> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]]
+; SSE-NEXT:    ret <8 x i64> [[TMP2]]
+;
+; AVX2-LABEL: @x86_pblendvb_v8i64_v4i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i64> [[C:%.*]], [[D:%.*]]
+; AVX2-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]]
+; AVX2-NEXT:    ret <8 x i64> [[TMP2]]
+;
+; AVX512-LABEL: @x86_pblendvb_v8i64_v4i64(
+; AVX512-NEXT:    [[CMP:%.*]] = icmp slt <8 x i64> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP1:%.*]] = select <8 x i1> [[CMP]], <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]]
+; AVX512-NEXT:    ret <8 x i64> [[TMP1]]
 ;
   %a.bc = bitcast <8 x i64> %a to <64 x i8>
   %b.bc = bitcast <8 x i64> %b to <64 x i8>
@@ -252,15 +372,35 @@ define <8 x i64> @x86_pblendvb_v8i64_v4i64(<8 x i64> %a, <8 x i64> %b, <8 x i64>
 }
 
 define <8 x i64> @x86_pblendvb_v16i32_v8i32(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
-; CHECK-LABEL: @x86_pblendvb_v16i32_v8i32(
-; CHECK-NEXT:    [[C_BC:%.*]] =...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/130281