[clang] [CIR][AArch64] Lower NEON vbsl builtins (PR #188449)

via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 25 02:53:26 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clangir

Author: Jiahao Guo (E00N777)

<details>
<summary>Changes</summary>

Part of https://github.com/llvm/llvm-project/issues/185382

Lowering:

- test_vbsl_s8
- test_vbslq_s8
- test_vbsl_s16
- test_vbslq_s16
- test_vbsl_f32
- test_vbslq_f32

I reused the lowering logic from the [incubator](https://github.com/llvm/clangir/blob/main/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp) implementation and added a corresponding helper function in the upstream file, like this:
```
case NEON::BI__builtin_neon_vbsl_v:
  case NEON::BI__builtin_neon_vbslq_v: {
    cir::VectorType bitTy = vTy;
    if (cir::isAnyFloatingPointType(bitTy.getElementType()))
      bitTy = castVecOfFPTypeToVecOfIntWithSameWidth(builder, vTy);
    Ops[0] = builder.createBitcast(Ops[0], bitTy);
    Ops[1] = builder.createBitcast(Ops[1], bitTy);
    Ops[2] = builder.createBitcast(Ops[2], bitTy);

    Ops[1] = builder.createAnd(Ops[0], Ops[1]);
    Ops[2] = builder.createAnd(builder.createNot(Ops[0]), Ops[2]);
    Ops[0] = builder.createOr(Ops[1], Ops[2]);
    return builder.createBitcast(Ops[0], ty);
  }
```
and 
```
static cir::VectorType
castVecOfFPTypeToVecOfIntWithSameWidth(CIRGenBuilderTy &builder,
                                       cir::VectorType vecTy) {
  if (mlir::isa<cir::SingleType>(vecTy.getElementType()))
    return cir::VectorType::get(builder.getSInt32Ty(), vecTy.getSize());
  if (mlir::isa<cir::DoubleType>(vecTy.getElementType()))
    return cir::VectorType::get(builder.getSInt64Ty(), vecTy.getSize());
  llvm_unreachable(
      "Unsupported element type in getVecOfIntTypeWithSameEltWidth");
}
```

If this is not the preferred way to structure it, I’d be happy to adjust it based on your feedback.

For FileCheck coverage, I moved the relevant test cases from `clang/test/CodeGen/AArch64/neon-intrinsics.c` into `clang/test/CodeGen/AArch64/neon/intrinsics.c`.

I was not entirely sure whether the bitwise-select (vbsl) coverage should go into a separate dedicated test file, so for now I kept it in `clang/test/CodeGen/AArch64/neon/intrinsics.c`.


---

Patch is 20.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/188449.diff


3 Files Affected:

- (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp (+27-2) 
- (modified) clang/test/CodeGen/AArch64/neon-intrinsics.c (-107) 
- (modified) clang/test/CodeGen/AArch64/neon/intrinsics.c (+155) 


``````````diff
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a3488bfcc3dec..3a0cc766478a3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -14,6 +14,7 @@
 #include "CIRGenFunction.h"
 #include "clang/Basic/AArch64CodeGenUtils.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 
 // TODO(cir): once all builtins are covered, decide whether we still
@@ -23,6 +24,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
 
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Value.h"
 #include "clang/AST/GlobalDecl.h"
 #include "clang/Basic/Builtins.h"
@@ -169,6 +171,17 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
   llvm_unreachable("Unknown vector element type!");
 }
 
+static cir::VectorType
+castVecOfFPTypeToVecOfIntWithSameWidth(CIRGenBuilderTy &builder,
+                                      cir::VectorType vecTy) {
+  if (mlir::isa<cir::SingleType>(vecTy.getElementType()))
+    return cir::VectorType::get(builder.getSInt32Ty(),vecTy.getSize());
+  if (mlir::isa<cir::DoubleType>(vecTy.getElementType()))
+    return cir::VectorType::get(builder.getSInt64Ty(), vecTy.getSize());
+  llvm_unreachable(
+      "Unsupported element type in getVecOfIntTypeWithSameEltWidth");
+}
+
 static mlir::Value emitCommonNeonBuiltinExpr(
     CIRGenFunction &cgf, unsigned builtinID, unsigned llvmIntrinsic,
     unsigned altLLVMIntrinsic, const char *nameHint, unsigned modifier,
@@ -1677,7 +1690,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
-  // Memory Operations (MOPS)
+  // Memory Operations (Mops)
   if (builtinID == AArch64::BI__builtin_arm_mops_memset_tag) {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
@@ -2196,7 +2209,19 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   default:
     return std::nullopt;
   case NEON::BI__builtin_neon_vbsl_v:
-  case NEON::BI__builtin_neon_vbslq_v:
+  case NEON::BI__builtin_neon_vbslq_v: {
+    cir::VectorType bitTy = ty;
+    if(cir::isAnyFloatingPointType(bitTy.getElementType()))
+      bitTy = castVecOfFPTypeToVecOfIntWithSameWidth(builder, bitTy);
+    ops[0] = builder.createBitcast(ops[0], bitTy);
+    ops[1] = builder.createBitcast(ops[1], bitTy);
+    ops[2] = builder.createBitcast(ops[2], bitTy);
+
+    ops[1] = builder.createAnd(loc, ops[0], ops[1]);
+    ops[2] = builder.createAnd(loc, builder.createNot(ops[0]), ops[2]);
+    ops[0] = builder.createOr(loc, ops[1], ops[2]);
+    return builder.createBitcast(ops[0], ty);
+  }
   case NEON::BI__builtin_neon_vfma_lane_v:
   case NEON::BI__builtin_neon_vfmaq_lane_v:
   case NEON::BI__builtin_neon_vfma_laneq_v:
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 8eb6cd86339d6..c01edc93267b7 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -1038,39 +1038,6 @@ float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
   return vdiv_f32(v1, v2);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
-// CHECK-NEXT:    [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]]
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK-NEXT:    ret <8 x i8> [[VBSL2_I]]
-//
-int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  return vbsl_s8(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
-// CHECK-NEXT:    [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
-// CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], splat (i16 -1)
-// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
-// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
-// CHECK-NEXT:    ret <8 x i8> [[TMP4]]
-//
-int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  return (int8x8_t)vbsl_s16(v1, v2, v3);
-}
-
 // CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_s32(
 // CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -1179,28 +1146,6 @@ uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
   return vbsl_u64(v1, v2, v3);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vbsl_f32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
-// CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[VBSL_I]], splat (i32 -1)
-// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <2 x i32> [[TMP5]], [[VBSL2_I]]
-// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
-// CHECK-NEXT:    ret <2 x float> [[TMP6]]
-//
-float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  return vbsl_f32(v1, v2, v3);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vbsl_f64(
 // CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x double> noundef [[V2:%.*]], <1 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -1257,37 +1202,6 @@ poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
   return vbsl_p16(v1, v2, v3);
 }
 
-// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_s8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
-// CHECK-NEXT:    [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]]
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK-NEXT:    ret <16 x i8> [[VBSL2_I]]
-//
-int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  return vbslq_s8(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_s16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
-// CHECK-NEXT:    [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
-// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], splat (i16 -1)
-// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
-// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT:    ret <8 x i16> [[VBSL5_I]]
-//
-int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  return vbslq_s16(v1, v2, v3);
-}
 
 // CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_s32(
 // CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
@@ -1397,27 +1311,6 @@ uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
   return vbslq_u64(v1, v2, v3);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vbslq_f32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
-// CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[VBSL_I]], splat (i32 -1)
-// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <4 x i32> [[TMP5]], [[VBSL2_I]]
-// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
-// CHECK-NEXT:    ret <4 x float> [[TMP6]]
-//
-float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  return vbslq_f32(v1, v2, v3);
-}
 
 // CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_p8(
 // CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index bf8e62feda8da..0375d3ab02647 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -982,3 +982,158 @@ int64_t test_vshld_u64(int64_t a,int64_t b) {
   return (int64_t)vshld_u64(a, b);
 }
 
+// LLVM-LABEL: @test_vbsl_s8(
+// CIR-LABEL: @vbsl_s8(
+int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
+  // CIR: [[MASK_PTR:%.*]] = cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<8 x !u8i>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: [[AND:%.*]] = cir.and %{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>
+  // CIR: [[NOT:%.*]] = cir.not %{{.*}} : !cir.vector<8 x !s8i>
+  // CIR: [[AND2:%.*]] = cir.and [[NOT]], %{{.*}} : !cir.vector<8 x !s8i>
+  // CIR: [[RES:%.*]] = cir.or [[AND]], [[AND2]] : !cir.vector<8 x !s8i>
+
+
+  // LLVM:      [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+  // LLVM-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+  // LLVM-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]]
+  // LLVM-NEXT: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+  // LLVM-NEXT: ret <8 x i8> [[VBSL2_I]]
+  return vbsl_s8(v1, v2, v3);
+}
+  
+// LLVM-LABEL: @test_vbslq_s8(
+// CIR-LABEL: @vbslq_s8(
+int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<16 x !u8i>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: [[AND:%.*]] = cir.and %{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>
+  // CIR: [[NOT:%.*]] = cir.not %{{.*}} : !cir.vector<16 x !s8i>
+  // CIR: [[AND2:%.*]] = cir.and [[NOT]], %{{.*}} : !cir.vector<16 x !s8i>
+  // CIR: cir.or [[AND]], [[AND2]] : !cir.vector<16 x !s8i>
+
+  // LLVM:      [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+  // LLVM-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+  // LLVM-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]]
+  // LLVM-NEXT: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+  // LLVM-NEXT: ret <16 x i8> [[VBSL2_I]]
+  return vbslq_s8(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vbsl_s16(
+// CIR-LABEL: @vbsl_s16(
+int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<4 x !u16i>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<4 x !s16i>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<4 x !s16i>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: [[MASK:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i>
+  // CIR: [[VAL1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i>
+  // CIR: [[VAL2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<4 x !s16i>
+  // CIR: [[AND:%.*]] = cir.and [[MASK]], [[VAL1]] : !cir.vector<4 x !s16i>
+  // CIR: [[NOT:%.*]] = cir.not [[MASK]] : !cir.vector<4 x !s16i>
+  // CIR: [[AND2:%.*]] = cir.and [[NOT]], [[VAL2]] : !cir.vector<4 x !s16i>
+  // CIR: [[RES:%.*]] = cir.or [[AND]], [[AND2]] : !cir.vector<4 x !s16i>
+
+  // LLVM:      [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+  // LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+  // LLVM-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+  // LLVM-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+  // LLVM-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+  // LLVM-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+  // LLVM-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+  // LLVM-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], splat (i16 -1)
+  // LLVM-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+  // LLVM-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+  // LLVM-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
+  // LLVM-NEXT: ret <8 x i8> [[TMP4]]
+  return (int8x8_t)vbsl_s16(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vbslq_s16(
+// CIR-LABEL: @vbslq_s16(
+int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<8 x !u16i>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<8 x !s16i>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<8 x !s16i>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: [[MASK:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i>
+  // CIR: [[VAL1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i>
+  // CIR: [[VAL2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<8 x !s16i>
+  // CIR: [[AND:%.*]] = cir.and [[MASK]], [[VAL1]] : !cir.vector<8 x !s16i>
+  // CIR: [[NOT:%.*]] = cir.not [[MASK]] : !cir.vector<8 x !s16i>
+  // CIR: [[AND2:%.*]] = cir.and [[NOT]], [[VAL2]] : !cir.vector<8 x !s16i>
+  // CIR: [[RES:%.*]] = cir.or [[AND]], [[AND2]] : !cir.vector<8 x !s16i>
+
+  // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
+  // LLVM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+  // LLVM-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
+  // LLVM-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+  // LLVM-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+  // LLVM-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+  // LLVM-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+  // LLVM-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], splat (i16 -1)
+  // LLVM-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+  // LLVM-NEXT: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+  // LLVM-NEXT: ret <8 x i16> [[VBSL5_I]]
+  return vbslq_s16(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vbsl_f32(
+// CIR-LABEL: @vbsl_f32(
+float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<2 x !u32i>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<2 x !cir.float>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<2 x !cir.float>> -> !cir.ptr<!cir.vector<8 x !s8i>>
+  // CIR: [[MASK:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i>
+  // CIR: [[VAL1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i>
+  // CIR: [[VAL2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s8i> -> !cir.vector<2 x !s32i>
+  // CIR: [[AND:%.*]] = cir.and [[MASK]], [[VAL1]] : !cir.vector<2 x !s32i>
+  // CIR: [[NOT:%.*]] = cir.not [[MASK]] : !cir.vector<2 x !s32i>
+  // CIR: [[AND2:%.*]] = cir.and [[NOT]], [[VAL2]] : !cir.vector<2 x !s32i>
+  // CIR: [[OR:%.*]] = cir.or [[AND]], [[AND2]] : !cir.vector<2 x !s32i>
+  // CIR: cir.cast bitcast [[OR]] : !cir.vector<2 x !s32i> -> !cir.vector<2 x !cir.float>
+
+  // LLVM:      [[TMP0:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+  // LLVM-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+  // LLVM-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+  // LLVM-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+  // LLVM-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+  // LLVM-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+  // LLVM-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+  // LLVM-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+  // LLVM-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+  // LLVM-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[VBSL_I]], splat (i32 -1)
+  // LLVM-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP5]], [[VBSL2_I]]
+  // LLVM-NEXT: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+  // LLVM-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+  // LLVM-NEXT: ret <2 x float> [[TMP6]]
+  return vbsl_f32(v1, v2, v3);
+}
+
+// LLVM-LABEL: @test_vbslq_f32(
+// CIR-LABEL: @vbslq_f32(
+float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<4 x !u32i>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: cir.cast bitcast %{{.*}} : !cir.ptr<!cir.vector<4 x !cir.float>> -> !cir.ptr<!cir.vector<16 x !s8i>>
+  // CIR: [[MASK:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i>
+  // CIR: [[VAL1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i>
+  // CIR: [[VAL2:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s8i> -> !cir.vector<4 x !s32i>
+  // CIR: [[AND:%.*]] = cir.and [[MASK]], [[VAL1]] : !cir.vector<4 x !s32i>
+  // CIR: [[NOT:%.*]] = cir.not [[MASK]] : !cir.vector<4 x !s32i>
+  // CIR: [[AND2:%.*]] = cir.and [[NOT]], [[VAL2]] : !cir.vector<4 x !s32i>
+  // CIR: [[OR:%.*]] = cir.or [[AND]], [[AND2]] : !cir.vector<4 x !s32i>
+  // CIR: cir.cast bitcast [[OR]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+
+  // LLVM:      [[TMP0:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
+  // LLVM-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
+  // LLVM-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
+  // LLVM-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+  // LLVM-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+  // LLVM-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+  // LLVM-NEXT: [[VBSL1_I:%.*]] = ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/188449


More information about the cfe-commits mailing list