[clang] [CIR][AArch64] Lower vfma lane builtins (PR #188190)

Yair Ben Avraham via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 25 03:48:40 PDT 2026


https://github.com/yairbenavraham updated https://github.com/llvm/llvm-project/pull/188190

From 4ceade9630502af988e42a046e7568b3a71e96f5 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Wed, 25 Mar 2026 12:08:26 +0200
Subject: [PATCH 1/2] [CIR][AArch64] Lower vfma lane builtins

Lower the AArch64 vfma lane and laneq builtins in CIR codegen.

This adds handling for the vector and scalar vfma lane forms,
including the vfmaq_laneq_v family called out in the issue, and
keeps the CIR builtin structure aligned with the existing AArch64
builtin lowering pattern while preserving the original case order.

The scalar lane forms are dispatched before getNeonType() so the
f16 cases do not fall through the unsupported Poly128 path during
ClangIR lowering.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 66 ++++++++++++++-----
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a3488bfcc3dec..c972e9e12c430 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -139,11 +139,10 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
     [[fallthrough]];
   case NeonTypeFlags::Float16:
-    if (hasLegalHalfType)
+    if (!hasLegalHalfType)
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    [[fallthrough]];
+    return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty,
+                                v1Ty ? 1 : (4 << isQuad));
   case NeonTypeFlags::Int32:
     return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
                                                        : cgf->sInt32Ty,
@@ -628,11 +627,6 @@ static bool hasExtraNeonArgument(unsigned builtinID) {
   case ARM::BI__builtin_arm_vcvtr_d:
     mask = 1;
   }
-  switch (builtinID) {
-  default:
-    break;
-  }
-
   return mask != 0;
 }
 
@@ -2186,6 +2180,23 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  switch (builtinID) {
+  case NEON::BI__builtin_neon_vfmah_lane_f16:
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmah_laneq_f16:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
+    mlir::Type scalarTy = convertType(expr->getType());
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
+                                           fmaOps);
+  }
+  default:
+    break;
+  }
+
   cir::VectorType ty = getNeonType(this, type, loc);
   if (!ty)
     return nullptr;
@@ -2200,13 +2211,36 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vfma_lane_v:
   case NEON::BI__builtin_neon_vfmaq_lane_v:
   case NEON::BI__builtin_neon_vfma_laneq_v:
-  case NEON::BI__builtin_neon_vfmaq_laneq_v:
-  case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64:
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    mlir::Value addend = ops[0];
+    mlir::Value multiplicand = ops[1];
+    mlir::Value laneSource = ops[2];
+    auto vecTy = mlir::cast<cir::VectorType>(ty);
+    auto elemTy = vecTy.getElementType();
+    auto numElts = vecTy.getSize();
+
+    if (addend.getType() != ty)
+      addend = builder.createBitcast(loc, addend, ty);
+    if (multiplicand.getType() != ty)
+      multiplicand = builder.createBitcast(loc, multiplicand, ty);
+
+    cir::VectorType sourceTy = ty;
+    if (builtinID == NEON::BI__builtin_neon_vfmaq_lane_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts / 2);
+    else if (builtinID == NEON::BI__builtin_neon_vfma_laneq_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts * 2);
+
+    if (laneSource.getType() != sourceTy)
+      laneSource = builder.createBitcast(loc, laneSource, sourceTy);
+
+    int64_t lane =
+        expr->getArg(3)->EvaluateKnownConstInt(getContext()).getSExtValue();
+    llvm::SmallVector<int64_t> mask(numElts, lane);
+    mlir::Value splat = builder.createVecShuffle(loc, laneSource, mask);
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vmull_v:
   case NEON::BI__builtin_neon_vmax_v:
   case NEON::BI__builtin_neon_vmaxq_v:

From ae6b618b696899275c900931323b2401499e8cb9 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Wed, 25 Mar 2026 12:08:27 +0200
Subject: [PATCH 2/2] [CIR][AArch64] Add vfma lane tests

Add focused AArch64 NEON tests for the vfma lane and laneq builtins.

The tests cover the vector and scalar forms used by this patch series
and are placed under clang/test/CodeGen/AArch64/neon for CIR-enabled
validation.

The corresponding legacy coverage is removed from the old AArch64 test
files so the new neon tests become the canonical checks for these cases.
---
 clang/test/CodeGen/AArch64/neon-2velem.c      | 225 ------------------
 .../AArch64/neon-scalar-x-indexed-elem.c      |  83 -------
 clang/test/CodeGen/AArch64/neon/vfma-lane.c   | 126 ++++++++++
 .../CodeGen/AArch64/neon/vfma-scalar-lane.c   |  67 ++++++
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 104 --------
 5 files changed, 193 insertions(+), 412 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-lane.c
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c

diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c
index 2bc7212cde9f8..de95a1983f574 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -404,83 +404,6 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
   return vmulq_laneq_u32(a, v, 3);
 }
-
-// CHECK-LABEL: @test_vfma_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
-//
-float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  return vfma_lane_f32(a, b, v, 1);
-}
-
-// CHECK-LABEL: @test_vfmaq_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
-//
-float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  return vfmaq_lane_f32(a, b, v, 1);
-}
-
-// CHECK-LABEL: @test_vfma_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  return vfma_laneq_f32(a, b, v, 3);
-}
-
-// CHECK-LABEL: @test_vfmaq_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  return vfmaq_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -560,46 +483,6 @@ float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vfmsq_laneq_f32(a, b, v, 3);
 }
-
-// CHECK-LABEL: @test_vfmaq_lane_f64(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
-//
-float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  return vfmaq_lane_f64(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfmaq_laneq_f64(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  return vfmaq_laneq_f64(a, b, v, 1);
-}
-
 // CHECK-LABEL: @test_vfmsq_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
@@ -640,17 +523,6 @@ float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
   return vfmsq_laneq_f64(a, b, v, 1);
 }
-
-// CHECK-LABEL: @test_vfmas_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  return vfmas_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfmsd_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
@@ -2547,83 +2419,6 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
   return vmulq_laneq_u32(a, v, 0);
 }
-
-// CHECK-LABEL: @test_vfma_lane_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
-//
-float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  return vfma_lane_f32(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfmaq_lane_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
-//
-float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  return vfmaq_lane_f32(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfma_laneq_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  return vfma_laneq_f32(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  return vfmaq_laneq_f32(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -2703,26 +2498,6 @@ float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vfmsq_laneq_f32(a, b, v, 0);
 }
-
-// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  return vfmaq_laneq_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 9b98126500444..d4f1abb0adb27 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -146,41 +146,6 @@ float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
 }
-
-
-// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
-// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
-  return vfmas_lane_f32(a, b, c, 1);
-}
-
-// CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
-// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0
-// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
-// CHECK-NEXT:    ret double [[TMP0]]
-//
-float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
-  return vfmad_lane_f64(a, b, c, 0);
-}
-
-// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
-// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
-// CHECK-NEXT:    ret double [[TMP0]]
-//
-float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
-  return vfmad_laneq_f64(a, b, c, 1);
-}
-
 // CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
 // CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -192,30 +157,6 @@ float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
-
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <1 x double> [[FMLA2]]
-//
-float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
-  return vfma_lane_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -239,30 +180,6 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }
-
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP10]]
-//
-float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
-  return vfma_laneq_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
new file mode 100644
index 0000000000000..ca0fe7805ec12
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
@@ -0,0 +1,126 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -disable-O0-optnone -flax-vector-conversions=none           -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir  -o - %s |                               FileCheck %s --check-prefixes=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfma_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+  return vfma_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
+  return vfmaq_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
+  return vfma_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  return vfmaq_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  return vfma_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <1 x double> @llvm.fma.v1f64(
+// CIR-LABEL: @test_vfma_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+  return vfma_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  return vfmaq_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f64(
+// LLVM: @llvm.fma
+// CIR-LABEL: @test_vfma_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
+  return vfma_laneq_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f64(
+// LLVM: shufflevector <2 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  return vfmaq_laneq_f64(a, b, v, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
new file mode 100644
index 0000000000000..42e0c211d6dc6
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
@@ -0,0 +1,67 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -disable-O0-optnone -flax-vector-conversions=none           -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir  -o - %s |                               FileCheck %s --check-prefixes=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfmah_lane_f16(
+// LLVM: extractelement <4 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_lane_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+  return vfmah_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmah_laneq_f16(
+// LLVM: extractelement <8 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_laneq_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+  return vfmah_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmas_lane_f32(
+// LLVM: extractelement <2 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_lane_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// LLVM-LABEL: @test_vfmas_laneq_f32(
+// LLVM: extractelement <4 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_laneq_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmad_lane_f64(
+// LLVM: extractelement <1 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_lane_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+  return vfmad_lane_f64(a, b, c, 0);
+}
+
+// LLVM-LABEL: @test_vfmad_laneq_f64(
+// LLVM: extractelement <2 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_laneq_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+  return vfmad_laneq_f64(a, b, c, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 9c408e8c702fd..b331ae7eb63db 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1679,87 +1679,6 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
 float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmsq_f16(a, b, c);
 }
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x half> [[FMLA2]]
-//
-float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
-  return vfma_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <8 x half> [[FMLA2]]
-//
-float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
-  return vfmaq_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]])
-// CHECK-NEXT:    ret <4 x half> [[TMP9]]
-//
-float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
-  return vfma_laneq_f16(a, b, c, 7);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]])
-// CHECK-NEXT:    ret <8 x half> [[TMP9]]
-//
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
@@ -1809,29 +1728,6 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
 float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
   return vfmaq_n_f16(a, b, c);
 }
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
-// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP0]]
-//
-float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
-  return vfmah_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
-// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
-// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP0]]
-//
-float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
-  return vfmah_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_lane_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:



More information about the cfe-commits mailing list