[clang] [CIR][AArch64] Lower NEON laneq FMA builtins (PR #202337)

Sun Jun 14 20:18:08 PDT 2026

https://github.com/yairbenavraham updated https://github.com/llvm/llvm-project/pull/202337

>From 6f06ebbfb4223e595c97d181828e2e3da0edcc04 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Thu, 4 Jun 2026 18:27:43 +0300
Subject: [PATCH 1/4] [CIR][AArch64] Lower vfmaq_laneq_v builtin

Lower BI__builtin_neon_vfmaq_laneq_v in CIR.

Bitcast the operands, splat the selected lane, and emit llvm.fma.

Move vfmaq_laneq ACLE coverage into the CIR-enabled fused multiply tests.

Remove the replaced old-style checks.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 10 +-
 clang/test/CodeGen/AArch64/neon-2velem.c      | 76 ---------------
 .../AArch64/neon/fused-multiple-fullfp16.c    | 26 +++++-
 .../CodeGen/AArch64/neon/fused-multiply.c     | 92 ++++++++++++++++++-
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 20 ----
 5 files changed, 123 insertions(+), 101 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index be906d0671e3a..e3556c60bc9a9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2742,7 +2742,15 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     llvm::SmallVector<mlir::Value> fmaOps = {laneSource, multiplicand, addend};
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
-  case NEON::BI__builtin_neon_vfmaq_laneq_v:
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    mlir::Value addend = builder.createBitcast(ops[0], ty);
+    mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
+    mlir::Value laneSource = builder.createBitcast(ops[2], ty);
+    laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
+
+    llvm::SmallVector<mlir::Value> fmaOps = {laneSource, multiplicand, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
   case NEON::BI__builtin_neon_vfmas_lane_f32:
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c
index 89fdb979d8a98..c7eca2d8426c6 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -424,25 +424,6 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 1);
 }
 
-// CHECK-LABEL: @test_vfmaq_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  return vfmaq_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -523,25 +504,6 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vfmsq_laneq_f32(a, b, v, 3);
 }
 
-// CHECK-LABEL: @test_vfmaq_laneq_f64(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  return vfmaq_laneq_f64(a, b, v, 1);
-}
-
 // CHECK-LABEL: @test_vfmsq_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
@@ -2509,25 +2471,6 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  return vfmaq_laneq_f32(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -2608,25 +2551,6 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
   return vfmsq_laneq_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  return vfmaq_laneq_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index d3d6b52358678..1460fb3b2bae1 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -13,8 +13,8 @@
 //  * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f16 wrappers that lower through
-// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and
-// BI__builtin_neon_vfma_laneq_v.
+// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
+// BI__builtin_neon_vfmaq_laneq_v, and BI__builtin_neon_vfma_laneq_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
@@ -67,6 +67,28 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
+// ALL-LABEL: @test_vfmaq_laneq_f16(
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
+                                  float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> {{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
+// LLVM:      ret <8 x half> [[FMA]]
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
+
 // ALL-LABEL: @test_vfma_laneq_f16(
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
                                  float16x8_t c) {
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index a1e3c6eeea2f2..9ead1c53dcc34 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -13,8 +13,8 @@
 //  * clang/test/CodeGen/AArch64/neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f32/f64 wrappers that lower through
-// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and
-// BI__builtin_neon_vfma_laneq_v.
+// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
+// BI__builtin_neon_vfmaq_laneq_v, and BI__builtin_neon_vfma_laneq_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
@@ -106,6 +106,50 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
   return vfmaq_lane_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmaq_laneq_f32(
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b,
+                                  float32x4_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
+// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]])
+// LLVM:      ret <4 x float> [[FMA]]
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// ALL-LABEL: @test_vfmaq_laneq_f64(
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b,
+                                  float64x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>
+
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
+// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]])
+// LLVM:      ret <2 x double> [[FMA]]
+  return vfmaq_laneq_f64(a, b, v, 1);
+}
+
 // ALL-LABEL: @test_vfma_laneq_f32(
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !cir.float>
@@ -152,6 +196,50 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b,
   return vfma_laneq_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmaq_laneq_f32_0(
+float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b,
+                                    float32x4_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
+// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> zeroinitializer
+// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]])
+// LLVM:      ret <4 x float> [[FMA]]
+  return vfmaq_laneq_f32(a, b, v, 0);
+}
+
+// ALL-LABEL: @test_vfmaq_laneq_f64_0(
+float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b,
+                                    float64x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>
+
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
+// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> zeroinitializer
+// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]])
+// LLVM:      ret <2 x double> [[FMA]]
+  return vfmaq_laneq_f64(a, b, v, 0);
+}
+
 // ALL-LABEL: @test_vfma_laneq_f32_0(
 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b,
                                    float32x4_t v) {
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index e8f1eead2a0d5..b25b5fe90bfc5 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1681,26 +1681,6 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfma_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]])
-// CHECK-NEXT:    ret <8 x half> [[TMP9]]
-//
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From 47e149483322f90ffdd6fe6e5dc877240c232eb9 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Fri, 5 Jun 2026 11:22:48 +0300
Subject: [PATCH 2/4] [CIR][AArch64] Lower vfmad_laneq_f64 builtin

Lower BI__builtin_neon_vfmad_laneq_f64 in CIR.

Extract the selected lane from the float64x2_t source.

Emit llvm.fma for the scalar double result.

Move the ACLE coverage into the CIR-enabled fused multiply test.

Remove the replaced old-style check.
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp   | 15 ++++++++++++++-
 .../CodeGen/AArch64/neon-scalar-x-indexed-elem.c | 11 -----------
 clang/test/CodeGen/AArch64/neon/fused-multiply.c | 16 +++++++++++++++-
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index e3556c60bc9a9..7c37375d22c77 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2756,11 +2756,24 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
   case NEON::BI__builtin_neon_vfmas_laneq_f32:
   case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64:
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value addend = builder.createBitcast(ops[0], cgm.doubleTy);
+    mlir::Value multiplicand = builder.createBitcast(ops[1], cgm.doubleTy);
+    // The laneq source operand is float64x2_t, so the source vector has two
+    // double lanes.
+    cir::VectorType sourceTy = cir::VectorType::get(cgm.doubleTy, 2);
+    mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
+    laneSource = builder.createExtractElement(
+        loc, laneSource, static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, laneSource, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", cgm.doubleTy,
+                                           fmaOps);
+  }
   case NEON::BI__builtin_neon_vmull_v: {
     intrName = usgn ? "aarch64.neon.umull" : "aarch64.neon.smull";
     if (type.isPoly())
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index b464bccdbf9ec..fdb772a79e973 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -170,17 +170,6 @@ float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
 }
 
-// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
-// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
-// CHECK-NEXT:    ret double [[TMP0]]
-//
-float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
-  return vfmad_laneq_f64(a, b, c, 1);
-}
-
 // CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
 // CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 9ead1c53dcc34..c0b1932126a23 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -11,10 +11,12 @@
 //
 // This file contains tests that were originally located in:
 //  * clang/test/CodeGen/AArch64/neon-intrinsics.c
+//  * clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f32/f64 wrappers that lower through
 // BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
-// BI__builtin_neon_vfmaq_laneq_v, and BI__builtin_neon_vfma_laneq_v.
+// BI__builtin_neon_vfmaq_laneq_v, BI__builtin_neon_vfma_laneq_v,
+// and BI__builtin_neon_vfmad_laneq_f64.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
@@ -196,6 +198,18 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b,
   return vfma_laneq_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmad_laneq_f64(
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.double, !cir.double, !cir.double) -> !cir.double
+
+// LLVM-SAME: double {{.*}} [[A:%.*]], double {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <2 x double> [[C]], i{{32|64}} 1
+// LLVM:      [[FMA:%.*]] = call double @llvm.fma.f64(double [[B]], double [[LANE]], double [[A]])
+// LLVM:      ret double [[FMA]]
+  return vfmad_laneq_f64(a, b, c, 1);
+}
+
 // ALL-LABEL: @test_vfmaq_laneq_f32_0(
 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b,
                                     float32x4_t v) {

>From d7c5d73f983a41bf2edeaa1825fe5b6be9ee784f Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Thu, 11 Jun 2026 08:00:38 +0300
Subject: [PATCH 3/4] [CIR][AArch64] Simplify vfmad_laneq_f64 lowering

Use the scalar operands directly when lowering the vfmad_laneq_f64 builtin.

The addend and multiplicand are already double values. The lane source is
already float64x2_t, so the extra bitcasts are unnecessary.
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 7c37375d22c77..1a9b3567d75bb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2761,16 +2761,12 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
-    mlir::Value addend = builder.createBitcast(ops[0], cgm.doubleTy);
-    mlir::Value multiplicand = builder.createBitcast(ops[1], cgm.doubleTy);
     // The laneq source operand is float64x2_t, so the source vector has two
     // double lanes.
-    cir::VectorType sourceTy = cir::VectorType::get(cgm.doubleTy, 2);
-    mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
-    laneSource = builder.createExtractElement(
-        loc, laneSource, static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
 
-    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, laneSource, addend};
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", cgm.doubleTy,
                                            fmaOps);
   }

>From a5609477120aee2a04e1256a63df6ab787a6f5b0 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Mon, 15 Jun 2026 06:12:38 +0300
Subject: [PATCH 4/4] [CIR][AArch64] Reorder fused multiply tests

Reorder the laneq test blocks to match the CIR builtin family order.

Update the file note to match the test order.
---
 .../CodeGen/AArch64/neon/fused-multiply.c     | 146 +++++++++---------
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index c0b1932126a23..4e30b063064aa 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -15,7 +15,7 @@
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f32/f64 wrappers that lower through
 // BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
-// BI__builtin_neon_vfmaq_laneq_v, BI__builtin_neon_vfma_laneq_v,
+// BI__builtin_neon_vfma_laneq_v, BI__builtin_neon_vfmaq_laneq_v,
 // and BI__builtin_neon_vfmad_laneq_f64.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
@@ -108,50 +108,6 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
   return vfmaq_lane_f64(a, b, v, 0);
 }
 
-// ALL-LABEL: @test_vfmaq_laneq_f32(
-float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b,
-                                  float32x4_t v) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.float>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>
-
-// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
-// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
-// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]])
-// LLVM:      ret <4 x float> [[FMA]]
-  return vfmaq_laneq_f32(a, b, v, 3);
-}
-
-// ALL-LABEL: @test_vfmaq_laneq_f64(
-float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b,
-                                  float64x2_t v) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.double>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>
-
-// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
-// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
-// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> <i32 1, i32 1>
-// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]])
-// LLVM:      ret <2 x double> [[FMA]]
-  return vfmaq_laneq_f64(a, b, v, 1);
-}
-
 // ALL-LABEL: @test_vfma_laneq_f32(
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !cir.float>
@@ -198,16 +154,70 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b,
   return vfma_laneq_f64(a, b, v, 0);
 }
 
-// ALL-LABEL: @test_vfmad_laneq_f64(
-float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
-// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.double>
-// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.double, !cir.double, !cir.double) -> !cir.double
+// ALL-LABEL: @test_vfma_laneq_f32_0(
+float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b,
+                                   float32x4_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>) -> !cir.vector<2 x !cir.float>
 
-// LLVM-SAME: double {{.*}} [[A:%.*]], double {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[LANE:%.*]] = extractelement <2 x double> [[C]], i{{32|64}} 1
-// LLVM:      [[FMA:%.*]] = call double @llvm.fma.f64(double [[B]], double [[LANE]], double [[A]])
-// LLVM:      ret double [[FMA]]
-  return vfmad_laneq_f64(a, b, c, 1);
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i32> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i32> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <8 x i8> [[A_BYTES]] to <2 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <8 x i8> [[B_BYTES]] to <2 x float>
+// LLVM:      [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <2 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[B_CAST]], <2 x float> [[A_CAST]])
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_laneq_f32(a, b, v, 0);
+}
+
+// ALL-LABEL: @test_vfmaq_laneq_f32(
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b,
+                                  float32x4_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
+// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]])
+// LLVM:      ret <4 x float> [[FMA]]
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// ALL-LABEL: @test_vfmaq_laneq_f64(
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b,
+                                  float64x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>
+
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
+// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]])
+// LLVM:      ret <2 x double> [[FMA]]
+  return vfmaq_laneq_f64(a, b, v, 1);
 }
 
 // ALL-LABEL: @test_vfmaq_laneq_f32_0(
@@ -254,24 +264,14 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b,
   return vfmaq_laneq_f64(a, b, v, 0);
 }
 
-// ALL-LABEL: @test_vfma_laneq_f32_0(
-float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b,
-                                   float32x4_t v) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.float>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>) -> !cir.vector<2 x !cir.float>
+// ALL-LABEL: @test_vfmad_laneq_f64(
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.double, !cir.double, !cir.double) -> !cir.double
 
-// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
-// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i32> [[A_I]] to <8 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i32> [[B_I]] to <8 x i8>
-// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <8 x i8> [[A_BYTES]] to <2 x float>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <8 x i8> [[B_BYTES]] to <2 x float>
-// LLVM:      [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <2 x i32> zeroinitializer
-// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[B_CAST]], <2 x float> [[A_CAST]])
-// LLVM:      ret <2 x float> [[FMA]]
-  return vfma_laneq_f32(a, b, v, 0);
+// LLVM-SAME: double {{.*}} [[A:%.*]], double {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <2 x double> [[C]], i{{32|64}} 1
+// LLVM:      [[FMA:%.*]] = call double @llvm.fma.f64(double [[B]], double [[LANE]], double [[A]])
+// LLVM:      ret double [[FMA]]
+  return vfmad_laneq_f64(a, b, c, 1);
 }