[clang] [CIR][AArch64] Lower vfmaq_v f32/f64 (PR #195602)
Yair Ben Avraham via cfe-commits
cfe-commits at lists.llvm.org
Mon May 4 00:21:46 PDT 2026
https://github.com/yairbenavraham created https://github.com/llvm/llvm-project/pull/195602
Lower `BI__builtin_neon_vfmaq_v` in CIR for the `vfmaq_f32` and `vfmaq_f64` ACLE wrappers.
This is split out from the broader fused multiply-accumulate work and only covers `BI__builtin_neon_vfmaq_v`. The related `vfma_v`, `vfmaq_f16`, lane, laneq, and scalar forms remain outside this PR.
Tests move the existing `vfmaq_f32` and `vfmaq_f64` coverage from `neon-intrinsics.c` into `neon/vfmaq.c`, preserve the original LLVM checks, and add ClangIR coverage.
Validation: rebuilt `clang` and ran the focused `vfmaq.c` lit test.
Part of #185382
Split from feedback on #188190
>From 5047649698cd34eaa6e824405782991d66bf2234 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Wed, 29 Apr 2026 19:24:37 +0300
Subject: [PATCH] [CIR][AArch64] Lower vfmaq_v f32/f64
Lower BI__builtin_neon_vfmaq_v for the vfmaq_f32 and vfmaq_f64 wrappers
through the LLVM fma intrinsic.
Keep vfma_v and vfmaq_f16 outside this focused split.
Move the replaced vfmaq_f32 and vfmaq_f64 tests into neon/vfmaq.c with
CIR coverage.
---
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 23 ++++++-
clang/test/CodeGen/AArch64/neon-intrinsics.c | 38 -----------
clang/test/CodeGen/AArch64/neon/vfmaq.c | 65 +++++++++++++++++++
3 files changed, 87 insertions(+), 39 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/neon/vfmaq.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 834f66586833b..349d6c837af12 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -501,6 +501,7 @@ static mlir::Value emitCommonNeonBuiltinExpr(
// Determine the type of this overloaded NEON intrinsic.
NeonTypeFlags neonType(neonTypeConst->getZExtValue());
+
const bool isUnsigned = neonType.isUnsigned();
const bool hasLegalHalfType = cgf.getTarget().hasFastHalfType();
const bool usgn = neonType.isUnsigned();
@@ -677,7 +678,20 @@ static mlir::Value emitCommonNeonBuiltinExpr(
case NEON::BI__builtin_neon_vext_v:
case NEON::BI__builtin_neon_vextq_v:
case NEON::BI__builtin_neon_vfma_v:
- case NEON::BI__builtin_neon_vfmaq_v:
+ cgf.cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ ctx.BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
+ case NEON::BI__builtin_neon_vfmaq_v: {
+ mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
+ mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
+ mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
+ llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0};
+ return cir::LLVMIntrinsicCallOp::create(
+ cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"),
+ ty, fmaOps)
+ .getResult();
+ }
case NEON::BI__builtin_neon_vld1_v:
case NEON::BI__builtin_neon_vld1q_v:
case NEON::BI__builtin_neon_vld1_x2_v:
@@ -2092,6 +2106,13 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
return mlir::Value{};
}
+ if (builtinID == NEON::BI__builtin_neon_vfmaq_f16) {
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
+ }
+
// Handle MSVC intrinsics before argument evaluation to prevent double
// evaluation.
assert(!cir::MissingFeatures::msvcBuiltins());
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 784d9624823d5..64bbf3e90d675 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -890,44 +890,6 @@ float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vfma_f32(v1, v2, v3);
}
-// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
-// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
-// CHECK-NEXT: ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
- return vfmaq_f32(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
-// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]])
-// CHECK-NEXT: ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
- return vfmaq_f64(v1, v2, v3);
-}
-
// CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c b/clang/test/CodeGen/AArch64/neon/vfmaq.c
new file mode 100644
index 0000000000000..54bc9d1a2cc5c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfmaq.c
@@ -0,0 +1,65 @@
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %}
+
+//=============================================================================
+// NOTES
+//
+// This file contains tests that were originally located in:
+// * clang/test/CodeGen/AArch64/neon-intrinsics.c
+// The main difference is the use of RUN lines that enable ClangIR lowering.
+// This file currently covers the f32/f64 wrappers that lower through
+// BI__builtin_neon_vfmaq_v.
+//
+// ACLE section headings based on v2025Q2 of the ACLE specification:
+// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
+//
+//=============================================================================
+
+#include <arm_neon.h>
+
+//===------------------------------------------------------===//
+// Fused multiply-accumulate, vector quad forms
+//===------------------------------------------------------===//
+
+// CIR-LABEL: @vfmaq_f32(
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-LABEL: @test_vfmaq_f32(
+float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x float> [[C]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i32> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <4 x float>
+// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B_CAST]], <4 x float> [[C_CAST]], <4 x float> [[A_CAST]])
+// LLVM-NEXT: ret <4 x float> [[FMA]]
+ return vfmaq_f32(a, b, c);
+}
+
+// CIR-LABEL: @vfmaq_f64(
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>
+
+// LLVM-LABEL: @test_vfmaq_f64(
+float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <2 x double> [[C]] to <2 x i64>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <2 x i64> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <2 x double>
+// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B_CAST]], <2 x double> [[C_CAST]], <2 x double> [[A_CAST]])
+// LLVM-NEXT: ret <2 x double> [[FMA]]
+ return vfmaq_f64(a, b, c);
+}
More information about the cfe-commits
mailing list