[clang] [CIR][AArch64] Lower vfma lane builtins (PR #188190)

Yair Ben Avraham via cfe-commits cfe-commits at lists.llvm.org
Tue Mar 24 00:49:58 PDT 2026


https://github.com/yairbenavraham updated https://github.com/llvm/llvm-project/pull/188190

>From 5cc336f971d79932ea425da7253a05428a65a610 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Sun, 22 Mar 2026 04:57:07 +0200
Subject: [PATCH 1/3] [CIR] Fix generated type constraint header dependencies

Add the missing MLIRCIRTypeConstraintsIncGen dependencies in the
CIR dialect and lowering CMake targets, and switch the target from
add_public_tablegen_target to add_mlir_generic_tablegen_target, so
clean CIR-enabled builds generate the required headers before the
lowering libraries are compiled.
---
 clang/include/clang/CIR/Dialect/IR/CMakeLists.txt  | 2 +-
 clang/lib/CIR/Lowering/CMakeLists.txt              | 3 +++
 clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
index 870f9e3f5d052..1388e5bc612f2 100644
--- a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
@@ -27,5 +27,5 @@ clang_tablegen(CIRLowering.inc -gen-cir-lowering
 set(LLVM_TARGET_DEFINITIONS CIRTypeConstraints.td)
 mlir_tablegen(CIRTypeConstraints.h.inc -gen-type-constraint-decls)
 mlir_tablegen(CIRTypeConstraints.cpp.inc -gen-type-constraint-defs)
-add_public_tablegen_target(MLIRCIRTypeConstraintsIncGen)
+add_mlir_generic_tablegen_target(MLIRCIRTypeConstraintsIncGen)
 add_dependencies(mlir-headers MLIRCIRTypeConstraintsIncGen)
diff --git a/clang/lib/CIR/Lowering/CMakeLists.txt b/clang/lib/CIR/Lowering/CMakeLists.txt
index 28ec3c551018c..77d28ef72d11d 100644
--- a/clang/lib/CIR/Lowering/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/CMakeLists.txt
@@ -9,6 +9,9 @@ add_clang_library(clangCIRLoweringCommon
   CIRPasses.cpp
   LoweringHelpers.cpp
 
+  DEPENDS
+  MLIRCIRTypeConstraintsIncGen
+
   LINK_LIBS
   clangCIR
   ${dialect_libs}
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
index c7467fe40ba30..5b197ddca12c0 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(clangCIRLoweringDirectToLLVM
   MLIRCIREnumsGen
   MLIRCIROpsIncGen
   MLIRCIROpInterfacesIncGen
+  MLIRCIRTypeConstraintsIncGen
 
   LINK_LIBS
   clangCIRLoweringCommon

>From a674c9106368033a28229601026f11815ae69648 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Mon, 23 Mar 2026 17:55:19 +0200
Subject: [PATCH 2/3] [CIR][AArch64] Lower vfma lane builtins

Lower the AArch64 vfma lane and laneq builtins in CIR codegen.

This adds handling for the vector and scalar vfma lane forms,
including the vfmaq_laneq_v family called out in the issue, and
keeps the CIR builtin structure aligned with the existing AArch64
builtin lowering pattern.

The patch also applies clang-format to the touched lines so the
implementation matches the repository's clang-format style.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 67 +++++++++++++++----
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a3488bfcc3dec..2f1e974927fd8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -139,11 +139,10 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
     [[fallthrough]];
   case NeonTypeFlags::Float16:
-    if (hasLegalHalfType)
+    if (!hasLegalHalfType)
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    [[fallthrough]];
+    return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty,
+                                v1Ty ? 1 : (4 << isQuad));
   case NeonTypeFlags::Int32:
     return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
                                                        : cgf->sInt32Ty,
@@ -2186,6 +2185,23 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  switch (builtinID) {
+  case NEON::BI__builtin_neon_vfmah_lane_f16:
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmah_laneq_f16:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
+    mlir::Type scalarTy = convertType(expr->getType());
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
+                                           fmaOps);
+  }
+  default:
+    break;
+  }
+
   cir::VectorType ty = getNeonType(this, type, loc);
   if (!ty)
     return nullptr;
@@ -2197,16 +2213,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return std::nullopt;
   case NEON::BI__builtin_neon_vbsl_v:
   case NEON::BI__builtin_neon_vbslq_v:
-  case NEON::BI__builtin_neon_vfma_lane_v:
-  case NEON::BI__builtin_neon_vfmaq_lane_v:
-  case NEON::BI__builtin_neon_vfma_laneq_v:
-  case NEON::BI__builtin_neon_vfmaq_laneq_v:
-  case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64:
   case NEON::BI__builtin_neon_vmull_v:
   case NEON::BI__builtin_neon_vmax_v:
   case NEON::BI__builtin_neon_vmaxq_v:
@@ -2224,6 +2230,39 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     if (cir::isFPOrVectorOfFPType(ty))
       intrName = "aarch64.neon.fabd";
     return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc);
+  case NEON::BI__builtin_neon_vfma_lane_v:
+  case NEON::BI__builtin_neon_vfmaq_lane_v:
+  case NEON::BI__builtin_neon_vfma_laneq_v:
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    mlir::Value addend = ops[0];
+    mlir::Value multiplicand = ops[1];
+    mlir::Value laneSource = ops[2];
+    auto vecTy = mlir::cast<cir::VectorType>(ty);
+    auto elemTy = vecTy.getElementType();
+    auto numElts = vecTy.getSize();
+
+    if (addend.getType() != ty)
+      addend = builder.createBitcast(loc, addend, ty);
+    if (multiplicand.getType() != ty)
+      multiplicand = builder.createBitcast(loc, multiplicand, ty);
+
+    cir::VectorType sourceTy = ty;
+    if (builtinID == NEON::BI__builtin_neon_vfmaq_lane_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts / 2);
+    else if (builtinID == NEON::BI__builtin_neon_vfma_laneq_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts * 2);
+
+    if (laneSource.getType() != sourceTy)
+      laneSource = builder.createBitcast(loc, laneSource, sourceTy);
+
+    int64_t lane =
+        expr->getArg(3)->EvaluateKnownConstInt(getContext()).getSExtValue();
+    llvm::SmallVector<int64_t> mask(numElts, lane);
+    mlir::Value splat = builder.createVecShuffle(loc, laneSource, mask);
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vpadal_v:
   case NEON::BI__builtin_neon_vpadalq_v:
   case NEON::BI__builtin_neon_vpmin_v:

>From e9d58588d750adabae1ec9371872a198a2712303 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <yairba at protonmail.com>
Date: Mon, 23 Mar 2026 17:55:19 +0200
Subject: [PATCH 3/3] [CIR][AArch64] Add vfma lane tests

Add focused AArch64 NEON tests for the vfma lane and laneq builtins.

The tests cover the vector and scalar forms used by this patch series
and are placed under clang/test/CodeGen/AArch64/neon for CIR-enabled
validation.
---
 clang/test/CodeGen/AArch64/neon/vfma-lane.c   | 136 ++++++++++++++++++
 .../CodeGen/AArch64/neon/vfma-scalar-lane.c   |  77 ++++++++++
 2 files changed, 213 insertions(+)
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-lane.c
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c

diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
new file mode 100644
index 0000000000000..955ab411793b9
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
@@ -0,0 +1,136 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +fullfp16 -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfma_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+  return vfma_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
+  return vfmaq_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
+  return vfma_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  return vfmaq_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  return vfma_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <1 x double> @llvm.fma.v1f64(
+// CIR-LABEL: @test_vfma_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+  return vfma_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  return vfmaq_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f64(
+// LLVM: @llvm.fma
+// CIR-LABEL: @test_vfma_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
+  return vfma_laneq_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f64(
+// LLVM: shufflevector <2 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  return vfmaq_laneq_f64(a, b, v, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
new file mode 100644
index 0000000000000..53fc9761e01a0
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
@@ -0,0 +1,77 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +fullfp16 -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfmah_lane_f16(
+// LLVM: extractelement <4 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_lane_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+  return vfmah_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmah_laneq_f16(
+// LLVM: extractelement <8 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_laneq_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+  return vfmah_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmas_lane_f32(
+// LLVM: extractelement <2 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_lane_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// LLVM-LABEL: @test_vfmas_laneq_f32(
+// LLVM: extractelement <4 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_laneq_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmad_lane_f64(
+// LLVM: extractelement <1 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_lane_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+  return vfmad_lane_f64(a, b, c, 0);
+}
+
+// LLVM-LABEL: @test_vfmad_laneq_f64(
+// LLVM: extractelement <2 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_laneq_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+  return vfmad_laneq_f64(a, b, c, 1);
+}



More information about the cfe-commits mailing list