[clang] [Clang] Add float type support to __builtin_reduce_add and __builtin_reduce_mul (PR #120367)

Mon Jan 6 12:11:30 PST 2025

https://github.com/farzonl updated https://github.com/llvm/llvm-project/pull/120367

>From e46ef6592aec6a1a0b3f7509eb260bb1e9bda692 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Wed, 18 Dec 2024 01:44:42 -0500
Subject: [PATCH 1/3] [Clang] Add float type support to __builtin_reduce_add
 and __builtin_reduce_mul

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  3 +-
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      | 12 ++++++++
 clang/lib/CodeGen/CGBuiltin.cpp               | 29 +++++++++++++++++--
 clang/lib/Sema/SemaChecking.cpp               | 26 +++++++++++++++--
 clang/test/AST/ByteCode/builtin-functions.cpp | 15 ++++++++++
 clang/test/CodeGen/builtins-reduction-math.c  | 21 ++++++++++++++
 clang/test/Sema/builtins-reduction-math.c     | 16 +++++-----
 clang/test/Sema/constant_builtins_vector.cpp  | 12 ++++++++
 8 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 03fb7ca9bc3c3b..ee7a3e48b6421c 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12355,7 +12355,8 @@ def err_builtin_invalid_arg_type: Error <
   "a vector of integers|"
   "an unsigned integer|"
   "an 'int'|"
-  "a vector of floating points}1 (was %2)">;
+  "a vector of floating points|"
+  "a vector of integers or floating points}1 (was %2)">;
 
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 0d52083b069464..5fff1afe4d2a2c 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "../ExprConstShared.h"
 #include "Boolean.h"
+#include "ByteCode/Floating.h"
 #include "Compiler.h"
 #include "EvalEmitter.h"
 #include "Interp.h"
@@ -1756,6 +1757,17 @@ static bool interp__builtin_vector_reduce(InterpState &S, CodePtr OpPC,
   PrimType ElemT = *S.getContext().classify(ElemType);
   unsigned NumElems = Arg.getNumElems();
 
+  if (ElemType->isRealFloatingType()) {
+    if (ID != Builtin::BI__builtin_reduce_add &&
+        ID != Builtin::BI__builtin_reduce_mul)
+      llvm_unreachable("Only reduce_add and reduce_mul are supported for "
+                       "floating-point types.");
+    // Floating-point arithmetic is not valid for constant expression
+    // initialization. Returning false defers checks to integral constant
+    // expression validation, preventing a bad deref of Floating as an integer.
+    return false;
+  }
+
   INT_TYPE_SWITCH_NO_BOOL(ElemT, {
     T Result = Arg.atIndex(0).deref<T>();
     unsigned BitWidth = Result.bitWidth();
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5cd893d70695c8..29ba65a575505e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4314,12 +4314,37 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
   }
 
-  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_add: {
+    // Note: vector_reduce_fadd takes two arguments a
+    // scalar start value and a vector. That would mean to
+    // correctly call it we would need emitBuiltinWithOneOverloadedType<2>
+    // To keep the  builtin sema behavior the same despite type we will
+    // popululate vector_reduce_fadd scalar value with a 0.
+    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+      Value *X = EmitScalarExpr(E->getArg(0));
+      auto EltTy = X->getType()->getScalarType();
+      Value *Seed = ConstantFP::get(EltTy, 0);
+      return RValue::get(Builder.CreateIntrinsic(
+          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fadd,
+          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fadd"));
+    }
+    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
-  case Builtin::BI__builtin_reduce_mul:
+  }
+  case Builtin::BI__builtin_reduce_mul: {
+    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+      Value *X = EmitScalarExpr(E->getArg(0));
+      auto EltTy = X->getType()->getScalarType();
+      Value *Seed = ConstantFP::get(EltTy, 0);
+      return RValue::get(Builder.CreateIntrinsic(
+          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fmul,
+          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fmul"));
+    }
+    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
+  }
   case Builtin::BI__builtin_reduce_xor:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 28dcfaac2e84f5..e89b0652edc2eb 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2886,11 +2886,31 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     TheCall->setType(ElTy);
     break;
   }
+  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_mul: {
+    if (PrepareBuiltinReduceMathOneArgCall(TheCall))
+      return ExprError();
+
+    const Expr *Arg = TheCall->getArg(0);
+    const auto *TyA = Arg->getType()->getAs<VectorType>();
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull()) {
+      Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+          << 1 << /* vector of integers or floating points */ 10
+          << Arg->getType();
+      return ExprError();
+    }
+    TheCall->setType(ElTy);
+    break;
+  }
 
   // These builtins support vectors of integers only.
-  // TODO: ADD/MUL should support floating-point types.
-  case Builtin::BI__builtin_reduce_add:
-  case Builtin::BI__builtin_reduce_mul:
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and: {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 723764010d9a3a..29ea51d619df35 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1056,6 +1056,14 @@ namespace RecuceAdd {
   static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
   static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
 
+  static_assert(__builtin_reduce_add((vector4float){}) == 0.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_add((vector4float){1.1, 2.2, 3.3, 4.4}) == 11.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_add((vector4double){100.1, 200.2, 300.3, 400.4}) == 1001.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+
+
 
 #ifdef __SIZEOF_INT128__
   typedef __int128 v4i128 __attribute__((__vector_size__(128 * 2)));
@@ -1091,6 +1099,13 @@ namespace ReduceMul {
       (~0U - 1));
 #endif
   static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
+
+    static_assert(__builtin_reduce_mul((vector4float){}) == 0.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_mul((vector4float){1.0, 2.0, 3.0, 1.0}) == 6.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_mul((vector4double){3.0, 4.0, 1.0, 1.0}) == 12.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
 }
 
 namespace ReduceAnd {
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index e12fd729c84c0b..35f12ca710e3e3 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve  %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE   %s
 
 typedef float float4 __attribute__((ext_vector_type(4)));
+typedef double double4 __attribute__((ext_vector_type(4)));
 typedef short int si8 __attribute__((ext_vector_type(8)));
 typedef unsigned int u4 __attribute__((ext_vector_type(4)));
 
@@ -61,6 +62,16 @@ void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
   unsigned long long r5 = __builtin_reduce_min(cvi1);
 }
 
+void test_builtin_reduce_addf(float4 vf4, double4 vd4) {          
+  // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  float r2 = __builtin_reduce_add(vf4);
+
+  // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
+  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  double r3 = __builtin_reduce_add(vd4);
+}
+
 void test_builtin_reduce_add(si8 vi1, u4 vu1) {
   // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
   // CHECK-NEXT: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VI1]])
@@ -83,6 +94,16 @@ void test_builtin_reduce_add(si8 vi1, u4 vu1) {
   unsigned long long r5 = __builtin_reduce_add(cvu1);
 }
 
+void test_builtin_reduce_mulf(float4 vf4, double4 vd4) {          
+  // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  float r2 = __builtin_reduce_mul(vf4);
+
+  // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
+  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  double r3 = __builtin_reduce_mul(vd4);
+}
+
 void test_builtin_reduce_mul(si8 vi1, u4 vu1) {
   // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
   // CHECK-NEXT: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[VI1]])
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 9b0d91bfd6e3d2..9e2dac7ebbe6f6 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -36,7 +36,7 @@ void test_builtin_reduce_min(int i, float4 v, int3 iv) {
   // expected-error at -1 {{1st argument must be a vector type (was 'int')}}
 }
 
-void test_builtin_reduce_add(int i, float4 v, int3 iv) {
+void test_builtin_reduce_add(int i, float f, int3 iv) {
   struct Foo s = __builtin_reduce_add(iv);
   // expected-error at -1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
 
@@ -47,13 +47,13 @@ void test_builtin_reduce_add(int i, float4 v, int3 iv) {
   // expected-error at -1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_add(i);
-  // expected-error at -1 {{1st argument must be a vector of integers (was 'int')}}
+  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'int')}}
 
-  i = __builtin_reduce_add(v);
-  // expected-error at -1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+  f = __builtin_reduce_add(f);
+  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'float')}}
 }
 
-void test_builtin_reduce_mul(int i, float4 v, int3 iv) {
+void test_builtin_reduce_mul(int i, float f, int3 iv) {
   struct Foo s = __builtin_reduce_mul(iv);
   // expected-error at -1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
 
@@ -64,10 +64,10 @@ void test_builtin_reduce_mul(int i, float4 v, int3 iv) {
   // expected-error at -1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_mul(i);
-  // expected-error at -1 {{1st argument must be a vector of integers (was 'int')}}
+  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'int')}}
 
-  i = __builtin_reduce_mul(v);
-  // expected-error at -1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+  f = __builtin_reduce_mul(f);
+  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'float')}}
 }
 
 void test_builtin_reduce_xor(int i, float4 v, int3 iv) {
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index 8659fa9e46612d..af35f7f5999890 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -746,6 +746,12 @@ constexpr long long reduceAddLong2 = __builtin_reduce_add((vector4long){(1LL <<
 static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
 static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
 
+constexpr float reduceAddFloat = __builtin_reduce_add((vector4float){1.0, 2.0, 3.0, 4.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
+constexpr double reduceAddDouble = __builtin_reduce_add((vector4double){-1.0, 2.0, -3.0, 4.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
 static_assert(__builtin_reduce_mul((vector4char){}) == 0);
 static_assert(__builtin_reduce_mul((vector4char){1, 2, 3, 4}) == 24);
 static_assert(__builtin_reduce_mul((vector4short){1, 2, 30, 40}) == 2400);
@@ -766,6 +772,12 @@ constexpr long long reduceMulLong2 = __builtin_reduce_mul((vector4long){(1LL <<
 static_assert(__builtin_reduce_mul((vector4uint){~0U, 1, 1, 2}) == ~0U - 1);
 static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
 
+constexpr float reduceMulFloat = __builtin_reduce_mul((vector4float){1.0, 2.0, 3.0, 1.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
+constexpr double reduceMulDouble = __builtin_reduce_mul((vector4double){3.0, 4.0, 1.0, 1.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
 static_assert(__builtin_reduce_and((vector4char){}) == 0);
 static_assert(__builtin_reduce_and((vector4char){(char)0x11, (char)0x22, (char)0x44, (char)0x88}) == 0);
 static_assert(__builtin_reduce_and((vector4short){(short)0x1111, (short)0x2222, (short)0x4444, (short)0x8888}) == 0);

>From f935ecded1461dba945c63dbb75389bb242a91c0 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Thu, 26 Dec 2024 15:38:18 -0500
Subject: [PATCH 2/3] address pr feedback

---
 clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 +-
 clang/lib/AST/ByteCode/InterpBuiltin.cpp         | 1 -
 clang/lib/CodeGen/CGBuiltin.cpp                  | 4 ++--
 clang/test/CodeGen/builtins-reduction-math.c     | 8 ++++----
 clang/test/Sema/builtins-reduction-math.c        | 8 ++++----
 5 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index ee7a3e48b6421c..2fc15c81446b78 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12356,7 +12356,7 @@ def err_builtin_invalid_arg_type: Error <
   "an unsigned integer|"
   "an 'int'|"
   "a vector of floating points|"
-  "a vector of integers or floating points}1 (was %2)">;
+  "a vector of arithmetic element type}1 (was %2)">;
 
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5fff1afe4d2a2c..3a9b6dc303dea9 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 #include "../ExprConstShared.h"
 #include "Boolean.h"
-#include "ByteCode/Floating.h"
 #include "Compiler.h"
 #include "EvalEmitter.h"
 #include "Interp.h"
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 29ba65a575505e..534c9d9a8551b1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4323,7 +4323,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
       Value *X = EmitScalarExpr(E->getArg(0));
       auto EltTy = X->getType()->getScalarType();
-      Value *Seed = ConstantFP::get(EltTy, 0);
+      Value *Seed = ConstantFP::get(EltTy, -0.0);
       return RValue::get(Builder.CreateIntrinsic(
           /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fadd,
           ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fadd"));
@@ -4336,7 +4336,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
       Value *X = EmitScalarExpr(E->getArg(0));
       auto EltTy = X->getType()->getScalarType();
-      Value *Seed = ConstantFP::get(EltTy, 0);
+      Value *Seed = ConstantFP::get(EltTy, 1.0);
       return RValue::get(Builder.CreateIntrinsic(
           /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fmul,
           ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fmul"));
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 35f12ca710e3e3..ffdfb763d8b560 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -64,11 +64,11 @@ void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
 
 void test_builtin_reduce_addf(float4 vf4, double4 vd4) {          
   // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[VF4]])
   float r2 = __builtin_reduce_add(vf4);
 
   // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
-  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[VD4]])
   double r3 = __builtin_reduce_add(vd4);
 }
 
@@ -96,11 +96,11 @@ void test_builtin_reduce_add(si8 vi1, u4 vu1) {
 
 void test_builtin_reduce_mulf(float4 vf4, double4 vd4) {          
   // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[VF4]])
   float r2 = __builtin_reduce_mul(vf4);
 
   // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
-  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[VD4]])
   double r3 = __builtin_reduce_mul(vd4);
 }
 
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 9e2dac7ebbe6f6..dd164b8e422975 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -47,10 +47,10 @@ void test_builtin_reduce_add(int i, float f, int3 iv) {
   // expected-error at -1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_add(i);
-  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'int')}}
+  // expected-error@-1 {{1st argument must be a vector of arithmetic element type (was 'int')}}
 
   f = __builtin_reduce_add(f);
-  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'float')}}
+  // expected-error@-1 {{1st argument must be a vector of arithmetic element type (was 'float')}}
 }
 
 void test_builtin_reduce_mul(int i, float f, int3 iv) {
@@ -64,10 +64,10 @@ void test_builtin_reduce_mul(int i, float f, int3 iv) {
   // expected-error at -1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_mul(i);
-  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'int')}}
+  // expected-error@-1 {{1st argument must be a vector of arithmetic element type (was 'int')}}
 
   f = __builtin_reduce_mul(f);
-  // expected-error at -1 {{1st argument must be a vector of integers or floating points (was 'float')}}
+  // expected-error@-1 {{1st argument must be a vector of arithmetic element type (was 'float')}}
 }
 
 void test_builtin_reduce_xor(int i, float4 v, int3 iv) {

>From a063f631ffe8d5652234eaf472c0199623c7595d Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Mon, 6 Jan 2025 15:10:21 -0500
Subject: [PATCH 3/3] remove Intrinsic::vector_reduce_fadd and
 Intrinsic::vector_reduce_fmul replace with sequential reduction

---
 clang/lib/CodeGen/CGBuiltin.cpp              | 46 +++++++++++---------
 clang/test/CodeGen/builtins-reduction-math.c | 32 ++++++++++++--
 2 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 534c9d9a8551b1..a1bf936f315032 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4315,33 +4315,37 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   }
 
   case Builtin::BI__builtin_reduce_add: {
-    // Note: vector_reduce_fadd takes two arguments a
-    // scalar start value and a vector. That would mean to
-    // correctly call it we would need emitBuiltinWithOneOverloadedType<2>
-    // To keep the  builtin sema behavior the same despite type we will
-    // popululate vector_reduce_fadd scalar value with a 0.
-    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
-      Value *X = EmitScalarExpr(E->getArg(0));
-      auto EltTy = X->getType()->getScalarType();
-      Value *Seed = ConstantFP::get(EltTy, -0.0);
-      return RValue::get(Builder.CreateIntrinsic(
-          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fadd,
-          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fadd"));
+    QualType QT = E->getArg(0)->getType();
+    if (QT->hasFloatingRepresentation()) {
+      Value *Op0 = EmitScalarExpr(E->getArg(0));
+      assert(Op0->getType()->isVectorTy());
+      unsigned VecSize = QT->getAs<VectorType>()->getNumElements();
+      Value *Sum = Builder.CreateExtractElement(Op0, static_cast<uint64_t>(0));
+      for (unsigned I = 1; I < VecSize; I++) {
+        Value *Elt = Builder.CreateExtractElement(Op0, I);
+        Sum = Builder.CreateFAdd(Sum, Elt);
+      }
+      return RValue::get(Sum);
     }
-    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
+    assert(QT->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
   }
   case Builtin::BI__builtin_reduce_mul: {
-    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
-      Value *X = EmitScalarExpr(E->getArg(0));
-      auto EltTy = X->getType()->getScalarType();
-      Value *Seed = ConstantFP::get(EltTy, 1.0);
-      return RValue::get(Builder.CreateIntrinsic(
-          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fmul,
-          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fmul"));
+    QualType QT = E->getArg(0)->getType();
+    if (QT->hasFloatingRepresentation()) {
+      Value *Op0 = EmitScalarExpr(E->getArg(0));
+      assert(Op0->getType()->isVectorTy());
+      unsigned VecSize = QT->getAs<VectorType>()->getNumElements();
+      Value *Product =
+          Builder.CreateExtractElement(Op0, static_cast<uint64_t>(0));
+      for (unsigned I = 1; I < VecSize; I++) {
+        Value *Elt = Builder.CreateExtractElement(Op0, I);
+        Product = Builder.CreateFMul(Product, Elt);
+      }
+      return RValue::get(Product);
     }
-    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
+    assert(QT->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
   }
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index ffdfb763d8b560..d0ab57f41edc24 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -64,11 +64,23 @@ void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
 
 void test_builtin_reduce_addf(float4 vf4, double4 vd4) {          
   // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[VF4]])
+  // CHECK-NEXT: [[ARRF1:%.+]] = extractelement <4 x float> [[VF4]], i64 0
+  // CHECK-NEXT: [[ARRF2:%.+]] = extractelement <4 x float> [[VF4]], i64 1
+  // CHECK-NEXT: [[ADDF1:%.+]] = fadd float [[ARRF1]], [[ARRF2]]
+  // CHECK-NEXT: [[ARRF3:%.+]] = extractelement <4 x float> [[VF4]], i64 2
+  // CHECK-NEXT: [[ADDF2:%.+]] = fadd float [[ADDF1]], [[ARRF3]]
+  // CHECK-NEXT: [[ARRF4:%.+]] = extractelement <4 x float> [[VF4]], i64 3
+  // CHECK-NEXT: [[ADDF3:%.+]] = fadd float [[ADDF2]], [[ARRF4]]
   float r2 = __builtin_reduce_add(vf4);
 
   // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
-  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[VD4]])
+  // CHECK-NEXT: [[ARR1:%.+]] = extractelement <4 x double> [[VD4]], i64 0
+  // CHECK-NEXT: [[ARR2:%.+]] = extractelement <4 x double> [[VD4]], i64 1
+  // CHECK-NEXT: [[ADD1:%.+]] = fadd double [[ARR1]], [[ARR2]]
+  // CHECK-NEXT: [[ARR3:%.+]] = extractelement <4 x double> [[VD4]], i64 2
+  // CHECK-NEXT: [[ADD2:%.+]] = fadd double [[ADD1]], [[ARR3]]
+  // CHECK-NEXT: [[ARR4:%.+]] = extractelement <4 x double> [[VD4]], i64 3
+  // CHECK-NEXT: [[ADD3:%.+]] = fadd double [[ADD2]], [[ARR4]]
   double r3 = __builtin_reduce_add(vd4);
 }
 
@@ -96,11 +108,23 @@ void test_builtin_reduce_add(si8 vi1, u4 vu1) {
 
 void test_builtin_reduce_mulf(float4 vf4, double4 vd4) {          
   // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[VF4]])
+  // CHECK-NEXT: [[ARRF1:%.+]] = extractelement <4 x float> [[VF4]], i64 0
+  // CHECK-NEXT: [[ARRF2:%.+]] = extractelement <4 x float> [[VF4]], i64 1
+  // CHECK-NEXT: [[MULF1:%.+]] = fmul float [[ARRF1]], [[ARRF2]]
+  // CHECK-NEXT: [[ARRF3:%.+]] = extractelement <4 x float> [[VF4]], i64 2
+  // CHECK-NEXT: [[MULF2:%.+]] = fmul float [[MULF1]], [[ARRF3]]
+  // CHECK-NEXT: [[ARRF4:%.+]] = extractelement <4 x float> [[VF4]], i64 3
+  // CHECK-NEXT: [[MULF3:%.+]] = fmul float [[MULF2]], [[ARRF4]]
   float r2 = __builtin_reduce_mul(vf4);
 
   // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
-  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[VD4]])
+  // CHECK-NEXT: [[ARR1:%.+]] = extractelement <4 x double> [[VD4]], i64 0
+  // CHECK-NEXT: [[ARR2:%.+]] = extractelement <4 x double> [[VD4]], i64 1
+  // CHECK-NEXT: [[MUL1:%.+]] = fmul double [[ARR1]], [[ARR2]]
+  // CHECK-NEXT: [[ARR3:%.+]] = extractelement <4 x double> [[VD4]], i64 2
+  // CHECK-NEXT: [[MUL2:%.+]] = fmul double [[MUL1]], [[ARR3]]
+  // CHECK-NEXT: [[ARR4:%.+]] = extractelement <4 x double> [[VD4]], i64 3
+  // CHECK-NEXT: [[MUL3:%.+]] = fmul double [[MUL2]], [[ARR4]]
   double r3 = __builtin_reduce_mul(vd4);
 }