[clang] [X86] Add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr (PR #156822)

Fri Sep 26 00:00:42 PDT 2025

================
@@ -2742,6 +2742,86 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+/// Evaluates a two-operand "horizontal" integer vector builtin (PHADD/PHSUB
+/// style) in the bytecode interpreter.
+///
+/// \p Fn is applied to every adjacent pair of elements (2k, 2k+1) of a source
+/// vector and yields one destination element per pair. The operands are
+/// processed in chunks of at most 128 bits ("lanes"): for each lane the pair
+/// results of the first operand are written first, followed by the pair
+/// results of the second operand. For vectors of 128 bits or fewer this is
+/// simply "all LHS pairs, then all RHS pairs"; for wider vectors it matches
+/// the per-lane behaviour of the 256-bit x86 horizontal instructions.
+///
+/// Both vector operands are popped from the stack (RHS first) and the result
+/// is written into the destination pointer that is left on top of the stack.
+///
+/// \param S    Interpreter state providing the stack and type contexts.
+/// \param OpPC Current code pointer (unused here).
+/// \param Call The builtin call; must have exactly two vector arguments with
+///             the same, even number of integral elements.
+/// \param Fn   Pairwise operation producing the raw result bits.
+/// \returns true (this evaluation has no failure path).
+static bool interp_builtin_horizontal_int_binop(
+    InterpState &S, CodePtr OpPC, const CallExpr *Call,
+    llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
+  assert(Call->getNumArgs() == 2);
+
+  assert(Call->getArg(0)->getType()->isVectorType() &&
+         Call->getArg(1)->getType()->isVectorType());
+  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+  assert(VT->getElementType()->isIntegralOrEnumerationType());
+  PrimType ElemT = *S.getContext().classify(VT->getElementType());
+  bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+  // Operands were pushed left-to-right, so the right-hand side comes off
+  // first. The destination stays on the stack for the caller.
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  unsigned SourceLen = VT->getNumElements();
+  assert(SourceLen % 2 == 0 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             SourceLen);
+
+  // Horizontal operations never cross a 128-bit lane boundary. Vectors that
+  // are narrower than one lane are handled as a single (short) lane.
+  unsigned EltBits = S.getASTContext().getIntWidth(VT->getElementType());
+  unsigned MaxEltsPerLane = 128 / EltBits;
+  unsigned EltsPerLane =
+      MaxEltsPerLane < SourceLen ? MaxEltsPerLane : SourceLen;
+  assert(EltsPerLane != 0 && EltsPerLane % 2 == 0 &&
+         SourceLen % EltsPerLane == 0);
+
+  unsigned DstElem = 0;
+
+  // Combines the adjacent pairs of one lane of Src and appends the results
+  // to the destination vector.
+  auto ReduceLanePairs = [&](const Pointer &Src, unsigned LaneStart) {
+    for (unsigned I = 0; I != EltsPerLane; I += 2) {
+      INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+        APSInt Elem1 = Src.elem<T>(LaneStart + I).toAPSInt();
+        APSInt Elem2 = Src.elem<T>(LaneStart + I + 1).toAPSInt();
+        Dst.elem<T>(DstElem) =
+            static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
+      });
+      ++DstElem;
+    }
+  };
+
+  for (unsigned LaneStart = 0; LaneStart != SourceLen;
+       LaneStart += EltsPerLane) {
+    ReduceLanePairs(LHS, LaneStart);
+    ReduceLanePairs(RHS, LaneStart);
+  }
+
+  Dst.initializeAllElements();
+  return true;
+}
+
+static bool interp_builtin_horizontal_fp_binop(
+    InterpState &S, CodePtr OpPC, const CallExpr *Call,
+    llvm::function_ref<APFloat(const APFloat &, const APFloat &,
+                               llvm::RoundingMode)>
+        Fn) {
+  assert(Call->getNumArgs() == 2);
+  assert(Call->getArg(0)->getType()->isVectorType() &&
+         Call->getArg(1)->getType()->isVectorType());
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
+  llvm::RoundingMode RM = getRoundingMode(FPO);
+  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+  unsigned SourceLen = VT->getNumElements();
+  assert(SourceLen % 2 == 0 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             SourceLen);
+  unsigned DstElem = 0;
+  for (unsigned I = 0; I != SourceLen; I += 2) {
+    using T = PrimConv<PT_Float>::T;
+    APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
+    APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
+    Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
----------------
tbaederr wrote:

`Fn` already returns an `APFloat`, doesn't it? Why the extra constructor call here?

https://github.com/llvm/llvm-project/pull/156822