[clang] [llvm] Emit constrained atan2 intrinsic for clang builtin (PR #113636)

Tex Riddell via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 11 18:45:49 PST 2024


https://github.com/tex3d updated https://github.com/llvm/llvm-project/pull/113636

>From 661bd4ceba1e60bc12e1e85bffc53edfd13f5494 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Tue, 15 Oct 2024 16:18:44 -0700
Subject: [PATCH 1/6] Emit constrained atan2 intrinsic for clang builtin

This change is part of this proposal: https://discourse.llvm.org/t/rfc-all-the-math-intrinsics/78294

- `Builtins.td` - Add f16 support for the libm atan2 builtin
- `CGBuiltin.cpp` - Emit constrained atan2 intrinsic for the clang builtin

Part of "Implement the atan2 HLSL Function" (#70096).
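
As a quick illustration (a hypothetical snippet, not part of the patch; the
exact IR depends on the target, `-fmath-errno`, and the strict FP mode
exercised by the updated tests below):

  // atan2_example.c (hypothetical)
  float f(float y, float x) {
    // Default, when math errno can be ignored, lowers to the intrinsic:
    //   call float @llvm.atan2.f32(float %y, float %x)
    // With -ffp-exception-behavior=strict, the constrained variant is emitted:
    //   call float @llvm.experimental.constrained.atan2.f32(float %y, float %x,
    //     metadata !"round.tonearest", metadata !"fpexcept.strict")
    // With -fmath-errno, the libcall to atan2f is kept.
    return __builtin_atan2f(y, x);
  }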
---
 clang/include/clang/Basic/Builtins.td         |  6 +++---
 clang/lib/CodeGen/CGBuiltin.cpp               | 11 ++++++++++
 clang/test/CodeGen/X86/math-builtins.c        | 14 ++++++-------
 .../test/CodeGen/constrained-math-builtins.c  |  7 +++++++
 clang/test/CodeGen/libcalls.c                 |  7 +++----
 clang/test/CodeGen/math-libcalls.c            | 20 +++++++++----------
 .../test/CodeGenCXX/builtin-calling-conv.cpp  | 10 +++++-----
 clang/test/CodeGenOpenCL/builtins-f16.cl      |  3 +++
 8 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 4360e0bf9840f1..e866605ac05c09 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -227,10 +227,10 @@ def FminimumNumF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T, T)";
 }
 
-def Atan2F128 : Builtin {
-  let Spellings = ["__builtin_atan2f128"];
+def Atan2F16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_atan2"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128, __float128)";
+  let Prototype = "T(T, T)";
 }
 
 def CopysignF16 : Builtin {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 430ac5626f89d7..eaae4fbf711c8d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2798,6 +2798,17 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
           *this, E, Intrinsic::atan, Intrinsic::experimental_constrained_atan));
 
+    case Builtin::BIatan2:
+    case Builtin::BIatan2f:
+    case Builtin::BIatan2l:
+    case Builtin::BI__builtin_atan2:
+    case Builtin::BI__builtin_atan2f:
+    case Builtin::BI__builtin_atan2f16:
+    case Builtin::BI__builtin_atan2l:
+    case Builtin::BI__builtin_atan2f128:
+      return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::atan2, Intrinsic::experimental_constrained_atan2));
+
     case Builtin::BIceil:
     case Builtin::BIceilf:
     case Builtin::BIceill:
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index 48465df21cca19..bf107437fc63a3 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -45,10 +45,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   __builtin_atan2(f,f);    __builtin_atan2f(f,f) ;  __builtin_atan2l(f, f); __builtin_atan2f128(f,f);
 
-// NO__ERRNO: declare double @atan2(double noundef, double noundef) [[READNONE:#[0-9]+]]
-// NO__ERRNO: declare float @atan2f(float noundef, float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @atan2f128(fp128 noundef, fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.atan2.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
+// NO__ERRNO: declare float @llvm.atan2.f32(float, float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.atan2.f128(fp128, fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @atan2(double noundef, double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @atan2f(float noundef, float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
@@ -56,7 +56,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f);
 
-// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
+// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]]
 // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
 // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
 // NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
@@ -179,7 +179,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   __builtin_acosh(f);      __builtin_acoshf(f);     __builtin_acoshl(f);  __builtin_acoshf128(f);
 
-// NO__ERRNO: declare double @acosh(double noundef) [[READNONE]]
+// NO__ERRNO: declare double @acosh(double noundef) [[READNONE:#[0-9]+]]
 // NO__ERRNO: declare float @acoshf(float noundef) [[READNONE]]
 // NO__ERRNO: declare x86_fp80 @acoshl(x86_fp80 noundef) [[READNONE]]
 // NO__ERRNO: declare fp128 @acoshf128(fp128 noundef) [[READNONE]]
@@ -721,10 +721,10 @@ __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin
 // HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]]
 };
 
-// NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
 // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
 // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
 // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} }
+// NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
 
 // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
 // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
index aa77620b445356..68b9e75283c547 100644
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -57,6 +57,13 @@ __builtin_atan(f);        __builtin_atanf(f);       __builtin_atanl(f); __builti
 // CHECK: call x86_fp80 @llvm.experimental.constrained.atan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.atan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+__builtin_atan2(f,f);        __builtin_atan2f(f,f);       __builtin_atan2l(f,f); __builtin_atan2f128(f,f);
+
+// CHECK: call double @llvm.experimental.constrained.atan2.f64(double %{{.*}}, double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.atan2.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80 %{{.*}}, x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.atan2.f128(fp128 %{{.*}}, fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
   __builtin_ceil(f);       __builtin_ceilf(f);      __builtin_ceill(f); __builtin_ceilf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.ceil.f64(double %{{.*}}, metadata !"fpexcept.strict")
diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c
index b1637121127c5b..1e4b06e34aaf92 100644
--- a/clang/test/CodeGen/libcalls.c
+++ b/clang/test/CodeGen/libcalls.c
@@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) {
   double atan2_ = atan2(d, 2);
   long double atan2l_ = atan2l(ld, ld);
   float atan2f_ = atan2f(f, f);
-// CHECK-NO: declare double @atan2(double noundef, double noundef) [[NUW_RN:#[0-9]+]]
-// CHECK-NO: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW_RN]]
-// CHECK-NO: declare float @atan2f(float noundef, float noundef) [[NUW_RN]]
+// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]]
+// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]]
+// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]]
 // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]]
 // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]]
 // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]]
@@ -124,5 +124,4 @@ void test_builtins(double d, float f, long double ld) {
 }
 
 // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" }
-// CHECK-NO-DAG: attributes [[NUW_RN]] = { nounwind willreturn memory(none){{.*}} }
 // CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 2226212eca94ee..bcc61c8f046b43 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -23,19 +23,19 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   atan2(f,f);    atan2f(f,f) ;  atan2l(f, f);
 
-  // NO__ERRNO: declare double @atan2(double noundef, double noundef) [[READNONE:#[0-9]+]]
-  // NO__ERRNO: declare float @atan2f(float noundef, float noundef) [[READNONE]]
-  // NO__ERRNO: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[READNONE]]
+  // NO__ERRNO: declare double @llvm.atan2.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
+  // NO__ERRNO: declare float @llvm.atan2.f32(float, float) [[READNONE_INTRINSIC]]
+  // NO__ERRNO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
   // HAS_ERRNO: declare double @atan2(double noundef, double noundef) [[NOT_READNONE]]
   // HAS_ERRNO: declare float @atan2f(float noundef, float noundef) [[NOT_READNONE]]
   // HAS_ERRNO: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
-  // HAS_MAYTRAP: declare double @atan2(double noundef, double noundef) [[NOT_READNONE:#[0-9]+]]
-  // HAS_MAYTRAP: declare float @atan2f(float noundef, float noundef) [[NOT_READNONE]]
-  // HAS_MAYTRAP: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
+  // HAS_MAYTRAP: declare double @llvm.experimental.constrained.atan2.f64(
+  // HAS_MAYTRAP: declare float @llvm.experimental.constrained.atan2.f32(
+  // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.atan2.f80(
 
   copysign(f,f); copysignf(f,f);copysignl(f,f);
 
-  // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
+  // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]]
   // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
   // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
   // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
@@ -65,13 +65,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
   // HAS_ERRNO: declare double @frexp(double noundef, ptr noundef) [[NOT_READNONE]]
   // HAS_ERRNO: declare float @frexpf(float noundef, ptr noundef) [[NOT_READNONE]]
   // HAS_ERRNO: declare x86_fp80 @frexpl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]]
-  // HAS_MAYTRAP: declare double @frexp(double noundef, ptr noundef) [[NOT_READNONE]]
+  // HAS_MAYTRAP: declare double @frexp(double noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]]
   // HAS_MAYTRAP: declare float @frexpf(float noundef, ptr noundef) [[NOT_READNONE]]
   // HAS_MAYTRAP: declare x86_fp80 @frexpl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]]
 
   ldexp(f,f);    ldexpf(f,f);   ldexpl(f,f);
 
-  // NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE]]
+  // NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE:#[0-9]+]]
   // NO__ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[READNONE]]
   // NO__ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[READNONE]]
   // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
@@ -719,9 +719,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 // HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
 };
 
-// NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
 // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
 // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
+// NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
 // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} }
 
 // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
diff --git a/clang/test/CodeGenCXX/builtin-calling-conv.cpp b/clang/test/CodeGenCXX/builtin-calling-conv.cpp
index 7020d1e0a24144..6b1c308344e05f 100644
--- a/clang/test/CodeGenCXX/builtin-calling-conv.cpp
+++ b/clang/test/CodeGenCXX/builtin-calling-conv.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple x86_64-linux-pc -DREDECL -emit-llvm %s -o - | FileCheck %s -check-prefix LINUX
-// RUN: %clang_cc1 -triple spir-unknown-unknown -DREDECL -DSPIR -emit-llvm %s -o - | FileCheck %s -check-prefix SPIR
-// RUN: %clang_cc1 -triple x86_64-linux-pc -emit-llvm %s -o - | FileCheck %s -check-prefix LINUX
-// RUN: %clang_cc1 -triple spir-unknown-unknown -DSPIR -emit-llvm %s -o - | FileCheck %s -check-prefix SPIR
-// RUN: %clang_cc1 -triple i386-windows-pc -fdefault-calling-conv=stdcall -emit-llvm %s -o - | FileCheck %s -check-prefix WIN32
+// RUN: %clang_cc1 -triple x86_64-linux-pc -DREDECL -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix LINUX
+// RUN: %clang_cc1 -triple spir-unknown-unknown -DREDECL -DSPIR -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix SPIR
+// RUN: %clang_cc1 -triple x86_64-linux-pc -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix LINUX
+// RUN: %clang_cc1 -triple spir-unknown-unknown -DSPIR -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix SPIR
+// RUN: %clang_cc1 -triple i386-windows-pc -fdefault-calling-conv=stdcall -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix WIN32
 
 #ifdef REDECL
 namespace std {
diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl
index 8150bc1ac9e2d7..e8b62fe0830cdb 100644
--- a/clang/test/CodeGenOpenCL/builtins-f16.cl
+++ b/clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -15,6 +15,9 @@ void test_half_builtins(half h0, half h1, half h2, int i0) {
   // CHECK: call half @llvm.atan.f16(half %h0)
   res = __builtin_atanf16(h0);
 
+  // CHECK: call half @llvm.atan2.f16(half %h0, half %h1)
+  res = __builtin_atan2f16(h0, h1);
+
   // CHECK: call half @llvm.copysign.f16(half %h0, half %h1)
   res = __builtin_copysignf16(h0, h1);
 

>From e68456bb80e1be6d7ebbe2e8dc53225b05313556 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Tue, 15 Oct 2024 19:59:01 -0700
Subject: [PATCH 2/6] Use erff instead of atan2 for builtin calling convention
 check

Now that atan2 lowers to the llvm.atan2 intrinsic when math errno can be
ignored, use erff, which still lowers to a libm call, instead of adding
-fmath-errno to keep atan2 emitting a library call.

---
 .../test/CodeGenCXX/builtin-calling-conv.cpp  | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGenCXX/builtin-calling-conv.cpp b/clang/test/CodeGenCXX/builtin-calling-conv.cpp
index 6b1c308344e05f..92d698b43be0ab 100644
--- a/clang/test/CodeGenCXX/builtin-calling-conv.cpp
+++ b/clang/test/CodeGenCXX/builtin-calling-conv.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple x86_64-linux-pc -DREDECL -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix LINUX
-// RUN: %clang_cc1 -triple spir-unknown-unknown -DREDECL -DSPIR -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix SPIR
-// RUN: %clang_cc1 -triple x86_64-linux-pc -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix LINUX
-// RUN: %clang_cc1 -triple spir-unknown-unknown -DSPIR -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix SPIR
-// RUN: %clang_cc1 -triple i386-windows-pc -fdefault-calling-conv=stdcall -emit-llvm -fmath-errno %s -o - | FileCheck %s -check-prefix WIN32
+// RUN: %clang_cc1 -triple x86_64-linux-pc -DREDECL -emit-llvm %s -o - | FileCheck %s -check-prefix LINUX
+// RUN: %clang_cc1 -triple spir-unknown-unknown -DREDECL -DSPIR -emit-llvm %s -o - | FileCheck %s -check-prefix SPIR
+// RUN: %clang_cc1 -triple x86_64-linux-pc -emit-llvm %s -o - | FileCheck %s -check-prefix LINUX
+// RUN: %clang_cc1 -triple spir-unknown-unknown -DSPIR -emit-llvm %s -o - | FileCheck %s -check-prefix SPIR
+// RUN: %clang_cc1 -triple i386-windows-pc -fdefault-calling-conv=stdcall -emit-llvm %s -o - | FileCheck %s -check-prefix WIN32
 
 #ifdef REDECL
 namespace std {
@@ -13,7 +13,7 @@ using size_t = unsigned long;
 #endif // SPIR
 } // namespace std
 
-float __builtin_atan2f(float, float);
+float __builtin_erff(float);
 void *operator new(std::size_t);
 #endif // REDECL
 
@@ -22,32 +22,32 @@ void foo();
 void user() {
   int i;
   ::operator new(5);
-  (void)__builtin_atan2f(1.1, 2.2);
+  (void)__builtin_erff(1.1);
   foo();
 }
 
 // LINUX: define{{.*}} void @_Z4userv()
 // LINUX: call noalias noundef nonnull ptr @_Znwm
-// LINUX: call float @atan2f
+// LINUX: call float @erff
 // LINUX: call void @_Z3foov
 // LINUX: declare noundef nonnull ptr @_Znwm(i64 noundef)
-// LINUX: declare float @atan2f(float noundef, float noundef)
+// LINUX: declare float @erff(float noundef)
 // LINUX: declare void @_Z3foov()
 
 // SPIR: define{{.*}} spir_func void @_Z4userv()
 // SPIR: call spir_func noalias noundef nonnull ptr @_Znwj
-// SPIR: call spir_func float @atan2f
+// SPIR: call spir_func float @erff
 // SPIR: call spir_func void @_Z3foov
 // SPIR: declare spir_func noundef nonnull ptr @_Znwj(i32 noundef)
-// SPIR: declare spir_func float @atan2f(float noundef, float noundef)
+// SPIR: declare spir_func float @erff(float noundef)
 // SPIR: declare spir_func void @_Z3foov()
 
 // Note: Windows /G options should not change the platform default calling
 // convention of builtins.
 // WIN32: define dso_local x86_stdcallcc void @"?user@@YGXXZ"()
 // WIN32: call noalias noundef nonnull ptr @"??2@YAPAXI@Z"
-// WIN32: call float @atan2f
+// WIN32: call float @erff
 // WIN32: call x86_stdcallcc void @"?foo@@YGXXZ"
 // WIN32: declare dso_local noundef nonnull ptr @"??2@YAPAXI@Z"(
-// WIN32: declare dso_local float @atan2f(float noundef, float noundef)
+// WIN32: declare dso_local float @erff(float noundef)
 // WIN32: declare dso_local x86_stdcallcc void @"?foo@@YGXXZ"()

>From 7761f4d4a9048e3dbcafbe5c10783b05043e6aa9 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Thu, 24 Oct 2024 20:00:06 -0700
Subject: [PATCH 3/6] clang-format

---
 clang/lib/CodeGen/CGBuiltin.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index eaae4fbf711c8d..aefd3a069b2186 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2807,7 +2807,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     case Builtin::BI__builtin_atan2l:
     case Builtin::BI__builtin_atan2f128:
       return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(
-          *this, E, Intrinsic::atan2, Intrinsic::experimental_constrained_atan2));
+          *this, E, Intrinsic::atan2,
+          Intrinsic::experimental_constrained_atan2));
 
     case Builtin::BIceil:
     case Builtin::BIceilf:

>From 09aa896a55abdd4f773c28c640853891261e4674 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Mon, 11 Nov 2024 16:33:39 -0800
Subject: [PATCH 4/6] Update tests for constrained intrinsics on PowerPC, RISCV,
 SystemZ, ARM

---
 llvm/test/CodeGen/ARM/fp-intrinsics.ll        |  16 +
 .../CodeGen/PowerPC/ctrloop-constrained-fp.ll |  57 +++
 .../CodeGen/RISCV/double-intrinsics-strict.ll |  60 +++
 .../vector-constrained-fp-intrinsics.ll       | 391 ++++++++++++++++++
 4 files changed, 524 insertions(+)

diff --git a/llvm/test/CodeGen/ARM/fp-intrinsics.ll b/llvm/test/CodeGen/ARM/fp-intrinsics.ll
index ca2dc701bd1fb3..93b6a58a22b6ce 100644
--- a/llvm/test/CodeGen/ARM/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/fp-intrinsics.ll
@@ -146,6 +146,13 @@ define float @tan_f32(float %x) #0 {
   ret float %val
 }
 
+; CHECK-LABEL: atan2_f32:
+; CHECK: bl atan2f
+define float @atan2_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.atan2.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
 ; CHECK-LABEL: pow_f32:
 ; CHECK: bl powf
 define float @pow_f32(float %x, float %y) #0 {
@@ -610,6 +617,13 @@ define double @tan_f64(double %x) #0 {
   ret double %val
 }
 
+; CHECK-LABEL: atan2_f64:
+; CHECK: bl atan2
+define double @atan2_f64(double %x, double %y) #0 {
+  %val = call double @llvm.experimental.constrained.atan2.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
 ; CHECK-LABEL: pow_f64:
 ; CHECK: bl pow
 define double @pow_f64(double %x, double %y) #0 {
@@ -1038,6 +1052,7 @@ declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, meta
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.atan2.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata)
@@ -1072,6 +1087,7 @@ declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, me
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
index 402ecb763d5b33..7966f8c0a93ec8 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
@@ -132,6 +132,63 @@ for.body:
   br i1 %cond, label %exit, label %for.body
 }
 
+; Check constrained ops converted to call
+define void @testAtan2(ptr %cast1, ptr %cast2) strictfp {
+; CHECK-LABEL: testAtan2:
+; CHECK:       # %bb.0: # %root
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    .cfi_offset r28, -32
+; CHECK-NEXT:    .cfi_offset r29, -24
+; CHECK-NEXT:    .cfi_offset r30, -16
+; CHECK-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stdu 1, -64(1)
+; CHECK-NEXT:    addi 30, 3, -8
+; CHECK-NEXT:    addi 29, 4, -8
+; CHECK-NEXT:    li 28, 255
+; CHECK-NEXT:    std 0, 80(1)
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_1: # %for.body
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lfdu 2, 8(29)
+; CHECK-NEXT:    lfdu 1, 8(30)
+; CHECK-NEXT:    bl atan2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi 28, 28, -1
+; CHECK-NEXT:    stfd 1, 0(30)
+; CHECK-NEXT:    cmpldi 28, 0
+; CHECK-NEXT:    bc 12, 1, .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    addi 1, 1, 64
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+root:
+  br label %for.body
+
+exit:
+  ret void
+
+for.body:
+  %i = phi i64 [ 0, %root ], [ %next, %for.body ]
+  %idx1 = getelementptr inbounds double, ptr %cast1, i64 %i
+  %idx2 = getelementptr inbounds double, ptr %cast2, i64 %i
+  %val1 = load double, ptr %idx1
+  %val2 = load double, ptr %idx2
+  %tan = tail call nnan ninf nsz arcp double @llvm.experimental.constrained.atan2.f64(double %val1, double %val2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  store double %tan, ptr %idx1, align 8
+  %next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %next, 255
+  br i1 %cond, label %exit, label %for.body
+}
+
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll b/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll
index 7e5ea173e52295..3adc46143f9f20 100644
--- a/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll
@@ -1635,3 +1635,63 @@ define i64 @llround_f64(double %a) nounwind strictfp {
   %1 = call i64 @llvm.experimental.constrained.llround.i64.f64(double %a, metadata !"fpexcept.strict") strictfp
   ret i64 %1
 }
+
+declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
+
+define double @atan2_f64(double %a, double %b) nounwind strictfp {
+; RV32IFD-LABEL: atan2_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    call atan2
+; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+;
+; RV64IFD-LABEL: atan2_f64:
+; RV64IFD:       # %bb.0:
+; RV64IFD-NEXT:    addi sp, sp, -16
+; RV64IFD-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IFD-NEXT:    call atan2
+; RV64IFD-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IFD-NEXT:    addi sp, sp, 16
+; RV64IFD-NEXT:    ret
+;
+; RV32IZFINXZDINX-LABEL: atan2_f64:
+; RV32IZFINXZDINX:       # %bb.0:
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, -16
+; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    call atan2
+; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, 16
+; RV32IZFINXZDINX-NEXT:    ret
+;
+; RV64IZFINXZDINX-LABEL: atan2_f64:
+; RV64IZFINXZDINX:       # %bb.0:
+; RV64IZFINXZDINX-NEXT:    addi sp, sp, -16
+; RV64IZFINXZDINX-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZFINXZDINX-NEXT:    call atan2
+; RV64IZFINXZDINX-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZFINXZDINX-NEXT:    addi sp, sp, 16
+; RV64IZFINXZDINX-NEXT:    ret
+;
+; RV32I-LABEL: atan2_f64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    call atan2
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: atan2_f64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    call atan2
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+  %1 = call double @llvm.experimental.constrained.atan2.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
+  ret double %1
+}
diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
index 4a109ee96a3d3e..edf818ab95131c 100644
--- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -6539,6 +6539,392 @@ entry:
   ret <4 x double> %tan
 }
 
+define <1 x float> @constrained_vector_atan2_v1f32() #0 {
+; S390X-LABEL: constrained_vector_atan2_v1f32:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -160
+; S390X-NEXT:    .cfi_def_cfa_offset 320
+; S390X-NEXT:    larl %r1, .LCPI124_0
+; S390X-NEXT:    le %f0, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI124_1
+; S390X-NEXT:    le %f2, 0(%r1)
+; S390X-NEXT:    brasl %r14, atan2f@PLT
+; S390X-NEXT:    lmg %r14, %r15, 272(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_atan2_v1f32:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -160
+; SZ13-NEXT:    .cfi_def_cfa_offset 320
+; SZ13-NEXT:    larl %r1, .LCPI124_0
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI124_1
+; SZ13-NEXT:    lde %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2f@PLT
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vlr %v24, %v0
+; SZ13-NEXT:    lmg %r14, %r15, 272(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %atan2 = call <1 x float> @llvm.experimental.constrained.atan2.v1f32(
+                             <1 x float> <float 42.0>,
+                             <1 x float> <float 43.0>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <1 x float> %atan2
+}
+
+define <2 x double> @constrained_vector_atan2_v2f64() #0 {
+; S390X-LABEL: constrained_vector_atan2_v2f64:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -168
+; S390X-NEXT:    .cfi_def_cfa_offset 328
+; S390X-NEXT:    std %f8, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    larl %r1, .LCPI125_0
+; S390X-NEXT:    ld %f0, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI125_1
+; S390X-NEXT:    ld %f2, 0(%r1)
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    larl %r1, .LCPI125_2
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI125_3
+; S390X-NEXT:    ld %f2, 0(%r1)
+; S390X-NEXT:    ldr %f8, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    ldr %f2, %f8
+; S390X-NEXT:    ld %f8, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r14, %r15, 280(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_atan2_v2f64:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -176
+; SZ13-NEXT:    .cfi_def_cfa_offset 336
+; SZ13-NEXT:    larl %r1, .LCPI125_0
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI125_1
+; SZ13-NEXT:    ld %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    larl %r1, .LCPI125_2
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI125_3
+; SZ13-NEXT:    ld %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v24, %v0, %v1
+; SZ13-NEXT:    lmg %r14, %r15, 288(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %atan2 = call <2 x double> @llvm.experimental.constrained.atan2.v2f64(
+                             <2 x double> <double 42.0, double 42.1>,
+                             <2 x double> <double 43.0, double 43.1>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <2 x double> %atan2
+}
+
+define <3 x float> @constrained_vector_atan2_v3f32() #0 {
+; S390X-LABEL: constrained_vector_atan2_v3f32:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -176
+; S390X-NEXT:    .cfi_def_cfa_offset 336
+; S390X-NEXT:    std %f8, 168(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f9, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    .cfi_offset %f9, -176
+; S390X-NEXT:    larl %r1, .LCPI126_0
+; S390X-NEXT:    le %f0, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI126_1
+; S390X-NEXT:    le %f2, 0(%r1)
+; S390X-NEXT:    brasl %r14, atan2f@PLT
+; S390X-NEXT:    larl %r1, .LCPI126_2
+; S390X-NEXT:    le %f1, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI126_3
+; S390X-NEXT:    le %f2, 0(%r1)
+; S390X-NEXT:    ler %f8, %f0
+; S390X-NEXT:    ler %f0, %f1
+; S390X-NEXT:    brasl %r14, atan2f@PLT
+; S390X-NEXT:    larl %r1, .LCPI126_4
+; S390X-NEXT:    le %f1, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI126_5
+; S390X-NEXT:    le %f2, 0(%r1)
+; S390X-NEXT:    ler %f9, %f0
+; S390X-NEXT:    ler %f0, %f1
+; S390X-NEXT:    brasl %r14, atan2f@PLT
+; S390X-NEXT:    ler %f2, %f9
+; S390X-NEXT:    ler %f4, %f8
+; S390X-NEXT:    ld %f8, 168(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f9, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r14, %r15, 288(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_atan2_v3f32:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -192
+; SZ13-NEXT:    .cfi_def_cfa_offset 352
+; SZ13-NEXT:    larl %r1, .LCPI126_0
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI126_1
+; SZ13-NEXT:    lde %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2f@PLT
+; SZ13-NEXT:    larl %r1, .LCPI126_2
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vst %v0, 176(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI126_3
+; SZ13-NEXT:    lde %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2f@PLT
+; SZ13-NEXT:    larl %r1, .LCPI126_4
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI126_5
+; SZ13-NEXT:    lde %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2f@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vmrhf %v0, %v1, %v0
+; SZ13-NEXT:    vl %v1, 176(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vrepf %v1, %v1, 0
+; SZ13-NEXT:    vmrhg %v24, %v0, %v1
+; SZ13-NEXT:    lmg %r14, %r15, 304(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %atan2 = call <3 x float> @llvm.experimental.constrained.atan2.v3f32(
+                              <3 x float> <float 42.0, float 43.0, float 44.0>,
+                              <3 x float> <float 42.125, float 43.25, float 44.375>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict") #0
+  ret <3 x float> %atan2
+}
+
+define void @constrained_vector_atan2_v3f64(ptr %a, ptr %b) #0 {
+; S390X-LABEL: constrained_vector_atan2_v3f64:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r13, %r15, 104(%r15)
+; S390X-NEXT:    .cfi_offset %r13, -56
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -200
+; S390X-NEXT:    .cfi_def_cfa_offset 360
+; S390X-NEXT:    std %f8, 192(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f9, 184(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f10, 176(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f11, 168(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f12, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    .cfi_offset %f9, -176
+; S390X-NEXT:    .cfi_offset %f10, -184
+; S390X-NEXT:    .cfi_offset %f11, -192
+; S390X-NEXT:    .cfi_offset %f12, -200
+; S390X-NEXT:    lgr %r13, %r2
+; S390X-NEXT:    ld %f8, 0(%r2)
+; S390X-NEXT:    ld %f9, 8(%r2)
+; S390X-NEXT:    ld %f0, 16(%r2)
+; S390X-NEXT:    ld %f10, 0(%r3)
+; S390X-NEXT:    ld %f2, 16(%r3)
+; S390X-NEXT:    ld %f11, 8(%r3)
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    ldr %f12, %f0
+; S390X-NEXT:    ldr %f0, %f9
+; S390X-NEXT:    ldr %f2, %f11
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    ldr %f9, %f0
+; S390X-NEXT:    ldr %f0, %f8
+; S390X-NEXT:    ldr %f2, %f10
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    std %f0, 0(%r13)
+; S390X-NEXT:    std %f9, 8(%r13)
+; S390X-NEXT:    std %f12, 16(%r13)
+; S390X-NEXT:    ld %f8, 192(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f9, 184(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f10, 176(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f11, 168(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f12, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r13, %r15, 304(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_atan2_v3f64:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r13, %r15, 104(%r15)
+; SZ13-NEXT:    .cfi_offset %r13, -56
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -224
+; SZ13-NEXT:    .cfi_def_cfa_offset 384
+; SZ13-NEXT:    std %f8, 216(%r15) # 8-byte Folded Spill
+; SZ13-NEXT:    std %f9, 208(%r15) # 8-byte Folded Spill
+; SZ13-NEXT:    .cfi_offset %f8, -168
+; SZ13-NEXT:    .cfi_offset %f9, -176
+; SZ13-NEXT:    vl %v0, 0(%r2), 4
+; SZ13-NEXT:    vl %v2, 0(%r3), 4
+; SZ13-NEXT:    ld %f8, 16(%r2)
+; SZ13-NEXT:    ld %f9, 16(%r3)
+; SZ13-NEXT:    lgr %r13, %r2
+; SZ13-NEXT:    vst %v0, 176(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    vst %v2, 192(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    # kill: def $f0d killed $f0d killed $v0
+; SZ13-NEXT:    # kill: def $f2d killed $f2d killed $v2
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    vl %v0, 176(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vl %v1, 192(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vrepg %v0, %v0, 1
+; SZ13-NEXT:    vrepg %v2, %v1, 1
+; SZ13-NEXT:    # kill: def $f0d killed $f0d killed $v0
+; SZ13-NEXT:    # kill: def $f2d killed $f2d killed $v2
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v0, %v1, %v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ldr %f0, %f8
+; SZ13-NEXT:    ldr %f2, %f9
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    std %f0, 16(%r13)
+; SZ13-NEXT:    vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    ld %f8, 216(%r15) # 8-byte Folded Reload
+; SZ13-NEXT:    ld %f9, 208(%r15) # 8-byte Folded Reload
+; SZ13-NEXT:    vst %v0, 0(%r13), 4
+; SZ13-NEXT:    lmg %r13, %r15, 328(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %c = load <3 x double>, ptr %a
+  %d = load <3 x double>, ptr %b
+  %atan2 = call <3 x double> @llvm.experimental.constrained.atan2.v3f64(
+                          <3 x double> %c,
+                          <3 x double> %d,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict") #0
+  store <3 x double> %atan2, ptr %a
+  ret void
+}
+
+define <4 x double> @constrained_vector_atan2_v4f64() #0 {
+; S390X-LABEL: constrained_vector_atan2_v4f64:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -184
+; S390X-NEXT:    .cfi_def_cfa_offset 344
+; S390X-NEXT:    std %f8, 176(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f9, 168(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f10, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    .cfi_offset %f9, -176
+; S390X-NEXT:    .cfi_offset %f10, -184
+; S390X-NEXT:    larl %r1, .LCPI128_0
+; S390X-NEXT:    ld %f0, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI128_1
+; S390X-NEXT:    ld %f2, 0(%r1)
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    larl %r1, .LCPI128_2
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI128_3
+; S390X-NEXT:    ld %f2, 0(%r1)
+; S390X-NEXT:    ldr %f8, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    larl %r1, .LCPI128_4
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI128_5
+; S390X-NEXT:    ld %f2, 0(%r1)
+; S390X-NEXT:    ldr %f9, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    larl %r1, .LCPI128_6
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    larl %r1, .LCPI128_7
+; S390X-NEXT:    ld %f2, 0(%r1)
+; S390X-NEXT:    ldr %f10, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, atan2@PLT
+; S390X-NEXT:    ldr %f2, %f10
+; S390X-NEXT:    ldr %f4, %f9
+; S390X-NEXT:    ldr %f6, %f8
+; S390X-NEXT:    ld %f8, 176(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f9, 168(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f10, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r14, %r15, 296(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_atan2_v4f64:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -192
+; SZ13-NEXT:    .cfi_def_cfa_offset 352
+; SZ13-NEXT:    larl %r1, .LCPI128_0
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI128_1
+; SZ13-NEXT:    ld %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    larl %r1, .LCPI128_2
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI128_3
+; SZ13-NEXT:    ld %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v0, %v0, %v1
+; SZ13-NEXT:    larl %r1, .LCPI128_4
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI128_5
+; SZ13-NEXT:    ld %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    larl %r1, .LCPI128_6
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 176(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    larl %r1, .LCPI128_7
+; SZ13-NEXT:    ld %f2, 0(%r1)
+; SZ13-NEXT:    brasl %r14, atan2@PLT
+; SZ13-NEXT:    vl %v1, 176(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vl %v24, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v26, %v0, %v1
+; SZ13-NEXT:    lmg %r14, %r15, 304(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %atan2 = call <4 x double> @llvm.experimental.constrained.atan2.v4f64(
+                             <4 x double> <double 42.0, double 42.1,
+                                           double 42.2, double 42.3>,
+                             <4 x double> <double 43.0, double 43.1,
+                                           double 43.2, double 43.3>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <4 x double> %atan2
+}
+
 attributes #0 = { strictfp }
 
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
@@ -6552,6 +6938,7 @@ declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32
 declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.atan2.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata)
@@ -6579,6 +6966,7 @@ declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32,
 declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.atan2.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata)
@@ -6617,6 +7005,8 @@ declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metada
 declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.atan2.v3f32(<3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.atan2.v3f64(<3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata)
@@ -6657,6 +7047,7 @@ declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32
 declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.atan2.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata)

>From cf261ca0881bfe833807c7b9c6788dded1c37150 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Mon, 11 Nov 2024 18:14:16 -0800
Subject: [PATCH 5/6] Update more tests

---
 .../ppcf128-constrained-fp-intrinsics.ll      |  46 ++
 .../vector-constrained-fp-intrinsics.ll       | 419 ++++++++++++++++++
 .../CodeGen/RISCV/float-intrinsics-strict.ll  |  60 +++
 3 files changed, 525 insertions(+)

diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
index 61a0fddeda33e1..c1ee436a40c557 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
@@ -2110,6 +2110,51 @@ entry:
   ret ppc_fp128 %tan
 }
 
+define ppc_fp128 @test_atan2_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
+; PC64LE-LABEL: test_atan2_ppc_fp128:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -32(1)
+; PC64LE-NEXT:    std 0, 48(1)
+; PC64LE-NEXT:    bl atan2l
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    addi 1, 1, 32
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: test_atan2_ppc_fp128:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -32(1)
+; PC64LE9-NEXT:    std 0, 48(1)
+; PC64LE9-NEXT:    bl atan2l
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    addi 1, 1, 32
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+;
+; PC64-LABEL: test_atan2_ppc_fp128:
+; PC64:       # %bb.0: # %entry
+; PC64-NEXT:    mflr 0
+; PC64-NEXT:    stdu 1, -112(1)
+; PC64-NEXT:    std 0, 128(1)
+; PC64-NEXT:    bl atan2l
+; PC64-NEXT:    nop
+; PC64-NEXT:    addi 1, 1, 112
+; PC64-NEXT:    ld 0, 16(1)
+; PC64-NEXT:    mtlr 0
+; PC64-NEXT:    blr
+entry:
+  %atan2 = call ppc_fp128 @llvm.experimental.constrained.atan2.ppcf128(
+                    ppc_fp128 %first,
+                    ppc_fp128 %second,
+                    metadata !"round.dynamic",
+                    metadata !"fpexcept.strict") #1
+  ret ppc_fp128 %atan2
+}
+
 attributes #0 = { nounwind strictfp }
 attributes #1 = { strictfp }
 
@@ -2141,6 +2186,7 @@ declare ppc_fp128 @llvm.experimental.constrained.sin.ppcf128(ppc_fp128, metadata
 declare ppc_fp128 @llvm.experimental.constrained.sqrt.ppcf128(ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.fsub.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.tan.ppcf128(ppc_fp128, metadata, metadata)
+declare ppc_fp128 @llvm.experimental.constrained.atan2.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.trunc.ppcf128(ppc_fp128, metadata)
 declare i64 @llvm.experimental.constrained.fptosi.i64.ppcf128(ppc_fp128, metadata)
 declare i32 @llvm.experimental.constrained.fptosi.i32.ppcf128(ppc_fp128, metadata)
diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
index aedb1a9c65cf89..71c3069a406fe3 100644
--- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
@@ -8333,6 +8333,420 @@ entry:
   ret <4 x double> %tan
 }
 
+define <1 x float> @constrained_vector_atan2_v1f32(<1 x float> %x, <1 x float> %y) #0 {
+; PC64LE-LABEL: constrained_vector_atan2_v1f32:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -32(1)
+; PC64LE-NEXT:    std 0, 48(1)
+; PC64LE-NEXT:    bl atan2f
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    addi 1, 1, 32
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_atan2_v1f32:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -32(1)
+; PC64LE9-NEXT:    std 0, 48(1)
+; PC64LE9-NEXT:    bl atan2f
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    addi 1, 1, 32
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %atan2 = call <1 x float> @llvm.experimental.constrained.atan2.v1f32(
+                             <1 x float> %x,
+                             <1 x float> %y,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #1
+  ret <1 x float> %atan2
+}
+
+define <2 x double> @constrained_vector_atan2_v2f64(<2 x double> %x, <2 x double> %y) #0 {
+; PC64LE-LABEL: constrained_vector_atan2_v2f64:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -96(1)
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 112(1)
+; PC64LE-NEXT:    stxvd2x 61, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    stxvd2x 62, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 80
+; PC64LE-NEXT:    vmr 30, 2
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 31, 3
+; PC64LE-NEXT:    xxlor 1, 62, 62
+; PC64LE-NEXT:    xxlor 2, 63, 63
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 61, 1, 1
+; PC64LE-NEXT:    xxswapd 1, 62
+; PC64LE-NEXT:    xxswapd 2, 63
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    li 3, 80
+; PC64LE-NEXT:    xxmrghd 34, 61, 1
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    lxvd2x 61, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    addi 1, 1, 96
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_atan2_v2f64:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -80(1)
+; PC64LE9-NEXT:    std 0, 96(1)
+; PC64LE9-NEXT:    stxv 62, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 64(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    vmr 31, 3
+; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
+; PC64LE9-NEXT:    vmr 30, 2
+; PC64LE9-NEXT:    xscpsgndp 1, 62, 62
+; PC64LE9-NEXT:    stxv 61, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
+; PC64LE9-NEXT:    xxswapd 1, 62
+; PC64LE9-NEXT:    xxswapd 2, 63
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxmrghd 34, 61, 1
+; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 61, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    addi 1, 1, 80
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %atan2 = call <2 x double> @llvm.experimental.constrained.atan2.v2f64(
+                             <2 x double> %x,
+                             <2 x double> %y,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #1
+  ret <2 x double> %atan2
+}
+
+define <3 x float> @constrained_vector_atan2_v3f32(<3 x float> %x, <3 x float> %y) #0 {
+; PC64LE-LABEL: constrained_vector_atan2_v3f32:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -96(1)
+; PC64LE-NEXT:    xxsldwi 0, 34, 34, 1
+; PC64LE-NEXT:    xxsldwi 2, 35, 35, 1
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 112(1)
+; PC64LE-NEXT:    stfd 30, 80(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    stfd 31, 88(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    xscvspdpn 1, 0
+; PC64LE-NEXT:    xscvspdpn 2, 2
+; PC64LE-NEXT:    stxvd2x 62, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    vmr 30, 2
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 31, 3
+; PC64LE-NEXT:    bl atan2f
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxswapd 0, 62
+; PC64LE-NEXT:    xxswapd 2, 63
+; PC64LE-NEXT:    fmr 31, 1
+; PC64LE-NEXT:    xscvspdpn 1, 0
+; PC64LE-NEXT:    xscvspdpn 2, 2
+; PC64LE-NEXT:    bl atan2f
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxsldwi 0, 62, 62, 3
+; PC64LE-NEXT:    xxsldwi 2, 63, 63, 3
+; PC64LE-NEXT:    fmr 30, 1
+; PC64LE-NEXT:    xscvspdpn 1, 0
+; PC64LE-NEXT:    xscvspdpn 2, 2
+; PC64LE-NEXT:    bl atan2f
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xscvdpspn 0, 1
+; PC64LE-NEXT:    xscvdpspn 1, 30
+; PC64LE-NEXT:    addis 3, 2, .LCPI194_0@toc@ha
+; PC64LE-NEXT:    lfd 30, 80(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    xscvdpspn 36, 31
+; PC64LE-NEXT:    lfd 31, 88(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    addi 3, 3, .LCPI194_0@toc@l
+; PC64LE-NEXT:    xxmrghw 34, 1, 0
+; PC64LE-NEXT:    lxvd2x 0, 0, 3
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    xxswapd 35, 0
+; PC64LE-NEXT:    vperm 2, 4, 2, 3
+; PC64LE-NEXT:    addi 1, 1, 96
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_atan2_v3f32:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -80(1)
+; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 1
+; PC64LE9-NEXT:    std 0, 96(1)
+; PC64LE9-NEXT:    stfd 30, 64(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stxv 62, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    xscvspdpn 1, 0
+; PC64LE9-NEXT:    xxsldwi 0, 35, 35, 1
+; PC64LE9-NEXT:    vmr 31, 3
+; PC64LE9-NEXT:    vmr 30, 2
+; PC64LE9-NEXT:    xscvspdpn 2, 0
+; PC64LE9-NEXT:    bl atan2f
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxswapd 0, 62
+; PC64LE9-NEXT:    fmr 31, 1
+; PC64LE9-NEXT:    xscvspdpn 1, 0
+; PC64LE9-NEXT:    xxswapd 0, 63
+; PC64LE9-NEXT:    xscvspdpn 2, 0
+; PC64LE9-NEXT:    bl atan2f
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxsldwi 0, 62, 62, 3
+; PC64LE9-NEXT:    fmr 30, 1
+; PC64LE9-NEXT:    xscvspdpn 1, 0
+; PC64LE9-NEXT:    xxsldwi 0, 63, 63, 3
+; PC64LE9-NEXT:    xscvspdpn 2, 0
+; PC64LE9-NEXT:    bl atan2f
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscvdpspn 0, 1
+; PC64LE9-NEXT:    xscvdpspn 1, 30
+; PC64LE9-NEXT:    addis 3, 2, .LCPI194_0@toc@ha
+; PC64LE9-NEXT:    xscvdpspn 34, 31
+; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    addi 3, 3, .LCPI194_0@toc@l
+; PC64LE9-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    lxv 0, 0(3)
+; PC64LE9-NEXT:    xxperm 34, 35, 0
+; PC64LE9-NEXT:    addi 1, 1, 80
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %atan2 = call <3 x float> @llvm.experimental.constrained.atan2.v3f32(
+                              <3 x float> %x,
+                              <3 x float> %y,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict") #1
+  ret <3 x float> %atan2
+}
+
+define <3 x double> @constrained_vector_atan2_v3f64(<3 x double> %x, <3 x double> %y) #0 {
+; PC64LE-LABEL: constrained_vector_atan2_v3f64:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -96(1)
+; PC64LE-NEXT:    std 0, 112(1)
+; PC64LE-NEXT:    stfd 28, 64(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    fmr 28, 2
+; PC64LE-NEXT:    fmr 2, 4
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    stfd 29, 72(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    stfd 30, 80(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    fmr 30, 5
+; PC64LE-NEXT:    stfd 31, 88(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    fmr 31, 6
+; PC64LE-NEXT:    fmr 29, 3
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 63, 1, 1
+; PC64LE-NEXT:    fmr 1, 28
+; PC64LE-NEXT:    fmr 2, 30
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxmrghd 63, 1, 63
+; PC64LE-NEXT:    fmr 1, 29
+; PC64LE-NEXT:    fmr 2, 31
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    fmr 3, 1
+; PC64LE-NEXT:    xxswapd 1, 63
+; PC64LE-NEXT:    lfd 31, 88(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    xxlor 2, 63, 63
+; PC64LE-NEXT:    lfd 30, 80(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    lfd 29, 72(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    lfd 28, 64(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    addi 1, 1, 96
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_atan2_v3f64:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -80(1)
+; PC64LE9-NEXT:    std 0, 96(1)
+; PC64LE9-NEXT:    stfd 28, 48(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    fmr 28, 2
+; PC64LE9-NEXT:    fmr 2, 4
+; PC64LE9-NEXT:    stfd 29, 56(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stfd 30, 64(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    fmr 31, 6
+; PC64LE9-NEXT:    fmr 30, 5
+; PC64LE9-NEXT:    fmr 29, 3
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 63, 1, 1
+; PC64LE9-NEXT:    fmr 1, 28
+; PC64LE9-NEXT:    fmr 2, 30
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxmrghd 63, 1, 63
+; PC64LE9-NEXT:    fmr 1, 29
+; PC64LE9-NEXT:    fmr 2, 31
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    fmr 3, 1
+; PC64LE9-NEXT:    xxswapd 1, 63
+; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
+; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    lfd 29, 56(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    lfd 28, 48(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    addi 1, 1, 80
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %atan2 = call <3 x double> @llvm.experimental.constrained.atan2.v3f64(
+                          <3 x double> %x,
+                          <3 x double> %y,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict") #1
+  ret <3 x double> %atan2
+}
+
+define <4 x double> @constrained_vector_atan2_v4f64(<4 x double> %x, <4 x double> %y) #0 {
+; PC64LE-LABEL: constrained_vector_atan2_v4f64:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -128(1)
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 144(1)
+; PC64LE-NEXT:    stxvd2x 59, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    stxvd2x 60, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 80
+; PC64LE-NEXT:    vmr 28, 2
+; PC64LE-NEXT:    stxvd2x 61, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 96
+; PC64LE-NEXT:    xxlor 1, 60, 60
+; PC64LE-NEXT:    vmr 29, 3
+; PC64LE-NEXT:    stxvd2x 62, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 30, 4
+; PC64LE-NEXT:    li 3, 112
+; PC64LE-NEXT:    xxlor 2, 62, 62
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 31, 5
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 59, 1, 1
+; PC64LE-NEXT:    xxswapd 1, 60
+; PC64LE-NEXT:    xxswapd 2, 62
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxmrghd 62, 59, 1
+; PC64LE-NEXT:    xxlor 1, 61, 61
+; PC64LE-NEXT:    xxlor 2, 63, 63
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 60, 1, 1
+; PC64LE-NEXT:    xxswapd 1, 61
+; PC64LE-NEXT:    xxswapd 2, 63
+; PC64LE-NEXT:    bl atan2
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    li 3, 112
+; PC64LE-NEXT:    vmr 2, 30
+; PC64LE-NEXT:    xxmrghd 35, 60, 1
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 96
+; PC64LE-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 80
+; PC64LE-NEXT:    lxvd2x 61, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    lxvd2x 60, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    lxvd2x 59, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    addi 1, 1, 128
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_atan2_v4f64:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -112(1)
+; PC64LE9-NEXT:    std 0, 128(1)
+; PC64LE9-NEXT:    stxv 60, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stxv 62, 80(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    vmr 30, 4
+; PC64LE9-NEXT:    xscpsgndp 2, 62, 62
+; PC64LE9-NEXT:    vmr 28, 2
+; PC64LE9-NEXT:    xscpsgndp 1, 60, 60
+; PC64LE9-NEXT:    stxv 59, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stxv 61, 64(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 96(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    vmr 31, 5
+; PC64LE9-NEXT:    vmr 29, 3
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 59, 1, 1
+; PC64LE9-NEXT:    xxswapd 1, 60
+; PC64LE9-NEXT:    xxswapd 2, 62
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxmrghd 62, 59, 1
+; PC64LE9-NEXT:    xscpsgndp 1, 61, 61
+; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 60, 1, 1
+; PC64LE9-NEXT:    xxswapd 1, 61
+; PC64LE9-NEXT:    xxswapd 2, 63
+; PC64LE9-NEXT:    bl atan2
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxmrghd 35, 60, 1
+; PC64LE9-NEXT:    vmr 2, 30
+; PC64LE9-NEXT:    lxv 63, 96(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 62, 80(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 61, 64(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 60, 48(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 59, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    addi 1, 1, 112
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %atan2 = call <4 x double> @llvm.experimental.constrained.atan2.v4f64(
+                             <4 x double> %x,
+                             <4 x double> %y,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #1
+  ret <4 x double> %atan2
+}
+
 attributes #0 = { nounwind strictfp noimplicitfloat }
 attributes #1 = { strictfp }
 
@@ -8348,6 +8762,7 @@ declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32
 declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.atan2.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata)
@@ -8394,6 +8809,7 @@ declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32,
 declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.atan2.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata)
@@ -8449,6 +8865,8 @@ declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metada
 declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.atan2.v3f32(<3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.atan2.v3f64(<3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata)
@@ -8506,6 +8924,7 @@ declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32
 declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.atan2.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata)
diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll b/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll
index 7b2d38fefaacb1..f04da712dce311 100644
--- a/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll
@@ -414,6 +414,66 @@ define float @tan_f32(float %a) nounwind strictfp {
   ret float %1
 }
 
+declare float @llvm.experimental.constrained.atan2.f32(float, float, metadata, metadata)
+
+define float @atan2_f32(float %a, float %b) nounwind strictfp {
+; RV32IF-LABEL: atan2_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    call atan2f
+; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+;
+; RV64IF-LABEL: atan2_f32:
+; RV64IF:       # %bb.0:
+; RV64IF-NEXT:    addi sp, sp, -16
+; RV64IF-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IF-NEXT:    call atan2f
+; RV64IF-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IF-NEXT:    addi sp, sp, 16
+; RV64IF-NEXT:    ret
+;
+; RV32IZFINX-LABEL: atan2_f32:
+; RV32IZFINX:       # %bb.0:
+; RV32IZFINX-NEXT:    addi sp, sp, -16
+; RV32IZFINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFINX-NEXT:    call atan2f
+; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFINX-NEXT:    addi sp, sp, 16
+; RV32IZFINX-NEXT:    ret
+;
+; RV64IZFINX-LABEL: atan2_f32:
+; RV64IZFINX:       # %bb.0:
+; RV64IZFINX-NEXT:    addi sp, sp, -16
+; RV64IZFINX-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZFINX-NEXT:    call atan2f
+; RV64IZFINX-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZFINX-NEXT:    addi sp, sp, 16
+; RV64IZFINX-NEXT:    ret
+;
+; RV32I-LABEL: atan2_f32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    call atan2f
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: atan2_f32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    call atan2f
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+  %1 = call float @llvm.experimental.constrained.atan2.f32(float %a, float %b, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
+  ret float %1
+}
+
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 
 define float @pow_f32(float %a, float %b) nounwind strictfp {

>From 5a40aebdf6e1183c109b4b9fdbee2daf7863b4bf Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Mon, 11 Nov 2024 18:20:14 -0800
Subject: [PATCH 6/6] LangRef.rst: Add missing
 llvm.experimental.constrained.atan2 and revise llvm.atan2 definition.

---
 llvm/docs/LangRef.rst | 49 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ef38c5ab33b926..2146d99a6f812c 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15748,16 +15748,17 @@ all types however.
 
 ::
 
-      declare float     @llvm.atan2.f32(float  %X, float %Y)
-      declare double    @llvm.atan2.f64(double %X, double %Y)
-      declare x86_fp80  @llvm.atan2.f80(x86_fp80  %X, x86_fp80 %Y)
-      declare fp128     @llvm.atan2.f128(fp128 %X, fp128 %Y)
-      declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128  %X, ppc_fp128 %Y)
+      declare float     @llvm.atan2.f32(float  %Y, float %X)
+      declare double    @llvm.atan2.f64(double %Y, double %X)
+      declare x86_fp80  @llvm.atan2.f80(x86_fp80  %Y, x86_fp80 %X)
+      declare fp128     @llvm.atan2.f128(fp128 %Y, fp128 %X)
+      declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128  %Y, ppc_fp128 %X)
 
 Overview:
 """""""""
 
-The '``llvm.atan2.*``' intrinsics return the arctangent of the operand.
+The '``llvm.atan2.*``' intrinsics return the arctangent of ``Y/X`` accounting
+for the quadrant.
 
 Arguments:
 """"""""""
@@ -27259,6 +27260,42 @@ This function returns the arctangent of the specified operand, returning the
 same values as the libm ``atan`` functions would, and handles error
 conditions in the same way.
 
+'``llvm.experimental.constrained.atan2``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.atan2(<type> <op1>,
+                                           <type> <op2>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.atan2``' intrinsic returns the arctangent
+of ``<op1>`` divided by ``<op2>`` accounting for the quadrant.
+
+Arguments:
+""""""""""
+
+The first two arguments and the return value are floating-point numbers of the
+same type.
+
+The third and fourth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the quadrant-specific arctangent using the specified
+operands, returning the same values as the libm ``atan2`` functions would, and
+handles error conditions in the same way.
+
 '``llvm.experimental.constrained.sinh``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

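For readers skimming the patch, here is a minimal self-contained IR sketch (not part of the patch or its tests) of how the constrained intrinsic documented in the LangRef hunk above is called from a strictfp function. The function name and operand names are hypothetical; the operand order follows the revised llvm.atan2 wording, with Y first and X second:

declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)

; Hypothetical illustration, not taken from the tests in this patch.
define double @example_atan2(double %y, double %x) strictfp {
entry:
  ; Dynamic rounding mode and strict exception behavior, as in the tests above.
  %r = call double @llvm.experimental.constrained.atan2.f64(
                      double %y, double %x,
                      metadata !"round.dynamic",
                      metadata !"fpexcept.strict") strictfp
  ret double %r
}

On targets without a native lowering this is expected to lower to a libm atan2 (or atan2f) call, which is what the PowerPC and RISC-V tests above check.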

