[llvm] [DirectX] Add atan2 intrinsic and expand for DXIL backend (p1) (PR #108865)

Mon Sep 23 15:19:57 PDT 2024

https://github.com/tex3d updated https://github.com/llvm/llvm-project/pull/108865

>From d6cb2067b00cfcd7a976bc9cbfc4c57e27bb539b Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Fri, 13 Sep 2024 17:56:32 -0700
Subject: [PATCH 1/4] [DirectX] Add atan2 intrinsic and expand for DXIL backend

This change is part of this proposal: https://discourse.llvm.org/t/rfc-all-the-math-intrinsics/78294

This preliminary work adds the intrinsic to LLVM and expands it using the atan intrinsic for the DXIL backend, since DXIL has no atan2 op.
---
 llvm/docs/LangRef.rst                         | 37 +++++++++++++
 llvm/include/llvm/IR/Intrinsics.td            |  1 +
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 46 ++++++++++++++++
 llvm/test/CodeGen/DirectX/atan2.ll            | 52 +++++++++++++++++++
 llvm/test/CodeGen/DirectX/atan2_error.ll      | 11 ++++
 5 files changed, 147 insertions(+)
 create mode 100644 llvm/test/CodeGen/DirectX/atan2.ll
 create mode 100644 llvm/test/CodeGen/DirectX/atan2_error.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 91c3e60bb0acb1..41d1efab752fd7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15583,6 +15583,43 @@ trapping or setting ``errno``.
 When specified with the fast-math-flag 'afn', the result may be approximated
 using a less accurate calculation.
 
+'``llvm.atan2.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.atan2`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.atan2.f32(float  %Y, float %X)
+      declare double    @llvm.atan2.f64(double %Y, double %X)
+      declare x86_fp80  @llvm.atan2.f80(x86_fp80  %Y, x86_fp80 %X)
+      declare fp128     @llvm.atan2.f128(fp128 %Y, fp128 %X)
+      declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128  %Y, ppc_fp128 %X)
+
+Overview:
+"""""""""
+
+The '``llvm.atan2.*``' intrinsics return the arctangent of ``Y/X`` accounting for the quadrant.
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating-point numbers of the same type.
+
+Semantics:
+""""""""""
+
+Return the same value as a corresponding libm '``atan2``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
+
 '``llvm.sinh.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 0a74a217a5f010..48d57907e6d0bc 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1016,6 +1016,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
   def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_sin  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_cos  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_tan  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index dd73b895b14d37..19ae7023ec0a90 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -36,6 +36,7 @@ using namespace llvm;
 static bool isIntrinsicExpansion(Function &F) {
   switch (F.getIntrinsicID()) {
   case Intrinsic::abs:
+  case Intrinsic::atan2:
   case Intrinsic::exp:
   case Intrinsic::log:
   case Intrinsic::log10:
@@ -307,6 +308,48 @@ static Value *expandNormalizeIntrinsic(CallInst *Orig) {
   return Builder.CreateFMul(X, MultiplicandVec);
 }
 
+static Value *expandAtan2Intrinsic(CallInst *Orig) {
+  Value *Y = Orig->getOperand(0);
+  Value *X = Orig->getOperand(1);
+  Type *Ty = X->getType();
+  IRBuilder<> Builder(Orig);
+
+  Value *Tan = Builder.CreateFDiv(Y, X);
+
+  Value *Atan =
+      Builder.CreateIntrinsic(Ty, Intrinsic::atan, {Tan}, nullptr, "Elt.Atan");
+
+  Constant *Pi = ConstantFP::get(Ty, llvm::numbers::pi);
+  Constant *HalfPi = ConstantFP::get(Ty, llvm::numbers::pi / 2);
+  Constant *NegHalfPi = ConstantFP::get(Ty, -llvm::numbers::pi / 2);
+  Constant *Zero = ConstantFP::get(Ty, 0);
+
+  Value *AtanAddPi = Builder.CreateFAdd(Atan, Pi);
+  Value *AtanSubPi = Builder.CreateFSub(Atan, Pi);
+
+  Value *Result = Atan;
+
+  Value *XLt0 = Builder.CreateFCmpOLT(X, Zero);
+  Value *XEq0 = Builder.CreateFCmpOEQ(X, Zero);
+
+  Value *YGe0 = Builder.CreateFCmpOGE(Y, Zero);
+  Value *YLt0 = Builder.CreateFCmpOLT(Y, Zero);
+
+  Value *XLt0AndYGe0 = Builder.CreateAnd(XLt0, YGe0);
+  Result = Builder.CreateSelect(XLt0AndYGe0, AtanAddPi, Result);
+
+  Value *XLt0AndYLt0 = Builder.CreateAnd(XLt0, YLt0);
+  Result = Builder.CreateSelect(XLt0AndYLt0, AtanSubPi, Result);
+
+  Value *XEq0AndYLt0 = Builder.CreateAnd(XEq0, YLt0);
+  Result = Builder.CreateSelect(XEq0AndYLt0, NegHalfPi, Result);
+
+  Value *XEq0AndYGe0 = Builder.CreateAnd(XEq0, YGe0);
+  Result = Builder.CreateSelect(XEq0AndYGe0, HalfPi, Result);
+
+  return Result;
+}
+
 static Value *expandPowIntrinsic(CallInst *Orig) {
 
   Value *X = Orig->getOperand(0);
@@ -418,6 +461,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
   case Intrinsic::abs:
     Result = expandAbs(Orig);
     break;
+  case Intrinsic::atan2:
+    Result = expandAtan2Intrinsic(Orig);
+    break;
   case Intrinsic::exp:
     Result = expandExpIntrinsic(Orig);
     break;
diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll
new file mode 100644
index 00000000000000..32b9e60661cc07
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/atan2.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+; Make sure correct dxil expansions for atan2 are generated for float and half.
+
+define noundef float @atan2_float(float noundef %y, float noundef %x) {
+entry:
+; CHECK: [[DIV:%.+]] = fdiv float %y, %x
+; CHECK: [[TAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]])
+; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[TAN]], 0x400921FB60000000
+; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[TAN]], 0x400921FB60000000
+; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00
+; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 
+; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 
+; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00
+; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[TAN]]
+; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]]
+; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]]
+; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]]
+; CHECK: ret float [[SELECT_HPI]]
+  %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x)
+  ret float %elt.atan2
+}
+
+define noundef half @atan2_half(half noundef %y, half noundef %x) {
+entry:
+; CHECK: [[DIV:%.+]] = fdiv half %y, %x
+; CHECK: [[TAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]])
+; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[TAN]], 0xH4248
+; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[TAN]], 0xH4248
+; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000
+; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 
+; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 
+; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000
+; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[TAN]]
+; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]]
+; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]]
+; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]]
+; CHECK: ret half [[SELECT_HPI]]
+  %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x)
+  ret half %elt.atan2
+}
+
+declare half @llvm.atan2.f16(half, half)
+declare float @llvm.atan2.f32(float, float)
diff --git a/llvm/test/CodeGen/DirectX/atan2_error.ll b/llvm/test/CodeGen/DirectX/atan2_error.ll
new file mode 100644
index 00000000000000..5b3077f85f5d4e
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/atan2_error.ll
@@ -0,0 +1,11 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; DXIL operation atan does not support double overload type
+; CHECK: in function atan2_double
+; CHECK-SAME: Cannot create ATan operation: Invalid overload type
+
+define noundef double @atan2_double(double noundef %a, double noundef %b) #0 {
+entry:
+  %1 = call double @llvm.atan2.f64(double %a, double %b)
+  ret double %1
+}

>From 678fa4636f586f5d99bfd1aeddb088d061d7246b Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Tue, 17 Sep 2024 14:17:05 -0700
Subject: [PATCH 2/4] Address feedback for DXILIntrinsicExpansion.cpp

---
 llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 19ae7023ec0a90..926cbe97f24fda 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -313,37 +313,43 @@ static Value *expandAtan2Intrinsic(CallInst *Orig) {
   Value *X = Orig->getOperand(1);
   Type *Ty = X->getType();
   IRBuilder<> Builder(Orig);
+  Builder.setFastMathFlags(Orig->getFastMathFlags());
 
   Value *Tan = Builder.CreateFDiv(Y, X);
 
-  Value *Atan =
+  CallInst *Atan =
       Builder.CreateIntrinsic(Ty, Intrinsic::atan, {Tan}, nullptr, "Elt.Atan");
+  Atan->setTailCall(Orig->isTailCall());
+  Atan->setAttributes(Orig->getAttributes());
 
+  // Modify atan result based on https://en.wikipedia.org/wiki/Atan2.
   Constant *Pi = ConstantFP::get(Ty, llvm::numbers::pi);
   Constant *HalfPi = ConstantFP::get(Ty, llvm::numbers::pi / 2);
   Constant *NegHalfPi = ConstantFP::get(Ty, -llvm::numbers::pi / 2);
   Constant *Zero = ConstantFP::get(Ty, 0);
-
   Value *AtanAddPi = Builder.CreateFAdd(Atan, Pi);
   Value *AtanSubPi = Builder.CreateFSub(Atan, Pi);
 
+  // x > 0 -> atan.
   Value *Result = Atan;
-
   Value *XLt0 = Builder.CreateFCmpOLT(X, Zero);
   Value *XEq0 = Builder.CreateFCmpOEQ(X, Zero);
-
   Value *YGe0 = Builder.CreateFCmpOGE(Y, Zero);
   Value *YLt0 = Builder.CreateFCmpOLT(Y, Zero);
 
+  // x < 0, y >= 0 -> atan + pi.
   Value *XLt0AndYGe0 = Builder.CreateAnd(XLt0, YGe0);
   Result = Builder.CreateSelect(XLt0AndYGe0, AtanAddPi, Result);
 
+  // x < 0, y < 0 -> atan - pi.
   Value *XLt0AndYLt0 = Builder.CreateAnd(XLt0, YLt0);
   Result = Builder.CreateSelect(XLt0AndYLt0, AtanSubPi, Result);
 
+  // x == 0, y < 0 -> -pi/2
   Value *XEq0AndYLt0 = Builder.CreateAnd(XEq0, YLt0);
   Result = Builder.CreateSelect(XEq0AndYLt0, NegHalfPi, Result);
 
+  // x == 0, y >= 0 -> pi/2
   Value *XEq0AndYGe0 = Builder.CreateAnd(XEq0, YGe0);
   Result = Builder.CreateSelect(XEq0AndYGe0, HalfPi, Result);
 

>From d5d7445e1e23695d1a1ff0ecc8faa6ad1a0201ee Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Wed, 18 Sep 2024 21:04:37 -0700
Subject: [PATCH 3/4] Add expansion-only test

---
 llvm/test/CodeGen/DirectX/atan2.ll | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll
index 32b9e60661cc07..708ccbbb05162d 100644
--- a/llvm/test/CodeGen/DirectX/atan2.ll
+++ b/llvm/test/CodeGen/DirectX/atan2.ll
@@ -1,19 +1,21 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
 
 ; Make sure correct dxil expansions for atan2 are generated for float and half.
 
 define noundef float @atan2_float(float noundef %y, float noundef %x) {
 entry:
 ; CHECK: [[DIV:%.+]] = fdiv float %y, %x
-; CHECK: [[TAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]])
-; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[TAN]], 0x400921FB60000000
-; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[TAN]], 0x400921FB60000000
+; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]])
+; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]])
+; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000
+; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000
 ; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00
 ; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 
 ; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 
 ; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00
 ; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
-; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[TAN]]
+; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]]
 ; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
 ; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]]
 ; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
@@ -28,15 +30,16 @@ entry:
 define noundef half @atan2_half(half noundef %y, half noundef %x) {
 entry:
 ; CHECK: [[DIV:%.+]] = fdiv half %y, %x
-; CHECK: [[TAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]])
-; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[TAN]], 0xH4248
-; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[TAN]], 0xH4248
+; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]])
+; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]])
+; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248
+; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248
 ; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000
 ; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 
 ; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 
 ; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000
 ; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
-; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[TAN]]
+; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]]
 ; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
 ; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]]
 ; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]

>From 382930f64f48f84d806db0192f316e5ec2dd26c6 Mon Sep 17 00:00:00 2001
From: Tex Riddell <texr at microsoft.com>
Date: Thu, 19 Sep 2024 15:47:20 -0700
Subject: [PATCH 4/4] Add vector case

---
 llvm/test/CodeGen/DirectX/atan2.ll | 32 ++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll
index 708ccbbb05162d..9d86f87f3ed50e 100644
--- a/llvm/test/CodeGen/DirectX/atan2.ll
+++ b/llvm/test/CodeGen/DirectX/atan2.ll
@@ -51,5 +51,37 @@ entry:
   ret half %elt.atan2
 }
 
+define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) {
+entry:
+; Just Expansion, no scalarization or lowering:
+; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]])
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000>
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000>
+; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer
+; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]]
+; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]]
+; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> <float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000>, <4 x float> [[SELECT_SUB_PI]]
+; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> <float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000>, <4 x float> [[SELECT_NEGHPI]]
+; EXPCHECK: ret <4 x float> [[SELECT_HPI]]
+
+; Scalarization occurs after expansion, so atan scalarization is tested separately.
+; Expansion, scalarization and lowering:
+; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls.
+; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}})
+; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17,
+
+  %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x)
+  ret <4 x float> %elt.atan2
+}
+
 declare half @llvm.atan2.f16(half, half)
 declare float @llvm.atan2.f32(float, float)
+declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>)