[clang] [llvm] [HLSL] Add matrix support to atan2 (PR #194984)

Fri May 8 11:16:32 PDT 2026

https://github.com/joaosaffran updated https://github.com/llvm/llvm-project/pull/194984

>From babda1fba31e2b04f4b7934e98796ddb9fb4bb02 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Tue, 28 Apr 2026 15:07:52 -0700
Subject: [PATCH 01/20] make it support float

---
 clang/lib/Sema/SemaHLSL.cpp                | 10 ++++----
 clang/test/CodeGenHLSL/builtins/atan2.hlsl | 27 ++++++++++++++++++++++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index aba1c5072a5fc..bb996d291675e 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3232,10 +3232,12 @@ static bool CheckFloatRepresentation(Sema *S, SourceLocation Loc,
 static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
                                            int ArgOrdinal,
                                            clang::QualType PassedType) {
-  clang::QualType BaseType =
-      PassedType->isVectorType()
-          ? PassedType->castAs<clang::VectorType>()->getElementType()
-          : PassedType;
+  clang::QualType BaseType = PassedType;
+  if (PassedType->isVectorType())
+    BaseType = PassedType->castAs<clang::VectorType>()->getElementType();
+  else if (PassedType->isMatrixType())
+    BaseType = PassedType->castAs<clang::MatrixType>()->getElementType();
+
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)
            << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0
diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
index 512b44a5780db..986ddc75b4f8e 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
@@ -34,6 +34,20 @@ half4 test_atan2_half4 (half4 p0, half4 p1) {
   return atan2(p0, p1);
 }
 
+// CHECK-LABEL: test_atan2_half4x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <16 x half> @llvm.atan2.v16f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+half2x3 test_atan2_half2x3 (half2x3 p0, half2x3 p1) {
+  return atan2(p0, p1);
+}
+
 // CHECK-LABEL: test_atan2_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.atan2.f32
 float test_atan2_float (float p0, float p1) {
@@ -57,3 +71,16 @@ float3 test_atan2_float3 (float3 p0, float3 p1) {
 float4 test_atan2_float4 (float4 p0, float4 p1) {
   return atan2(p0, p1);
 }
+
+// CHECK-LABEL: test_atan2_float4x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+float4x4 test_atan2_float4x4 (float4x4 p0, float4x4 p1) {
+  return atan2(p0, p1);
+}
+
+
+// CHECK-LABEL: test_atan2_float2x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+float2x3 test_atan2_float2x3 (float2x3 p0, float2x3 p1) {
+  return atan2(p0, p1);
+}

>From 43586d11f49f8a945fe7f4aa690d1e7eaff450f0 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Tue, 28 Apr 2026 18:01:58 -0700
Subject: [PATCH 02/20] adding matrix impl

---
 clang/lib/Headers/hlsl/hlsl_intrinsics.h             |  6 ++++++
 clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl | 10 ++++++++++
 2 files changed, 16 insertions(+)

diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index cced7b0eabb1f..eb2ebe485b7de 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -309,5 +309,11 @@ constexpr matrix<T, R, C> mul(matrix<T, R, C> x, T y) {
   return x * y;
 }
 
+template <typename T, int R, int C>
+constexpr matrix<float, R, C> atan2(matrix<T, R, C> y, matrix<T, R, C> x) {
+  return __builtin_elementwise_atan2((matrix<float, R, C>)y,
+                                     (matrix<float, R, C>)x);
+}
+
 } // namespace hlsl
 #endif //_HLSL_HLSL_INTRINSICS_H_
diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index 85ff75110a78e..0779ab2d13d7e 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -181,3 +181,13 @@ float3 test_atan2_uint64_t3 (uint64_t3 p0, uint64_t3 p1) {
 float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
+
+
+// CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) {
+  return atan2(p0, p1);
+}

>From c4a5db93bdc574b7a1f9f1113413537884106bb8 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Wed, 29 Apr 2026 16:35:41 -0700
Subject: [PATCH 03/20] add tests and overloads

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  |  7 +++++++
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  6 ------
 clang/lib/Sema/SemaHLSL.cpp                   |  4 ++--
 .../CodeGenHLSL/builtins/atan2-overloads.hlsl | 19 ++++++++++++++++++-
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index ee243abef6a41..e916228f1cc11 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -230,6 +230,12 @@ namespace hlsl {
     return fn((float4)V1, (float4)V2, (float4)V3);                             \
   }
 
+#define _DXC_COMPAT_BINARY_MATRIX_OVERLOADS(fn, ty)                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  template <typename T, int R, int C>                                          \
+  constexpr matrix<ty, R, C> fn(matrix<T, R, C> y, matrix<T, R, C> x) {        \
+    return fn((matrix<ty, R, C>)y, (matrix<ty, R, C>)x);                       \
+  }
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//
@@ -257,6 +263,7 @@ _DXC_COMPAT_UNARY_INTEGER_OVERLOADS(atan)
 
 _DXC_COMPAT_BINARY_DOUBLE_OVERLOADS(atan2)
 _DXC_COMPAT_BINARY_INTEGER_OVERLOADS(atan2)
+_DXC_COMPAT_BINARY_MATRIX_OVERLOADS(atan2, float)
 
 //===----------------------------------------------------------------------===//
 // ceil builtins overloads
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index eb2ebe485b7de..cced7b0eabb1f 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -309,11 +309,5 @@ constexpr matrix<T, R, C> mul(matrix<T, R, C> x, T y) {
   return x * y;
 }
 
-template <typename T, int R, int C>
-constexpr matrix<float, R, C> atan2(matrix<T, R, C> y, matrix<T, R, C> x) {
-  return __builtin_elementwise_atan2((matrix<float, R, C>)y,
-                                     (matrix<float, R, C>)x);
-}
-
 } // namespace hlsl
 #endif //_HLSL_HLSL_INTRINSICS_H_
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index bb996d291675e..368a54b44c2d5 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3240,8 +3240,8 @@ static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
 
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)
-           << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0
-           << /* half or float */ 2 << PassedType;
+           << ArgOrdinal << /* scalar, vector or matrix of */ 5
+           << /* no int */ 0 << /* half or float */ 2 << PassedType;
   return false;
 }
 
diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index 0779ab2d13d7e..d36d2aebcf4c8 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -38,6 +38,15 @@ float4 test_atan2_double4 (double4 p0, double4 p1) {
   return atan2(p0, p1);
 }
 
+// CHECK: define [[FNATTRS]] <16 x float> @_Z20test_atan2_double4x4u11matrix_typeILj4ELj4EdES_(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_double4x4 (double4x4 p0, double4x4 p1) {
+  return atan2(p0, p1);
+}
+
 // CHECK: define [[FNATTRS]] float @_Z14test_atan2_intii(
 // CHECK:    [[CONVI:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK:    [[CONV1I:%.*]] = sitofp i32 %{{.*}} to float
@@ -182,7 +191,6 @@ float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
 
-
 // CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
 // CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
 // CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
@@ -191,3 +199,12 @@ float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
 float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) {
   return atan2(p0, p1);
 }
+
+// CHECK: define [[FNATTRS]] <16 x float> @_Z22test_atan2_uint64_t4x4u11matrix_typeILj4ELj4EmES_(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint64_t4x4 (uint64_t4x4 p0, uint64_t4x4 p1) {
+  return atan2(p0, p1);
+}

>From 7ccd15fcae5d9a517a9a154944503de1ed42f5e9 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Wed, 29 Apr 2026 16:55:02 -0700
Subject: [PATCH 04/20] make macro match other's

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  | 39 ++++++++++++++++---
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index e916228f1cc11..0c4963b495b51 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -230,11 +230,39 @@ namespace hlsl {
     return fn((float4)V1, (float4)V2, (float4)V3);                             \
   }
 
-#define _DXC_COMPAT_BINARY_MATRIX_OVERLOADS(fn, ty)                            \
+#define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
+  template <uint R, uint C>                                                    \
+  constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
+                                   matrix<double, R, C> x) {                   \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }
+
+#define _DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(fn)                        \
+  template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr matrix<float, R, C> fn(matrix<int, R, C> y, matrix<int, R, C> x) { \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }                                                                            \
+                                                                               \
+  template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr matrix<float, R, C> fn(matrix<uint, R, C> y,                       \
+                                   matrix<uint, R, C> x) {                     \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }                                                                            \
+                                                                               \
+  template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr matrix<float, R, C> fn(matrix<int64_t, R, C> y,                    \
+                                   matrix<int64_t, R, C> x) {                  \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }                                                                            \
+                                                                               \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  template <typename T, int R, int C>                                          \
-  constexpr matrix<ty, R, C> fn(matrix<T, R, C> y, matrix<T, R, C> x) {        \
-    return fn((matrix<ty, R, C>)y, (matrix<ty, R, C>)x);                       \
+  constexpr matrix<float, R, C> fn(matrix<uint64_t, R, C> y,                   \
+                                   matrix<uint64_t, R, C> x) {                 \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
@@ -263,7 +291,8 @@ _DXC_COMPAT_UNARY_INTEGER_OVERLOADS(atan)
 
 _DXC_COMPAT_BINARY_DOUBLE_OVERLOADS(atan2)
 _DXC_COMPAT_BINARY_INTEGER_OVERLOADS(atan2)
-_DXC_COMPAT_BINARY_MATRIX_OVERLOADS(atan2, float)
+_DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(atan2)
+_DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(atan2)
 
 //===----------------------------------------------------------------------===//
 // ceil builtins overloads

>From fb3072ab020b058f6f21646aa3dd6af2f86c1803 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Wed, 29 Apr 2026 17:02:28 -0700
Subject: [PATCH 05/20] add more tests

---
 .../CodeGenHLSL/builtins/atan2-overloads.hlsl  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index d36d2aebcf4c8..6ecad28850b00 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -191,6 +191,24 @@ float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
 
+// CHECK: define [[FNATTRS]] <16 x float> @_Z19test_atan2_int_t4x4u11matrix_typeILj4ELj4EiES_(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int_t4x4 (int4x4 p0, int4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_Z20test_atan2_uint_t4x4u11matrix_typeILj4ELj4EjES_(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint_t4x4 (uint4x4 p0, uint4x4 p1) {
+  return atan2(p0, p1);
+}
+
 // CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
 // CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
 // CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>

>From fe8cbb5342b9c8db8bd0f858ec533f115863057c Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 11:06:45 -0700
Subject: [PATCH 06/20] adding sema tests

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  |  1 +
 .../binary-compat-overload-warnings.hlsl      | 27 ++++++++++++++++++-
 .../BuiltIns/half-float-only-errors2.hlsl     |  6 +++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index 0c4963b495b51..cddd940d1083e 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -232,6 +232,7 @@ namespace hlsl {
 
 #define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
   template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
   constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
                                    matrix<double, R, C> x) {                   \
     return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
diff --git a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
index 27bb683825de8..7b93ea089d854 100644
--- a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=atan2 %s 2>&1 | FileCheck %s -DFUNC=atan2
+// RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=atan2 %s 2>&1 | FileCheck %s -DFUNC=atan2 --check-prefixes=CHECK,ATAN2
 // RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=pow %s 2>&1 | FileCheck %s -DFUNC=pow
 // RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=step %s 2>&1 | FileCheck %s -DFUNC=step
 
@@ -23,6 +23,11 @@ float4 test_binary_double4(double4 p0) {
   return FUNC(p0, p0);
 }
 
+float4x4 test_binary_double4x4(double4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x 64 bit API lowering for [[FUNC]] is deprecated. Explicitly cast parameters to 32 or 16 bit types.
+  return FUNC(p0, p0);
+}
+
 // binary integer overloads
 // only test scalar ones for brevity
 float test_binary_int(int p0) {
@@ -44,3 +49,23 @@ float test_binary_int(uint64_t p0) {
   // CHECK: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
+
+float4x4 test_binary_uint4x4(uint4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
+
+float4x4 test_binary_int4x4(int4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
+
+float4x4 test_binary_int64_t4x4(int64_t4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
+
+float4x4 test_binary_uint64_t4x4(uint64_t4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
index 9e10e1afa9385..d7a2f15f6baa8 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
@@ -11,3 +11,9 @@ double2 test_vec_double_builtin(double2 p0, double2 p1) {
     return TEST_FUNC(p0, p1);
   // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2' (aka 'vector<double, 2>'))}}
 }
+
+// Temporary matrix workarround until we have proper matrix support in the builtins.
+double2x2 test_vec_double_builtin(double2x2 p0, double2x2 p1) {
+    return __builtin_elementwise_atan2(p0, p1);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2x2' (aka 'matrix<double, 2, 2>'))}}
+}

>From 93a09ea2a85848ce27a56591f12a6749f408ad13 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 13:51:01 -0700
Subject: [PATCH 07/20] add matrix overloads

---
 clang/include/clang/Basic/HLSLIntrinsics.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/HLSLIntrinsics.td b/clang/include/clang/Basic/HLSLIntrinsics.td
index 144b27cab7398..cc1e30d684d3e 100644
--- a/clang/include/clang/Basic/HLSLIntrinsics.td
+++ b/clang/include/clang/Basic/HLSLIntrinsics.td
@@ -457,7 +457,7 @@ determine the correct quadrant.
 \param x The x-coordinate.
 }];
   let VaryingTypes = [HalfTy, FloatTy];
-  let VaryingMatDims = [];
+  let VaryingMatDims = AllMatDims;
 }
 
 // Returns the smallest integer value that is greater than or equal to the

>From e0ba50390bf11d79eda7e62cd99d4df18da50dec Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 14:12:15 -0700
Subject: [PATCH 08/20] address comments

---
 clang/include/clang/Basic/HLSLIntrinsics.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/include/clang/Basic/HLSLIntrinsics.td b/clang/include/clang/Basic/HLSLIntrinsics.td
index cc1e30d684d3e..6084a6f92180e 100644
--- a/clang/include/clang/Basic/HLSLIntrinsics.td
+++ b/clang/include/clang/Basic/HLSLIntrinsics.td
@@ -457,7 +457,6 @@ determine the correct quadrant.
 \param x The x-coordinate.
 }];
   let VaryingTypes = [HalfTy, FloatTy];
-  let VaryingMatDims = AllMatDims;
 }
 
 // Returns the smallest integer value that is greater than or equal to the

>From 7b10499001800a74940fd3eff0c5f7a8b7da24bc Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 15:13:57 -0700
Subject: [PATCH 09/20] add _mat tests

---
 clang/test/CodeGenHLSL/builtins/atan2.hlsl    |  27 ---
 .../test/CodeGenHLSL/builtins/atan2_mat.hlsl  | 215 ++++++++++++++++++
 .../test/SemaHLSL/BuiltIns/atan2-errors.hlsl  |   7 +
 .../BuiltIns/half-float-only-errors2.hlsl     |   6 -
 4 files changed, 222 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl

diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
index 986ddc75b4f8e..512b44a5780db 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
@@ -34,20 +34,6 @@ half4 test_atan2_half4 (half4 p0, half4 p1) {
   return atan2(p0, p1);
 }
 
-// CHECK-LABEL: test_atan2_half4x4
-// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <16 x half> @llvm.atan2.v16f16
-// NO_HALF: call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
-half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
-  return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_half2x3
-// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
-// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
-half2x3 test_atan2_half2x3 (half2x3 p0, half2x3 p1) {
-  return atan2(p0, p1);
-}
-
 // CHECK-LABEL: test_atan2_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.atan2.f32
 float test_atan2_float (float p0, float p1) {
@@ -71,16 +57,3 @@ float3 test_atan2_float3 (float3 p0, float3 p1) {
 float4 test_atan2_float4 (float4 p0, float4 p1) {
   return atan2(p0, p1);
 }
-
-// CHECK-LABEL: test_atan2_float4x4
-// CHECK:  call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
-float4x4 test_atan2_float4x4 (float4x4 p0, float4x4 p1) {
-  return atan2(p0, p1);
-}
-
-
-// CHECK-LABEL: test_atan2_float2x3
-// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
-float2x3 test_atan2_float2x3 (float2x3 p0, float2x3 p1) {
-  return atan2(p0, p1);
-}
diff --git a/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl b/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
new file mode 100644
index 0000000000000..db9439edfcc11
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
@@ -0,0 +1,215 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// CHECK-LABEL: test_atan2_half1x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <1 x half> @llvm.atan2.v1f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <1 x float> @llvm.atan2.v1f32
+half1x1 test_atan2_half1x1 (half1x1 p0, half1x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half1x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.atan2.v2f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+half1x2 test_atan2_half1x2 (half1x2 p0, half1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half1x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.atan2.v3f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+half1x3 test_atan2_half1x3 (half1x3 p0, half1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half1x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.atan2.v4f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+half1x4 test_atan2_half1x4 (half1x4 p0, half1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.atan2.v2f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+half2x1 test_atan2_half2x1 (half2x1 p0, half2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.atan2.v4f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+half2x2 test_atan2_half2x2 (half2x2 p0, half2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+half2x3 test_atan2_half2x3 (half2x3 p0, half2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <8 x half> @llvm.atan2.v8f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+half2x4 test_atan2_half2x4 (half2x4 p0, half2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.atan2.v3f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+half3x1 test_atan2_half3x1 (half3x1 p0, half3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+half3x2 test_atan2_half3x2 (half3x2 p0, half3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <9 x half> @llvm.atan2.v9f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <9 x float> @llvm.atan2.v9f32
+half3x3 test_atan2_half3x3 (half3x3 p0, half3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <12 x half> @llvm.atan2.v12f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+half3x4 test_atan2_half3x4 (half3x4 p0, half3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.atan2.v4f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+half4x1 test_atan2_half4x1 (half4x1 p0, half4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <8 x half> @llvm.atan2.v8f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+half4x2 test_atan2_half4x2 (half4x2 p0, half4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <12 x half> @llvm.atan2.v12f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+half4x3 test_atan2_half4x3 (half4x3 p0, half4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <16 x half> @llvm.atan2.v16f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <1 x float> @llvm.atan2.v1f32
+float1x1 test_atan2_float1x1 (float1x1 p0, float1x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+float1x2 test_atan2_float1x2 (float1x2 p0, float1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+float1x3 test_atan2_float1x3 (float1x3 p0, float1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+float1x4 test_atan2_float1x4 (float1x4 p0, float1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+float2x1 test_atan2_float2x1 (float2x1 p0, float2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+float2x2 test_atan2_float2x2 (float2x2 p0, float2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+float2x3 test_atan2_float2x3 (float2x3 p0, float2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+float2x4 test_atan2_float2x4 (float2x4 p0, float2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+float3x1 test_atan2_float3x1 (float3x1 p0, float3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+float3x2 test_atan2_float3x2 (float3x2 p0, float3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <9 x float> @llvm.atan2.v9f32
+float3x3 test_atan2_float3x3 (float3x3 p0, float3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+float3x4 test_atan2_float3x4 (float3x4 p0, float3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+float4x1 test_atan2_float4x1 (float4x1 p0, float4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+float4x2 test_atan2_float4x2 (float4x2 p0, float4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+float4x3 test_atan2_float4x3 (float4x3 p0, float4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+float4x4 test_atan2_float4x4 (float4x4 p0, float4x4 p1) {
+  return atan2(p0, p1);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl
new file mode 100644
index 0000000000000..19467d99d2292
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
+
+
+double2x2 test_vec_double_builtin(double2x2 p0, double2x2 p1) {
+    return __builtin_elementwise_atan2(p0, p1);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2x2' (aka 'matrix<double, 2, 2>'))}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
index d7a2f15f6baa8..9e10e1afa9385 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
@@ -11,9 +11,3 @@ double2 test_vec_double_builtin(double2 p0, double2 p1) {
     return TEST_FUNC(p0, p1);
   // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2' (aka 'vector<double, 2>'))}}
 }
-
-// Temporary matrix workarround until we have proper matrix support in the builtins.
-double2x2 test_vec_double_builtin(double2x2 p0, double2x2 p1) {
-    return __builtin_elementwise_atan2(p0, p1);
-  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2x2' (aka 'matrix<double, 2, 2>'))}}
-}

>From c328726c2425824319b31bde3e14b1dbe9a8bd37 Mon Sep 17 00:00:00 2001
From: joaosaffran <joaosaffran at gmail.com>
Date: Thu, 30 Apr 2026 15:18:46 -0700
Subject: [PATCH 10/20] Apply suggestion from @Icohedron

Co-authored-by: Deric C. <cheung.deric at gmail.com>
---
 clang/lib/Headers/hlsl/hlsl_compat_overloads.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index cddd940d1083e..12223aee6a4e7 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -265,6 +265,7 @@ namespace hlsl {
                                    matrix<uint64_t, R, C> x) {                 \
     return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }
+
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//

>From 9c0a5bcef00b9400fd85e9eca2cf0afb24c5daaf Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 15:20:44 -0700
Subject: [PATCH 11/20] clean up

---
 clang/lib/Sema/SemaHLSL.cpp                                   | 4 ++--
 .../BuiltIns/{atan2-errors.hlsl => atan2-errors_mat.hlsl}     | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename clang/test/SemaHLSL/BuiltIns/{atan2-errors.hlsl => atan2-errors_mat.hlsl} (100%)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 368a54b44c2d5..bb996d291675e 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3240,8 +3240,8 @@ static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
 
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)
-           << ArgOrdinal << /* scalar, vector or matrix of */ 5
-           << /* no int */ 0 << /* half or float */ 2 << PassedType;
+           << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0
+           << /* half or float */ 2 << PassedType;
   return false;
 }
 
diff --git a/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/atan2-errors_mat.hlsl
similarity index 100%
rename from clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl
rename to clang/test/SemaHLSL/BuiltIns/atan2-errors_mat.hlsl

>From a31a8fc0cc64a66631c9be680158b2944c3521b7 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Fri, 1 May 2026 11:35:19 -0700
Subject: [PATCH 12/20] remove templates

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  | 402 ++++++++++-
 .../builtins/atan2-overloads_mat.hlsl         | 679 ++++++++++++++++++
 2 files changed, 1061 insertions(+), 20 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index 12223aee6a4e7..08af61bed7b9a 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -231,41 +231,403 @@ namespace hlsl {
   }
 
 #define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
-                                   matrix<double, R, C> x) {                   \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float1x1 fn(double1x1 y, double1x1 x) {                            \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float1x2 fn(double1x2 y, double1x2 x) {                            \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float1x3 fn(double1x3 y, double1x3 x) {                            \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float1x4 fn(double1x4 y, double1x4 x) {                            \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x1 fn(double2x1 y, double2x1 x) {                            \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x2 fn(double2x2 y, double2x2 x) {                            \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x3 fn(double2x3 y, double2x3 x) {                            \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x4 fn(double2x4 y, double2x4 x) {                            \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x1 fn(double3x1 y, double3x1 x) {                            \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x2 fn(double3x2 y, double3x2 x) {                            \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x3 fn(double3x3 y, double3x3 x) {                            \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x4 fn(double3x4 y, double3x4 x) {                            \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x1 fn(double4x1 y, double4x1 x) {                            \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x2 fn(double4x2 y, double4x2 x) {                            \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x3 fn(double4x3 y, double4x3 x) {                            \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x4 fn(double4x4 y, double4x4 x) {                            \
+    return fn((float4x4)y, (float4x4)x);                                       \
   }
 
 #define _DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(fn)                        \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<int, R, C> y, matrix<int, R, C> x) { \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float1x1 fn(int1x1 y, int1x1 x) {                                  \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(int1x2 y, int1x2 x) {                                  \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(int1x3 y, int1x3 x) {                                  \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(int1x4 y, int1x4 x) {                                  \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(int2x1 y, int2x1 x) {                                  \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x2 fn(int2x2 y, int2x2 x) {                                  \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(int2x3 y, int2x3 x) {                                  \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(int2x4 y, int2x4 x) {                                  \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(int3x1 y, int3x1 x) {                                  \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(int3x2 y, int3x2 x) {                                  \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(int3x3 y, int3x3 x) {                                  \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(int3x4 y, int3x4 x) {                                  \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(int4x1 y, int4x1 x) {                                  \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(int4x2 y, int4x2 x) {                                  \
+    return fn((float4x2)y, (float4x2)x);                                       \
   }                                                                            \
                                                                                \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<uint, R, C> y,                       \
-                                   matrix<uint, R, C> x) {                     \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float4x3 fn(int4x3 y, int4x3 x) {                                  \
+    return fn((float4x3)y, (float4x3)x);                                       \
   }                                                                            \
                                                                                \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<int64_t, R, C> y,                    \
-                                   matrix<int64_t, R, C> x) {                  \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float4x4 fn(int4x4 y, int4x4 x) {                                  \
+    return fn((float4x4)y, (float4x4)x);                                       \
+  }                                                                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x1 fn(uint1x1 y, uint1x1 x) {                                \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(uint1x2 y, uint1x2 x) {                                \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(uint1x3 y, uint1x3 x) {                                \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(uint1x4 y, uint1x4 x) {                                \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(uint2x1 y, uint2x1 x) {                                \
+    return fn((float2x1)y, (float2x1)x);                                       \
   }                                                                            \
                                                                                \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<uint64_t, R, C> y,                   \
-                                   matrix<uint64_t, R, C> x) {                 \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float2x2 fn(uint2x2 y, uint2x2 x) {                                \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(uint2x3 y, uint2x3 x) {                                \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(uint2x4 y, uint2x4 x) {                                \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(uint3x1 y, uint3x1 x) {                                \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(uint3x2 y, uint3x2 x) {                                \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(uint3x3 y, uint3x3 x) {                                \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(uint3x4 y, uint3x4 x) {                                \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(uint4x1 y, uint4x1 x) {                                \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(uint4x2 y, uint4x2 x) {                                \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x3 fn(uint4x3 y, uint4x3 x) {                                \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x4 fn(uint4x4 y, uint4x4 x) {                                \
+    return fn((float4x4)y, (float4x4)x);                                       \
+  }                                                                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x1 fn(int64_t1x1 y, int64_t1x1 x) {                          \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(int64_t1x2 y, int64_t1x2 x) {                          \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(int64_t1x3 y, int64_t1x3 x) {                          \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(int64_t1x4 y, int64_t1x4 x) {                          \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(int64_t2x1 y, int64_t2x1 x) {                          \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x2 fn(int64_t2x2 y, int64_t2x2 x) {                          \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(int64_t2x3 y, int64_t2x3 x) {                          \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(int64_t2x4 y, int64_t2x4 x) {                          \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(int64_t3x1 y, int64_t3x1 x) {                          \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(int64_t3x2 y, int64_t3x2 x) {                          \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(int64_t3x3 y, int64_t3x3 x) {                          \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(int64_t3x4 y, int64_t3x4 x) {                          \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(int64_t4x1 y, int64_t4x1 x) {                          \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(int64_t4x2 y, int64_t4x2 x) {                          \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x3 fn(int64_t4x3 y, int64_t4x3 x) {                          \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x4 fn(int64_t4x4 y, int64_t4x4 x) {                          \
+    return fn((float4x4)y, (float4x4)x);                                       \
+  }                                                                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x1 fn(uint64_t1x1 y, uint64_t1x1 x) {                        \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(uint64_t1x2 y, uint64_t1x2 x) {                        \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(uint64_t1x3 y, uint64_t1x3 x) {                        \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(uint64_t1x4 y, uint64_t1x4 x) {                        \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(uint64_t2x1 y, uint64_t2x1 x) {                        \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x2 fn(uint64_t2x2 y, uint64_t2x2 x) {                        \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(uint64_t2x3 y, uint64_t2x3 x) {                        \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(uint64_t2x4 y, uint64_t2x4 x) {                        \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(uint64_t3x1 y, uint64_t3x1 x) {                        \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(uint64_t3x2 y, uint64_t3x2 x) {                        \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(uint64_t3x3 y, uint64_t3x3 x) {                        \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(uint64_t3x4 y, uint64_t3x4 x) {                        \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(uint64_t4x1 y, uint64_t4x1 x) {                        \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(uint64_t4x2 y, uint64_t4x2 x) {                        \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x3 fn(uint64_t4x3 y, uint64_t4x3 x) {                        \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x4 fn(uint64_t4x4 y, uint64_t4x4 x) {                        \
+    return fn((float4x4)y, (float4x4)x);                                       \
   }
-
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl
new file mode 100644
index 0000000000000..5cf473fa13d98
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl
@@ -0,0 +1,679 @@
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm  \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_double1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_double1x2 (double1x2 p0, double1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_double1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_double1x3 (double1x3 p0, double1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_double1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_double1x4 (double1x4 p0, double1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_double2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_double2x1 (double2x1 p0, double2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_double2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_double2x2 (double2x2 p0, double2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_double2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_double2x3 (double2x3 p0, double2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_double2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_double2x4 (double2x4 p0, double2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_double3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_double3x1 (double3x1 p0, double3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_double3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_double3x2 (double3x2 p0, double3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_double3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <9 x double> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <9 x double> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_double3x3 (double3x3 p0, double3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_double3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_double3x4 (double3x4 p0, double3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_double4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_double4x1 (double4x1 p0, double4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_double4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_double4x2 (double4x2 p0, double4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_double4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_double4x3 (double4x3 p0, double4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_double4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_double4x4 (double4x4 p0, double4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_uint1x2 (uint1x2 p0, uint1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_uint1x3 (uint1x3 p0, uint1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_uint1x4 (uint1x4 p0, uint1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_uint2x1 (uint2x1 p0, uint2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_uint2x2 (uint2x2 p0, uint2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_uint2x3 (uint2x3 p0, uint2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_uint2x4 (uint2x4 p0, uint2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_uint3x1 (uint3x1 p0, uint3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_uint3x2 (uint3x2 p0, uint3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_uint3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_uint3x3 (uint3x3 p0, uint3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_uint3x4 (uint3x4 p0, uint3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_uint4x1 (uint4x1 p0, uint4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_uint4x2 (uint4x2 p0, uint4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_uint4x3 (uint4x3 p0, uint4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_uint4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint4x4 (uint4x4 p0, uint4x4 p1) {
+  return atan2(p0, p1);
+}
+
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_int1x2 (int1x2 p0, int1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_int1x3 (int1x3 p0, int1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_int1x4 (int1x4 p0, int1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_int2x1 (int2x1 p0, int2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_int2x2 (int2x2 p0, int2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_int2x3 (int2x3 p0, int2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_int2x4 (int2x4 p0, int2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_int3x1 (int3x1 p0, int3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_int3x2 (int3x2 p0, int3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_int3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_int3x3 (int3x3 p0, int3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_int3x4 (int3x4 p0, int3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_int4x1 (int4x1 p0, int4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_int4x2 (int4x2 p0, int4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_int4x3 (int4x3 p0, int4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_int4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int4x4 (int4x4 p0, int4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int64_t1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_int64_t1x2 (int64_t1x2 p0, int64_t1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int64_t1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_int64_t1x3 (int64_t1x3 p0, int64_t1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int64_t1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_int64_t1x4 (int64_t1x4 p0, int64_t1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int64_t2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_int64_t2x1 (int64_t2x1 p0, int64_t2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int64_t2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_int64_t2x2 (int64_t2x2 p0, int64_t2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int64_t2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_int64_t2x3 (int64_t2x3 p0, int64_t2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int64_t2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_int64_t2x4 (int64_t2x4 p0, int64_t2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int64_t3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_int64_t3x1 (int64_t3x1 p0, int64_t3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int64_t3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_int64_t3x2 (int64_t3x2 p0, int64_t3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_int64_t3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_int64_t3x3 (int64_t3x3 p0, int64_t3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int64_t3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_int64_t3x4 (int64_t3x4 p0, int64_t3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int64_t4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_int64_t4x1 (int64_t4x1 p0, int64_t4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int64_t4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_int64_t4x2 (int64_t4x2 p0, int64_t4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int64_t4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_int64_t4x3 (int64_t4x3 p0, int64_t4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_int64_t4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint64_t1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_uint64_t1x2 (uint64_t1x2 p0, uint64_t1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint64_t1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_uint64_t1x3 (uint64_t1x3 p0, uint64_t1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint64_t1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_uint64_t1x4 (uint64_t1x4 p0, uint64_t1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint64_t2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_uint64_t2x1 (uint64_t2x1 p0, uint64_t2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint64_t2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_uint64_t2x2 (uint64_t2x2 p0, uint64_t2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint64_t2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_uint64_t2x3 (uint64_t2x3 p0, uint64_t2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint64_t2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_uint64_t2x4 (uint64_t2x4 p0, uint64_t2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint64_t3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_uint64_t3x1 (uint64_t3x1 p0, uint64_t3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint64_t3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_uint64_t3x2 (uint64_t3x2 p0, uint64_t3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_uint64_t3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_uint64_t3x3 (uint64_t3x3 p0, uint64_t3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint64_t3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_uint64_t3x4 (uint64_t3x4 p0, uint64_t3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint64_t4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_uint64_t4x1 (uint64_t4x1 p0, uint64_t4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint64_t4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_uint64_t4x2 (uint64_t4x2 p0, uint64_t4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint64_t4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_uint64_t4x3 (uint64_t4x3 p0, uint64_t4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_uint64_t4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint64_t4x4 (uint64_t4x4 p0, uint64_t4x4 p1) {
+  return atan2(p0, p1);
+}

>From 43d7cbc0b19c847b4c3328f8d4d2354114e504b2 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Fri, 1 May 2026 11:42:28 -0700
Subject: [PATCH 13/20] clean up

---
 .../CodeGenHLSL/builtins/atan2-overloads.hlsl | 45 -------------------
 1 file changed, 45 deletions(-)

diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index 6ecad28850b00..85ff75110a78e 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -38,15 +38,6 @@ float4 test_atan2_double4 (double4 p0, double4 p1) {
   return atan2(p0, p1);
 }
 
-// CHECK: define [[FNATTRS]] <16 x float> @_Z20test_atan2_double4x4u11matrix_typeILj4ELj4EdES_(
-// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
-// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
-// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
-// CHECK:    ret <16 x float> [[V5]]
-float4x4 test_atan2_double4x4 (double4x4 p0, double4x4 p1) {
-  return atan2(p0, p1);
-}
-
 // CHECK: define [[FNATTRS]] float @_Z14test_atan2_intii(
 // CHECK:    [[CONVI:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK:    [[CONV1I:%.*]] = sitofp i32 %{{.*}} to float
@@ -190,39 +181,3 @@ float3 test_atan2_uint64_t3 (uint64_t3 p0, uint64_t3 p1) {
 float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
-
-// CHECK: define [[FNATTRS]] <16 x float> @_Z19test_atan2_int_t4x4u11matrix_typeILj4ELj4EiES_(
-// CHECK:    [[CONVI:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
-// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
-// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
-// CHECK:    ret <16 x float> [[V5]]
-float4x4 test_atan2_int_t4x4 (int4x4 p0, int4x4 p1) {
-  return atan2(p0, p1);
-}
-
-// CHECK: define [[FNATTRS]] <16 x float> @_Z20test_atan2_uint_t4x4u11matrix_typeILj4ELj4EjES_(
-// CHECK:    [[CONVI:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
-// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
-// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
-// CHECK:    ret <16 x float> [[V5]]
-float4x4 test_atan2_uint_t4x4 (uint4x4 p0, uint4x4 p1) {
-  return atan2(p0, p1);
-}
-
-// CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
-// CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
-// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
-// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
-// CHECK:    ret <16 x float> [[V5]]
-float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) {
-  return atan2(p0, p1);
-}
-
-// CHECK: define [[FNATTRS]] <16 x float> @_Z22test_atan2_uint64_t4x4u11matrix_typeILj4ELj4EmES_(
-// CHECK:    [[CONVI:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
-// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
-// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
-// CHECK:    ret <16 x float> [[V5]]
-float4x4 test_atan2_uint64_t4x4 (uint64_t4x4 p0, uint64_t4x4 p1) {
-  return atan2(p0, p1);
-}

>From 810ceebc71eafbf5fea23bfb50638df5f3d8ecad Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Fri, 1 May 2026 17:08:51 -0700
Subject: [PATCH 14/20] fix tests

---
 .../test/CodeGenHLSL/builtins/atan2_mat.hlsl  | 13 ----
 .../binary-compat-overload-warnings.hlsl      | 10 +--
 llvm/test/CodeGen/DirectX/atan2.ll            | 64 +++++++++++++++++++
 3 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl b/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
index db9439edfcc11..f0d2517a000ee 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
@@ -6,13 +6,6 @@
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK-LABEL: test_atan2_half1x1
-// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <1 x half> @llvm.atan2.v1f16
-// NO_HALF: call reassoc nnan ninf nsz arcp afn <1 x float> @llvm.atan2.v1f32
-half1x1 test_atan2_half1x1 (half1x1 p0, half1x1 p1) {
-  return atan2(p0, p1);
-}
-
 // CHECK-LABEL: test_atan2_half1x2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.atan2.v2f16
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
@@ -118,12 +111,6 @@ half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
   return atan2(p0, p1);
 }
 
-// CHECK-LABEL: test_atan2_float1x1
-// CHECK:  call reassoc nnan ninf nsz arcp afn <1 x float> @llvm.atan2.v1f32
-float1x1 test_atan2_float1x1 (float1x1 p0, float1x1 p1) {
-  return atan2(p0, p1);
-}
-
 // CHECK-LABEL: test_atan2_float1x2
 // CHECK:  call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
 float1x2 test_atan2_float1x2 (float1x2 p0, float1x2 p1) {
diff --git a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
index 7b93ea089d854..fada02f4b7c1a 100644
--- a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
@@ -24,7 +24,7 @@ float4 test_binary_double4(double4 p0) {
 }
 
 float4x4 test_binary_double4x4(double4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x 64 bit API lowering for [[FUNC]] is deprecated. Explicitly cast parameters to 32 or 16 bit types.
+  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x 64 bit API lowering for [[FUNC]] is deprecated. Explicitly cast parameters to 32 or 16 bit types.
   return FUNC(p0, p0);
 }
 
@@ -51,21 +51,21 @@ float test_binary_int(uint64_t p0) {
 }
 
 float4x4 test_binary_uint4x4(uint4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
 
 float4x4 test_binary_int4x4(int4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
 
 float4x4 test_binary_int64_t4x4(int64_t4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
 
 float4x4 test_binary_uint64_t4x4(uint64_t4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll
index 8f51ab1b7a902..3a1a9d8fc80ac 100644
--- a/llvm/test/CodeGen/DirectX/atan2.ll
+++ b/llvm/test/CodeGen/DirectX/atan2.ll
@@ -82,6 +82,70 @@ entry:
   ret <4 x float> %elt.atan2
 }
 
+define noundef <16 x half> @atan2_half4x4(<16 x half> noundef %y, <16 x half> noundef %x) {
+entry:
+; Just Expansion, no scalarization or lowering:
+; EXPCHECK: [[DIV:%.+]] = fdiv <16 x half> %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call <16 x half> @llvm.atan.v16f16(<16 x half> [[DIV]])
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x half> [[ATAN]], splat (half 0xH4248)
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x half> [[ATAN]], splat (half 0xH4248)
+; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x half> %x, zeroinitializer
+; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x half> %x, zeroinitializer
+; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x half> %y, zeroinitializer
+; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <16 x half> %y, zeroinitializer
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <16 x i1> [[XLT0_AND_YGE0]], <16 x half> [[ADD_PI]], <16 x half> [[ATAN]]
+; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <16 x i1> [[XLT0_AND_YLT0]], <16 x half> [[SUB_PI]], <16 x half> [[SELECT_ADD_PI]]
+; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <16 x i1> [[XEQ0_AND_YLT0]], <16 x half> splat (half 0xHBE48), <16 x half> [[SELECT_SUB_PI]]
+; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_HPI:%.+]] = select <16 x i1> [[XEQ0_AND_YGE0]], <16 x half> splat (half 0xH3E48), <16 x half> [[SELECT_NEGHPI]]
+; EXPCHECK: ret <16 x half> [[SELECT_HPI]]
+
+; Scalarization occurs after expansion, so atan scalarization is tested separately.
+; Expansion, scalarization and lowering:
+; Just make sure this expands to exactly 16 scalar DXIL atan (OpCode=17) calls.
+; DOPCHECK-COUNT-16: call half @dx.op.unary.f16(i32 17, half %{{.*}})
+; DOPCHECK-NOT: call half @dx.op.unary.f16(i32 17,
+
+  %elt.atan2 = call <16 x half> @llvm.atan2.v16f16(<16 x half> %y, <16 x half> %x)
+  ret <16 x half> %elt.atan2
+}
+
+define noundef <16 x float> @atan2_float4x4(<16 x float> noundef %y, <16 x float> noundef %x) {
+entry:
+; Just Expansion, no scalarization or lowering:
+; EXPCHECK: [[DIV:%.+]] = fdiv <16 x float> %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call <16 x float> @llvm.atan.v16f32(<16 x float> [[DIV]])
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x float> [[ATAN]], splat (float 0x400921FB60000000)
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x float> [[ATAN]], splat (float 0x400921FB60000000)
+; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x float> %y, zeroinitializer
+; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <16 x float> %y, zeroinitializer
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <16 x i1> [[XLT0_AND_YGE0]], <16 x float> [[ADD_PI]], <16 x float> [[ATAN]]
+; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <16 x i1> [[XLT0_AND_YLT0]], <16 x float> [[SUB_PI]], <16 x float> [[SELECT_ADD_PI]]
+; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <16 x i1> [[XEQ0_AND_YLT0]], <16 x float> splat (float 0xBFF921FB60000000), <16 x float> [[SELECT_SUB_PI]]
+; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_HPI:%.+]] = select <16 x i1> [[XEQ0_AND_YGE0]], <16 x float> splat (float 0x3FF921FB60000000), <16 x float> [[SELECT_NEGHPI]]
+; EXPCHECK: ret <16 x float> [[SELECT_HPI]]
+
+; Scalarization occurs after expansion, so atan scalarization is tested separately.
+; Expansion, scalarization and lowering:
+; Just make sure this expands to exactly 16 scalar DXIL atan (OpCode=17) calls.
+; DOPCHECK-COUNT-16: call float @dx.op.unary.f32(i32 17, float %{{.*}})
+; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17,
+
+  %elt.atan2 = call <16 x float> @llvm.atan2.v16f32(<16 x float> %y, <16 x float> %x)
+  ret <16 x float> %elt.atan2
+}
+
 declare half @llvm.atan2.f16(half, half)
 declare float @llvm.atan2.f32(float, float)
 declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>)
+declare <16 x float> @llvm.atan2.v16f32(<16 x float>, <16 x float>)
+declare <16 x half> @llvm.atan2.v16f16(<16 x half>, <16 x half>)

>From f075a16ebc03ac65209d572dabd6942d60eacd13 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 7 May 2026 14:06:28 -0700
Subject: [PATCH 15/20] fix legalization issue and improve tests

---
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp  |  19 ++-
 .../SPIRV/hlsl-intrinsics/atan2_mat.ll        | 147 ++++++++++++++++++
 2 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2_mat.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 47ffecc4085ab..de91c751e0079 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -135,10 +135,22 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
 
   auto allFloatScalarsAndF16Vector2AndVector4s = {s16, s32, s64, v2s16, v4s16};
 
+  auto allFloatScalars = {s16, s32, s64};
+
   auto allFloatScalarsAndVectors = {
       s16,   s32,   s64,   v2s16, v2s32, v2s64, v3s16,  v3s32,  v3s64,
       v4s16, v4s32, v4s64, v8s16, v8s32, v8s64, v16s16, v16s32, v16s64};
 
+  auto allShaderFloatVectors = {v2s16, v2s32, v2s64, v3s16, v3s32,
+                                v3s64, v4s16, v4s32, v4s64};
+
+  auto allFloatVectors = {v2s16, v2s32, v2s64,  v3s16,  v3s32,
+                          v3s64, v4s16, v4s32,  v4s64,  v8s16,
+                          v8s32, v8s64, v16s16, v16s32, v16s64};
+
+  auto &allowedFloatVectorTypes =
+      ST.isShader() ? allShaderFloatVectors : allFloatVectors;
+
   auto allFloatAndIntScalarsAndPtrs = {s8, s16, s32, s64, p0,  p1,
                                        p2, p3,  p4,  p5,  p6,  p7,
                                        p8, p9,  p10, p11, p12, p13};
@@ -490,7 +502,12 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
                                G_FMINIMUM,
                                G_FMAXIMUM,
                                G_INTRINSIC_ROUNDEVEN})
-      .legalFor(allFloatScalarsAndVectors);
+      .legalFor(allFloatScalars)
+      .legalFor(allowedFloatVectorTypes)
+      .moreElementsToNextPow2(0)
+      .fewerElementsIf(vectorElementCountIsGreaterThan(0, MaxVectorSize),
+                       LegalizeMutations::changeElementCountTo(
+                           0, ElementCount::getFixed(MaxVectorSize)));
   // clang-format on
 
   getActionDefinitionsBuilder(G_FCOPYSIGN)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2_mat.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2_mat.ll
new file mode 100644
index 0000000000000..f377c32c0ab6a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2_mat.ll
@@ -0,0 +1,147 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %}
+
+; Vulkan shaders cannot use the Vector16 capability, so this test models a 4x4
+; matrix as [4 x <4 x float>] in LLVM IR and expects the element-wise atan2 to
+; be emitted row by row: four OpExtInst Atan2 calls on <4 x float> (and
+; likewise four on <4 x half> for the half variant).
+
+; CHECK-NOT: OpCapability Vector16
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: %[[#void:]] = OpTypeVoid
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#const_0:]] = OpConstant %[[#int_32]] 0
+; CHECK-DAG: %[[#const_1:]] = OpConstant %[[#int_32]] 1
+; CHECK-DAG: %[[#const_2:]] = OpConstant %[[#int_32]] 2
+; CHECK-DAG: %[[#const_3:]] = OpConstant %[[#int_32]] 3
+; CHECK-DAG: %[[#const_4:]] = OpConstant %[[#int_32]] 4
+; CHECK-DAG: %[[#arr_f32:]] = OpTypeArray %[[#vec4_float_32]] %[[#const_4]]
+; CHECK-DAG: %[[#arr_f16:]] = OpTypeArray %[[#vec4_float_16]] %[[#const_4]]
+; CHECK-DAG: %[[#ptr_arr_f32:]] = OpTypePointer Private %[[#arr_f32]]
+; CHECK-DAG: %[[#ptr_arr_f16:]] = OpTypePointer Private %[[#arr_f16]]
+; CHECK-DAG: %[[#ptr_vec4_f32:]] = OpTypePointer Private %[[#vec4_float_32]]
+; CHECK-DAG: %[[#ptr_vec4_f16:]] = OpTypePointer Private %[[#vec4_float_16]]
+; CHECK-DAG: %[[#fn_f32:]] = OpTypeFunction %[[#void]] %[[#ptr_arr_f32]] %[[#ptr_arr_f32]] %[[#ptr_arr_f32]]
+; CHECK-DAG: %[[#fn_f16:]] = OpTypeFunction %[[#void]] %[[#ptr_arr_f16]] %[[#ptr_arr_f16]] %[[#ptr_arr_f16]]
+
+define internal void @atan2_float4x4(ptr addrspace(10) %out, ptr addrspace(10) %a, ptr addrspace(10) %b) {
+entry:
+  ; CHECK: OpFunction %[[#void]] None %[[#fn_f32]]
+  ; CHECK: %[[#out_f32:]] = OpFunctionParameter %[[#ptr_arr_f32]]
+  ; CHECK: %[[#a_f32:]] = OpFunctionParameter %[[#ptr_arr_f32]]
+  ; CHECK: %[[#b_f32:]] = OpFunctionParameter %[[#ptr_arr_f32]]
+  ; CHECK: %[[#a0_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#a_f32]] %[[#const_0]]
+  ; CHECK: %[[#a1_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#a_f32]] %[[#const_1]]
+  ; CHECK: %[[#a2_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#a_f32]] %[[#const_2]]
+  ; CHECK: %[[#a3_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#a_f32]] %[[#const_3]]
+  ; CHECK: %[[#b0_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#b_f32]] %[[#const_0]]
+  ; CHECK: %[[#b1_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#b_f32]] %[[#const_1]]
+  ; CHECK: %[[#b2_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#b_f32]] %[[#const_2]]
+  ; CHECK: %[[#b3_ptr_f32:]] = OpAccessChain %[[#ptr_vec4_f32]] %[[#b_f32]] %[[#const_3]]
+  ; CHECK: %[[#a0_f32:]] = OpLoad %[[#vec4_float_32]] %[[#a0_ptr_f32]]
+  ; CHECK: %[[#a1_f32:]] = OpLoad %[[#vec4_float_32]] %[[#a1_ptr_f32]]
+  ; CHECK: %[[#a2_f32:]] = OpLoad %[[#vec4_float_32]] %[[#a2_ptr_f32]]
+  ; CHECK: %[[#a3_f32:]] = OpLoad %[[#vec4_float_32]] %[[#a3_ptr_f32]]
+  ; CHECK: %[[#b0_f32:]] = OpLoad %[[#vec4_float_32]] %[[#b0_ptr_f32]]
+  ; CHECK: %[[#b1_f32:]] = OpLoad %[[#vec4_float_32]] %[[#b1_ptr_f32]]
+  ; CHECK: %[[#b2_f32:]] = OpLoad %[[#vec4_float_32]] %[[#b2_ptr_f32]]
+  ; CHECK: %[[#b3_f32:]] = OpLoad %[[#vec4_float_32]] %[[#b3_ptr_f32]]
+  ; CHECK: OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#a0_f32]] %[[#b0_f32]]
+  ; CHECK: OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#a1_f32]] %[[#b1_f32]]
+  ; CHECK: OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#a2_f32]] %[[#b2_f32]]
+  ; CHECK: OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#a3_f32]] %[[#b3_f32]]
+  %a0 = getelementptr [4 x <4 x float>], ptr addrspace(10) %a, i32 0, i32 0
+  %a1 = getelementptr [4 x <4 x float>], ptr addrspace(10) %a, i32 0, i32 1
+  %a2 = getelementptr [4 x <4 x float>], ptr addrspace(10) %a, i32 0, i32 2
+  %a3 = getelementptr [4 x <4 x float>], ptr addrspace(10) %a, i32 0, i32 3
+  %b0 = getelementptr [4 x <4 x float>], ptr addrspace(10) %b, i32 0, i32 0
+  %b1 = getelementptr [4 x <4 x float>], ptr addrspace(10) %b, i32 0, i32 1
+  %b2 = getelementptr [4 x <4 x float>], ptr addrspace(10) %b, i32 0, i32 2
+  %b3 = getelementptr [4 x <4 x float>], ptr addrspace(10) %b, i32 0, i32 3
+  %va0 = load <4 x float>, ptr addrspace(10) %a0
+  %va1 = load <4 x float>, ptr addrspace(10) %a1
+  %va2 = load <4 x float>, ptr addrspace(10) %a2
+  %va3 = load <4 x float>, ptr addrspace(10) %a3
+  %vb0 = load <4 x float>, ptr addrspace(10) %b0
+  %vb1 = load <4 x float>, ptr addrspace(10) %b1
+  %vb2 = load <4 x float>, ptr addrspace(10) %b2
+  %vb3 = load <4 x float>, ptr addrspace(10) %b3
+  %r0 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %va0, <4 x float> %vb0)
+  %r1 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %va1, <4 x float> %vb1)
+  %r2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %va2, <4 x float> %vb2)
+  %r3 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %va3, <4 x float> %vb3)
+  %out0 = getelementptr [4 x <4 x float>], ptr addrspace(10) %out, i32 0, i32 0
+  %out1 = getelementptr [4 x <4 x float>], ptr addrspace(10) %out, i32 0, i32 1
+  %out2 = getelementptr [4 x <4 x float>], ptr addrspace(10) %out, i32 0, i32 2
+  %out3 = getelementptr [4 x <4 x float>], ptr addrspace(10) %out, i32 0, i32 3
+  store <4 x float> %r0, ptr addrspace(10) %out0
+  store <4 x float> %r1, ptr addrspace(10) %out1
+  store <4 x float> %r2, ptr addrspace(10) %out2
+  store <4 x float> %r3, ptr addrspace(10) %out3
+  ret void
+}
+
+define internal void @atan2_half4x4(ptr addrspace(10) %out, ptr addrspace(10) %a, ptr addrspace(10) %b) {
+entry:
+  ; CHECK: OpFunction %[[#void]] None %[[#fn_f16]]
+  ; CHECK: %[[#out_f16:]] = OpFunctionParameter %[[#ptr_arr_f16]]
+  ; CHECK: %[[#a_f16:]] = OpFunctionParameter %[[#ptr_arr_f16]]
+  ; CHECK: %[[#b_f16:]] = OpFunctionParameter %[[#ptr_arr_f16]]
+  ; CHECK: %[[#a0_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#a_f16]] %[[#const_0]]
+  ; CHECK: %[[#a1_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#a_f16]] %[[#const_1]]
+  ; CHECK: %[[#a2_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#a_f16]] %[[#const_2]]
+  ; CHECK: %[[#a3_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#a_f16]] %[[#const_3]]
+  ; CHECK: %[[#b0_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#b_f16]] %[[#const_0]]
+  ; CHECK: %[[#b1_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#b_f16]] %[[#const_1]]
+  ; CHECK: %[[#b2_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#b_f16]] %[[#const_2]]
+  ; CHECK: %[[#b3_ptr_f16:]] = OpAccessChain %[[#ptr_vec4_f16]] %[[#b_f16]] %[[#const_3]]
+  ; CHECK: %[[#a0_f16:]] = OpLoad %[[#vec4_float_16]] %[[#a0_ptr_f16]]
+  ; CHECK: %[[#a1_f16:]] = OpLoad %[[#vec4_float_16]] %[[#a1_ptr_f16]]
+  ; CHECK: %[[#a2_f16:]] = OpLoad %[[#vec4_float_16]] %[[#a2_ptr_f16]]
+  ; CHECK: %[[#a3_f16:]] = OpLoad %[[#vec4_float_16]] %[[#a3_ptr_f16]]
+  ; CHECK: %[[#b0_f16:]] = OpLoad %[[#vec4_float_16]] %[[#b0_ptr_f16]]
+  ; CHECK: %[[#b1_f16:]] = OpLoad %[[#vec4_float_16]] %[[#b1_ptr_f16]]
+  ; CHECK: %[[#b2_f16:]] = OpLoad %[[#vec4_float_16]] %[[#b2_ptr_f16]]
+  ; CHECK: %[[#b3_f16:]] = OpLoad %[[#vec4_float_16]] %[[#b3_ptr_f16]]
+  ; CHECK: OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#a0_f16]] %[[#b0_f16]]
+  ; CHECK: OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#a1_f16]] %[[#b1_f16]]
+  ; CHECK: OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#a2_f16]] %[[#b2_f16]]
+  ; CHECK: OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#a3_f16]] %[[#b3_f16]]
+  %a0 = getelementptr [4 x <4 x half>], ptr addrspace(10) %a, i32 0, i32 0
+  %a1 = getelementptr [4 x <4 x half>], ptr addrspace(10) %a, i32 0, i32 1
+  %a2 = getelementptr [4 x <4 x half>], ptr addrspace(10) %a, i32 0, i32 2
+  %a3 = getelementptr [4 x <4 x half>], ptr addrspace(10) %a, i32 0, i32 3
+  %b0 = getelementptr [4 x <4 x half>], ptr addrspace(10) %b, i32 0, i32 0
+  %b1 = getelementptr [4 x <4 x half>], ptr addrspace(10) %b, i32 0, i32 1
+  %b2 = getelementptr [4 x <4 x half>], ptr addrspace(10) %b, i32 0, i32 2
+  %b3 = getelementptr [4 x <4 x half>], ptr addrspace(10) %b, i32 0, i32 3
+  %va0 = load <4 x half>, ptr addrspace(10) %a0
+  %va1 = load <4 x half>, ptr addrspace(10) %a1
+  %va2 = load <4 x half>, ptr addrspace(10) %a2
+  %va3 = load <4 x half>, ptr addrspace(10) %a3
+  %vb0 = load <4 x half>, ptr addrspace(10) %b0
+  %vb1 = load <4 x half>, ptr addrspace(10) %b1
+  %vb2 = load <4 x half>, ptr addrspace(10) %b2
+  %vb3 = load <4 x half>, ptr addrspace(10) %b3
+  %r0 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %va0, <4 x half> %vb0)
+  %r1 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %va1, <4 x half> %vb1)
+  %r2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %va2, <4 x half> %vb2)
+  %r3 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %va3, <4 x half> %vb3)
+  %out0 = getelementptr [4 x <4 x half>], ptr addrspace(10) %out, i32 0, i32 0
+  %out1 = getelementptr [4 x <4 x half>], ptr addrspace(10) %out, i32 0, i32 1
+  %out2 = getelementptr [4 x <4 x half>], ptr addrspace(10) %out, i32 0, i32 2
+  %out3 = getelementptr [4 x <4 x half>], ptr addrspace(10) %out, i32 0, i32 3
+  store <4 x half> %r0, ptr addrspace(10) %out0
+  store <4 x half> %r1, ptr addrspace(10) %out1
+  store <4 x half> %r2, ptr addrspace(10) %out2
+  store <4 x half> %r3, ptr addrspace(10) %out3
+  ret void
+}
+
+declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>)
+declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>)

>From 70e57b6de10866d0368010b756a41460814478af Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 7 May 2026 18:11:22 -0700
Subject: [PATCH 16/20] fixing test

---
 llvm/test/CodeGen/DirectX/atan2.ll     | 64 ------------------------
 llvm/test/CodeGen/DirectX/atan2_mat.ll | 69 ++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 64 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/atan2_mat.ll

diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll
index 3a1a9d8fc80ac..8f51ab1b7a902 100644
--- a/llvm/test/CodeGen/DirectX/atan2.ll
+++ b/llvm/test/CodeGen/DirectX/atan2.ll
@@ -82,70 +82,6 @@ entry:
   ret <4 x float> %elt.atan2
 }
 
-define noundef <16 x half> @atan2_half4x4(<16 x half> noundef %y, <16 x half> noundef %x) {
-entry:
-; Just Expansion, no scalarization or lowering:
-; EXPCHECK: [[DIV:%.+]] = fdiv <16 x half> %y, %x
-; EXPCHECK: [[ATAN:%.+]] = call <16 x half> @llvm.atan.v16f16(<16 x half> [[DIV]])
-; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x half> [[ATAN]], splat (half 0xH4248)
-; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x half> [[ATAN]], splat (half 0xH4248)
-; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x half> %x, zeroinitializer
-; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x half> %x, zeroinitializer
-; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x half> %y, zeroinitializer
-; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <16 x half> %y, zeroinitializer
-; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_GE_0]]
-; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <16 x i1> [[XLT0_AND_YGE0]], <16 x half> [[ADD_PI]], <16 x half> [[ATAN]]
-; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_LT_0]]
-; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <16 x i1> [[XLT0_AND_YLT0]], <16 x half> [[SUB_PI]], <16 x half> [[SELECT_ADD_PI]]
-; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_LT_0]]
-; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <16 x i1> [[XEQ0_AND_YLT0]], <16 x half> splat (half 0xHBE48), <16 x half> [[SELECT_SUB_PI]]
-; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_GE_0]]
-; EXPCHECK: [[SELECT_HPI:%.+]] = select <16 x i1> [[XEQ0_AND_YGE0]], <16 x half> splat (half 0xH3E48), <16 x half> [[SELECT_NEGHPI]]
-; EXPCHECK: ret <16 x half> [[SELECT_HPI]]
-
-; Scalarization occurs after expansion, so atan scalarization is tested separately.
-; Expansion, scalarization and lowering:
-; Just make sure this expands to exactly 16 scalar DXIL atan (OpCode=17) calls.
-; DOPCHECK-COUNT-16: call half @dx.op.unary.f16(i32 17, half %{{.*}})
-; DOPCHECK-NOT: call half @dx.op.unary.f16(i32 17,
-
-  %elt.atan2 = call <16 x half> @llvm.atan2.v16f16(<16 x half> %y, <16 x half> %x)
-  ret <16 x half> %elt.atan2
-}
-
-define noundef <16 x float> @atan2_float4x4(<16 x float> noundef %y, <16 x float> noundef %x) {
-entry:
-; Just Expansion, no scalarization or lowering:
-; EXPCHECK: [[DIV:%.+]] = fdiv <16 x float> %y, %x
-; EXPCHECK: [[ATAN:%.+]] = call <16 x float> @llvm.atan.v16f32(<16 x float> [[DIV]])
-; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x float> [[ATAN]], splat (float 0x400921FB60000000)
-; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x float> [[ATAN]], splat (float 0x400921FB60000000)
-; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x float> %x, zeroinitializer
-; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x float> %x, zeroinitializer
-; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x float> %y, zeroinitializer
-; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <16 x float> %y, zeroinitializer
-; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_GE_0]]
-; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <16 x i1> [[XLT0_AND_YGE0]], <16 x float> [[ADD_PI]], <16 x float> [[ATAN]]
-; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_LT_0]]
-; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <16 x i1> [[XLT0_AND_YLT0]], <16 x float> [[SUB_PI]], <16 x float> [[SELECT_ADD_PI]]
-; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_LT_0]]
-; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <16 x i1> [[XEQ0_AND_YLT0]], <16 x float> splat (float 0xBFF921FB60000000), <16 x float> [[SELECT_SUB_PI]]
-; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_GE_0]]
-; EXPCHECK: [[SELECT_HPI:%.+]] = select <16 x i1> [[XEQ0_AND_YGE0]], <16 x float> splat (float 0x3FF921FB60000000), <16 x float> [[SELECT_NEGHPI]]
-; EXPCHECK: ret <16 x float> [[SELECT_HPI]]
-
-; Scalarization occurs after expansion, so atan scalarization is tested separately.
-; Expansion, scalarization and lowering:
-; Just make sure this expands to exactly 16 scalar DXIL atan (OpCode=17) calls.
-; DOPCHECK-COUNT-16: call float @dx.op.unary.f32(i32 17, float %{{.*}})
-; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17,
-
-  %elt.atan2 = call <16 x float> @llvm.atan2.v16f32(<16 x float> %y, <16 x float> %x)
-  ret <16 x float> %elt.atan2
-}
-
 declare half @llvm.atan2.f16(half, half)
 declare float @llvm.atan2.f32(float, float)
 declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>)
-declare <16 x float> @llvm.atan2.v16f32(<16 x float>, <16 x float>)
-declare <16 x half> @llvm.atan2.v16f16(<16 x half>, <16 x half>)
diff --git a/llvm/test/CodeGen/DirectX/atan2_mat.ll b/llvm/test/CodeGen/DirectX/atan2_mat.ll
new file mode 100644
index 0000000000000..c47fd781b2ede
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/atan2_mat.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=EXPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=DOPCHECK
+
+; Make sure the correct DXIL expansions for atan2 are generated for 4x4 float and half matrices (represented here as <16 x T> vectors).
+
+define noundef <16 x half> @atan2_half4x4(<16 x half> noundef %y, <16 x half> noundef %x) {
+entry:
+; Just Expansion, no scalarization or lowering:
+; EXPCHECK: [[DIV:%.+]] = fdiv <16 x half> %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call <16 x half> @llvm.atan.v16f16(<16 x half> [[DIV]])
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x half> [[ATAN]], splat (half 0xH4248)
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x half> [[ATAN]], splat (half 0xH4248)
+; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x half> %x, zeroinitializer
+; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x half> %x, zeroinitializer
+; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x half> %y, zeroinitializer
+; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <16 x half> %y, zeroinitializer
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <16 x i1> [[XLT0_AND_YGE0]], <16 x half> [[ADD_PI]], <16 x half> [[ATAN]]
+; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <16 x i1> [[XLT0_AND_YLT0]], <16 x half> [[SUB_PI]], <16 x half> [[SELECT_ADD_PI]]
+; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <16 x i1> [[XEQ0_AND_YLT0]], <16 x half> splat (half 0xHBE48), <16 x half> [[SELECT_SUB_PI]]
+; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_HPI:%.+]] = select <16 x i1> [[XEQ0_AND_YGE0]], <16 x half> splat (half 0xH3E48), <16 x half> [[SELECT_NEGHPI]]
+; EXPCHECK: ret <16 x half> [[SELECT_HPI]]
+
+; Scalarization occurs after expansion, so atan scalarization is tested separately.
+; Expansion, scalarization and lowering:
+; Just make sure this expands to exactly 16 scalar DXIL atan (OpCode=17) calls.
+; DOPCHECK-COUNT-16: call half @dx.op.unary.f16(i32 17, half %{{.*}})
+; DOPCHECK-NOT: call half @dx.op.unary.f16(i32 17,
+
+  %elt.atan2 = call <16 x half> @llvm.atan2.v16f16(<16 x half> %y, <16 x half> %x)
+  ret <16 x half> %elt.atan2
+}
+
+define noundef <16 x float> @atan2_float4x4(<16 x float> noundef %y, <16 x float> noundef %x) {
+entry:
+; Just Expansion, no scalarization or lowering:
+; EXPCHECK: [[DIV:%.+]] = fdiv <16 x float> %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call <16 x float> @llvm.atan.v16f32(<16 x float> [[DIV]])
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x float> [[ATAN]], splat (float 0x400921FB60000000)
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x float> [[ATAN]], splat (float 0x400921FB60000000)
+; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x float> %y, zeroinitializer
+; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <16 x float> %y, zeroinitializer
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <16 x i1> [[XLT0_AND_YGE0]], <16 x float> [[ADD_PI]], <16 x float> [[ATAN]]
+; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <16 x i1> [[X_LT_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <16 x i1> [[XLT0_AND_YLT0]], <16 x float> [[SUB_PI]], <16 x float> [[SELECT_ADD_PI]]
+; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <16 x i1> [[XEQ0_AND_YLT0]], <16 x float> splat (float 0xBFF921FB60000000), <16 x float> [[SELECT_SUB_PI]]
+; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <16 x i1> [[X_EQ_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_HPI:%.+]] = select <16 x i1> [[XEQ0_AND_YGE0]], <16 x float> splat (float 0x3FF921FB60000000), <16 x float> [[SELECT_NEGHPI]]
+; EXPCHECK: ret <16 x float> [[SELECT_HPI]]
+
+; Scalarization occurs after expansion, so atan scalarization is tested separately.
+; Expansion, scalarization and lowering:
+; Just make sure this expands to exactly 16 scalar DXIL atan (OpCode=17) calls.
+; DOPCHECK-COUNT-16: call float @dx.op.unary.f32(i32 17, float %{{.*}})
+; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17,
+
+  %elt.atan2 = call <16 x float> @llvm.atan2.v16f32(<16 x float> %y, <16 x float> %x)
+  ret <16 x float> %elt.atan2
+}
+
+declare <16 x float> @llvm.atan2.v16f32(<16 x float>, <16 x float>)
+declare <16 x half> @llvm.atan2.v16f16(<16 x half>, <16 x half>)

>From a095c36ced3c269248e1022b7f6673c1fb6f6430 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 7 May 2026 18:48:12 -0700
Subject: [PATCH 17/20] clean up

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  | 402 +-----------------
 .../binary-compat-overload-warnings.hlsl      |  10 +-
 2 files changed, 25 insertions(+), 387 deletions(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index 08af61bed7b9a..12223aee6a4e7 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -231,403 +231,41 @@ namespace hlsl {
   }
 
 #define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float1x1 fn(double1x1 y, double1x1 x) {                            \
-    return fn((float1x1)y, (float1x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float1x2 fn(double1x2 y, double1x2 x) {                            \
-    return fn((float1x2)y, (float1x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float1x3 fn(double1x3 y, double1x3 x) {                            \
-    return fn((float1x3)y, (float1x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float1x4 fn(double1x4 y, double1x4 x) {                            \
-    return fn((float1x4)y, (float1x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float2x1 fn(double2x1 y, double2x1 x) {                            \
-    return fn((float2x1)y, (float2x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float2x2 fn(double2x2 y, double2x2 x) {                            \
-    return fn((float2x2)y, (float2x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float2x3 fn(double2x3 y, double2x3 x) {                            \
-    return fn((float2x3)y, (float2x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float2x4 fn(double2x4 y, double2x4 x) {                            \
-    return fn((float2x4)y, (float2x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float3x1 fn(double3x1 y, double3x1 x) {                            \
-    return fn((float3x1)y, (float3x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float3x2 fn(double3x2 y, double3x2 x) {                            \
-    return fn((float3x2)y, (float3x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float3x3 fn(double3x3 y, double3x3 x) {                            \
-    return fn((float3x3)y, (float3x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float3x4 fn(double3x4 y, double3x4 x) {                            \
-    return fn((float3x4)y, (float3x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float4x1 fn(double4x1 y, double4x1 x) {                            \
-    return fn((float4x1)y, (float4x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float4x2 fn(double4x2 y, double4x2 x) {                            \
-    return fn((float4x2)y, (float4x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float4x3 fn(double4x3 y, double4x3 x) {                            \
-    return fn((float4x3)y, (float4x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr float4x4 fn(double4x4 y, double4x4 x) {                            \
-    return fn((float4x4)y, (float4x4)x);                                       \
+  constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
+                                   matrix<double, R, C> x) {                   \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }
 
 #define _DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(fn)                        \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x1 fn(int1x1 y, int1x1 x) {                                  \
-    return fn((float1x1)y, (float1x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x2 fn(int1x2 y, int1x2 x) {                                  \
-    return fn((float1x2)y, (float1x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x3 fn(int1x3 y, int1x3 x) {                                  \
-    return fn((float1x3)y, (float1x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x4 fn(int1x4 y, int1x4 x) {                                  \
-    return fn((float1x4)y, (float1x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x1 fn(int2x1 y, int2x1 x) {                                  \
-    return fn((float2x1)y, (float2x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x2 fn(int2x2 y, int2x2 x) {                                  \
-    return fn((float2x2)y, (float2x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x3 fn(int2x3 y, int2x3 x) {                                  \
-    return fn((float2x3)y, (float2x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x4 fn(int2x4 y, int2x4 x) {                                  \
-    return fn((float2x4)y, (float2x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x1 fn(int3x1 y, int3x1 x) {                                  \
-    return fn((float3x1)y, (float3x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x2 fn(int3x2 y, int3x2 x) {                                  \
-    return fn((float3x2)y, (float3x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x3 fn(int3x3 y, int3x3 x) {                                  \
-    return fn((float3x3)y, (float3x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x4 fn(int3x4 y, int3x4 x) {                                  \
-    return fn((float3x4)y, (float3x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x1 fn(int4x1 y, int4x1 x) {                                  \
-    return fn((float4x1)y, (float4x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x2 fn(int4x2 y, int4x2 x) {                                  \
-    return fn((float4x2)y, (float4x2)x);                                       \
+  constexpr matrix<float, R, C> fn(matrix<int, R, C> y, matrix<int, R, C> x) { \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }                                                                            \
                                                                                \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x3 fn(int4x3 y, int4x3 x) {                                  \
-    return fn((float4x3)y, (float4x3)x);                                       \
+  constexpr matrix<float, R, C> fn(matrix<uint, R, C> y,                       \
+                                   matrix<uint, R, C> x) {                     \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }                                                                            \
                                                                                \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x4 fn(int4x4 y, int4x4 x) {                                  \
-    return fn((float4x4)y, (float4x4)x);                                       \
-  }                                                                            \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x1 fn(uint1x1 y, uint1x1 x) {                                \
-    return fn((float1x1)y, (float1x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x2 fn(uint1x2 y, uint1x2 x) {                                \
-    return fn((float1x2)y, (float1x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x3 fn(uint1x3 y, uint1x3 x) {                                \
-    return fn((float1x3)y, (float1x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x4 fn(uint1x4 y, uint1x4 x) {                                \
-    return fn((float1x4)y, (float1x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x1 fn(uint2x1 y, uint2x1 x) {                                \
-    return fn((float2x1)y, (float2x1)x);                                       \
+  constexpr matrix<float, R, C> fn(matrix<int64_t, R, C> y,                    \
+                                   matrix<int64_t, R, C> x) {                  \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }                                                                            \
                                                                                \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x2 fn(uint2x2 y, uint2x2 x) {                                \
-    return fn((float2x2)y, (float2x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x3 fn(uint2x3 y, uint2x3 x) {                                \
-    return fn((float2x3)y, (float2x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x4 fn(uint2x4 y, uint2x4 x) {                                \
-    return fn((float2x4)y, (float2x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x1 fn(uint3x1 y, uint3x1 x) {                                \
-    return fn((float3x1)y, (float3x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x2 fn(uint3x2 y, uint3x2 x) {                                \
-    return fn((float3x2)y, (float3x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x3 fn(uint3x3 y, uint3x3 x) {                                \
-    return fn((float3x3)y, (float3x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x4 fn(uint3x4 y, uint3x4 x) {                                \
-    return fn((float3x4)y, (float3x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x1 fn(uint4x1 y, uint4x1 x) {                                \
-    return fn((float4x1)y, (float4x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x2 fn(uint4x2 y, uint4x2 x) {                                \
-    return fn((float4x2)y, (float4x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x3 fn(uint4x3 y, uint4x3 x) {                                \
-    return fn((float4x3)y, (float4x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x4 fn(uint4x4 y, uint4x4 x) {                                \
-    return fn((float4x4)y, (float4x4)x);                                       \
-  }                                                                            \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x1 fn(int64_t1x1 y, int64_t1x1 x) {                          \
-    return fn((float1x1)y, (float1x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x2 fn(int64_t1x2 y, int64_t1x2 x) {                          \
-    return fn((float1x2)y, (float1x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x3 fn(int64_t1x3 y, int64_t1x3 x) {                          \
-    return fn((float1x3)y, (float1x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x4 fn(int64_t1x4 y, int64_t1x4 x) {                          \
-    return fn((float1x4)y, (float1x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x1 fn(int64_t2x1 y, int64_t2x1 x) {                          \
-    return fn((float2x1)y, (float2x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x2 fn(int64_t2x2 y, int64_t2x2 x) {                          \
-    return fn((float2x2)y, (float2x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x3 fn(int64_t2x3 y, int64_t2x3 x) {                          \
-    return fn((float2x3)y, (float2x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x4 fn(int64_t2x4 y, int64_t2x4 x) {                          \
-    return fn((float2x4)y, (float2x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x1 fn(int64_t3x1 y, int64_t3x1 x) {                          \
-    return fn((float3x1)y, (float3x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x2 fn(int64_t3x2 y, int64_t3x2 x) {                          \
-    return fn((float3x2)y, (float3x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x3 fn(int64_t3x3 y, int64_t3x3 x) {                          \
-    return fn((float3x3)y, (float3x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x4 fn(int64_t3x4 y, int64_t3x4 x) {                          \
-    return fn((float3x4)y, (float3x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x1 fn(int64_t4x1 y, int64_t4x1 x) {                          \
-    return fn((float4x1)y, (float4x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x2 fn(int64_t4x2 y, int64_t4x2 x) {                          \
-    return fn((float4x2)y, (float4x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x3 fn(int64_t4x3 y, int64_t4x3 x) {                          \
-    return fn((float4x3)y, (float4x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x4 fn(int64_t4x4 y, int64_t4x4 x) {                          \
-    return fn((float4x4)y, (float4x4)x);                                       \
-  }                                                                            \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x1 fn(uint64_t1x1 y, uint64_t1x1 x) {                        \
-    return fn((float1x1)y, (float1x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x2 fn(uint64_t1x2 y, uint64_t1x2 x) {                        \
-    return fn((float1x2)y, (float1x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x3 fn(uint64_t1x3 y, uint64_t1x3 x) {                        \
-    return fn((float1x3)y, (float1x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float1x4 fn(uint64_t1x4 y, uint64_t1x4 x) {                        \
-    return fn((float1x4)y, (float1x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x1 fn(uint64_t2x1 y, uint64_t2x1 x) {                        \
-    return fn((float2x1)y, (float2x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x2 fn(uint64_t2x2 y, uint64_t2x2 x) {                        \
-    return fn((float2x2)y, (float2x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x3 fn(uint64_t2x3 y, uint64_t2x3 x) {                        \
-    return fn((float2x3)y, (float2x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float2x4 fn(uint64_t2x4 y, uint64_t2x4 x) {                        \
-    return fn((float2x4)y, (float2x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x1 fn(uint64_t3x1 y, uint64_t3x1 x) {                        \
-    return fn((float3x1)y, (float3x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x2 fn(uint64_t3x2 y, uint64_t3x2 x) {                        \
-    return fn((float3x2)y, (float3x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x3 fn(uint64_t3x3 y, uint64_t3x3 x) {                        \
-    return fn((float3x3)y, (float3x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float3x4 fn(uint64_t3x4 y, uint64_t3x4 x) {                        \
-    return fn((float3x4)y, (float3x4)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x1 fn(uint64_t4x1 y, uint64_t4x1 x) {                        \
-    return fn((float4x1)y, (float4x1)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x2 fn(uint64_t4x2 y, uint64_t4x2 x) {                        \
-    return fn((float4x2)y, (float4x2)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x3 fn(uint64_t4x3 y, uint64_t4x3 x) {                        \
-    return fn((float4x3)y, (float4x3)x);                                       \
-  }                                                                            \
-                                                                               \
-  _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr float4x4 fn(uint64_t4x4 y, uint64_t4x4 x) {                        \
-    return fn((float4x4)y, (float4x4)x);                                       \
+  constexpr matrix<float, R, C> fn(matrix<uint64_t, R, C> y,                   \
+                                   matrix<uint64_t, R, C> x) {                 \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }
+
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
index fada02f4b7c1a..7b93ea089d854 100644
--- a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
@@ -24,7 +24,7 @@ float4 test_binary_double4(double4 p0) {
 }
 
 float4x4 test_binary_double4x4(double4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x 64 bit API lowering for [[FUNC]] is deprecated. Explicitly cast parameters to 32 or 16 bit types.
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x 64 bit API lowering for [[FUNC]] is deprecated. Explicitly cast parameters to 32 or 16 bit types.
   return FUNC(p0, p0);
 }
 
@@ -51,21 +51,21 @@ float test_binary_int(uint64_t p0) {
 }
 
 float4x4 test_binary_uint4x4(uint4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
 
 float4x4 test_binary_int4x4(int4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
 
 float4x4 test_binary_int64_t4x4(int64_t4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
 
 float4x4 test_binary_uint64_t4x4(uint64_t4x4 p0) {
-  // ATAN2: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }

>From 0637d22c8f6602660bf54ef2d5edce8d049ba42e Mon Sep 17 00:00:00 2001
From: joaosaffran <joaosaffran at gmail.com>
Date: Thu, 7 May 2026 18:49:31 -0700
Subject: [PATCH 18/20] Apply suggestion from @farzonl

Co-authored-by: Farzon Lotfi <farzonl at gmail.com>
---
 clang/lib/Sema/SemaHLSL.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index bb996d291675e..79ba0de8e9ae1 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3233,10 +3233,13 @@ static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
                                            int ArgOrdinal,
                                            clang::QualType PassedType) {
   clang::QualType BaseType = PassedType;
-  if (PassedType->isVectorType())
-    BaseType = PassedType->castAs<clang::VectorType>()->getElementType();
-  else if (PassedType->isMatrixType())
-    BaseType = PassedType->castAs<clang::MatrixType>()->getElementType();
+  if (const auto *VT = PassedType->getAs<clang::VectorType>()) {
+      BaseType = VT->getElementType();
+  } else {
+      const auto *MT = PassedType->getAs<clang::MatrixType>()
+      assert(MT && "expected to be Vector or MatrixType");
+      BaseType = MT->getElementType();
+  }
 
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)

>From fac7e0fc5cde502e144484443a4f8af6f9150184 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 7 May 2026 19:03:41 -0700
Subject: [PATCH 19/20] fix

---
 clang/lib/Sema/SemaHLSL.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 79ba0de8e9ae1..0225645887a52 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3235,10 +3235,8 @@ static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
   clang::QualType BaseType = PassedType;
   if (const auto *VT = PassedType->getAs<clang::VectorType>()) {
       BaseType = VT->getElementType();
-  } else {
-      const auto *MT = PassedType->getAs<clang::MatrixType>()
-      assert(MT && "expected to be Vector or MatrixType");
-      BaseType = MT->getElementType();
+  } else if (const auto *MT = PassedType->getAs<clang::MatrixType>()) {
+    BaseType = MT->getElementType();
   }
 
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())

>From 80852d3af1053d5d20f3397f3ba208b77b407125 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Fri, 8 May 2026 11:16:05 -0700
Subject: [PATCH 20/20] fix test.. hopefully

---
 llvm/test/CodeGen/DirectX/atan2_mat.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/DirectX/atan2_mat.ll b/llvm/test/CodeGen/DirectX/atan2_mat.ll
index c47fd781b2ede..df37a5c5fa706 100644
--- a/llvm/test/CodeGen/DirectX/atan2_mat.ll
+++ b/llvm/test/CodeGen/DirectX/atan2_mat.ll
@@ -8,8 +8,8 @@ entry:
 ; Just Expansion, no scalarization or lowering:
 ; EXPCHECK: [[DIV:%.+]] = fdiv <16 x half> %y, %x
 ; EXPCHECK: [[ATAN:%.+]] = call <16 x half> @llvm.atan.v16f16(<16 x half> [[DIV]])
-; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x half> [[ATAN]], splat (half 0xH4248)
-; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x half> [[ATAN]], splat (half 0xH4248)
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <16 x half> [[ATAN]], splat (half
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <16 x half> [[ATAN]], splat (half
 ; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <16 x half> %x, zeroinitializer
 ; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <16 x half> %x, zeroinitializer
 ; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <16 x half> %y, zeroinitializer