[clang] [HLSL] Add matrix support to atan2 (PR #194984)

Fri May 1 11:35:39 PDT 2026

https://github.com/joaosaffran updated https://github.com/llvm/llvm-project/pull/194984

>From babda1fba31e2b04f4b7934e98796ddb9fb4bb02 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Tue, 28 Apr 2026 15:07:52 -0700
Subject: [PATCH 01/12] make it support float

---
 clang/lib/Sema/SemaHLSL.cpp                | 10 ++++----
 clang/test/CodeGenHLSL/builtins/atan2.hlsl | 27 ++++++++++++++++++++++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index aba1c5072a5fc..bb996d291675e 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3232,10 +3232,12 @@ static bool CheckFloatRepresentation(Sema *S, SourceLocation Loc,
 static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
                                            int ArgOrdinal,
                                            clang::QualType PassedType) {
-  clang::QualType BaseType =
-      PassedType->isVectorType()
-          ? PassedType->castAs<clang::VectorType>()->getElementType()
-          : PassedType;
+  clang::QualType BaseType = PassedType;
+  if (PassedType->isVectorType())
+    BaseType = PassedType->castAs<clang::VectorType>()->getElementType();
+  else if (PassedType->isMatrixType())
+    BaseType = PassedType->castAs<clang::MatrixType>()->getElementType();
+
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)
            << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0
diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
index 512b44a5780db..986ddc75b4f8e 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
@@ -34,6 +34,20 @@ half4 test_atan2_half4 (half4 p0, half4 p1) {
   return atan2(p0, p1);
 }
 
+// CHECK-LABEL: test_atan2_half4x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <16 x half> @llvm.atan2.v16f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+half2x3 test_atan2_half2x3 (half2x3 p0, half2x3 p1) {
+  return atan2(p0, p1);
+}
+
 // CHECK-LABEL: test_atan2_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.atan2.f32
 float test_atan2_float (float p0, float p1) {
@@ -57,3 +71,16 @@ float3 test_atan2_float3 (float3 p0, float3 p1) {
 float4 test_atan2_float4 (float4 p0, float4 p1) {
   return atan2(p0, p1);
 }
+
+// CHECK-LABEL: test_atan2_float4x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+float4x4 test_atan2_float4x4 (float4x4 p0, float4x4 p1) {
+  return atan2(p0, p1);
+}
+
+
+// CHECK-LABEL: test_atan2_float2x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+float2x3 test_atan2_float2x3 (float2x3 p0, float2x3 p1) {
+  return atan2(p0, p1);
+}

>From 43586d11f49f8a945fe7f4aa690d1e7eaff450f0 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Tue, 28 Apr 2026 18:01:58 -0700
Subject: [PATCH 02/12] adding matrix impl

---
 clang/lib/Headers/hlsl/hlsl_intrinsics.h             |  6 ++++++
 clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl | 10 ++++++++++
 2 files changed, 16 insertions(+)

diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index cced7b0eabb1f..eb2ebe485b7de 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -309,5 +309,11 @@ constexpr matrix<T, R, C> mul(matrix<T, R, C> x, T y) {
   return x * y;
 }
 
+template <typename T, int R, int C>
+constexpr matrix<float, R, C> atan2(matrix<T, R, C> y, matrix<T, R, C> x) {
+  return __builtin_elementwise_atan2((matrix<float, R, C>)y,
+                                     (matrix<float, R, C>)x);
+}
+
 } // namespace hlsl
 #endif //_HLSL_HLSL_INTRINSICS_H_
diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index 85ff75110a78e..0779ab2d13d7e 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -181,3 +181,13 @@ float3 test_atan2_uint64_t3 (uint64_t3 p0, uint64_t3 p1) {
 float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
+
+
+// CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) {
+  return atan2(p0, p1);
+}

>From c4a5db93bdc574b7a1f9f1113413537884106bb8 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Wed, 29 Apr 2026 16:35:41 -0700
Subject: [PATCH 03/12] add tests and overloads

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  |  7 +++++++
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  6 ------
 clang/lib/Sema/SemaHLSL.cpp                   |  4 ++--
 .../CodeGenHLSL/builtins/atan2-overloads.hlsl | 19 ++++++++++++++++++-
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index ee243abef6a41..e916228f1cc11 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -230,6 +230,12 @@ namespace hlsl {
     return fn((float4)V1, (float4)V2, (float4)V3);                             \
   }
 
+#define _DXC_COMPAT_BINARY_MATRIX_OVERLOADS(fn, ty)                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  template <typename T, int R, int C>                                          \
+  constexpr matrix<ty, R, C> fn(matrix<T, R, C> y, matrix<T, R, C> x) {        \
+    return fn((matrix<ty, R, C>)y, (matrix<ty, R, C>)x);                       \
+  }
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//
@@ -257,6 +263,7 @@ _DXC_COMPAT_UNARY_INTEGER_OVERLOADS(atan)
 
 _DXC_COMPAT_BINARY_DOUBLE_OVERLOADS(atan2)
 _DXC_COMPAT_BINARY_INTEGER_OVERLOADS(atan2)
+_DXC_COMPAT_BINARY_MATRIX_OVERLOADS(atan2, float)
 
 //===----------------------------------------------------------------------===//
 // ceil builtins overloads
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index eb2ebe485b7de..cced7b0eabb1f 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -309,11 +309,5 @@ constexpr matrix<T, R, C> mul(matrix<T, R, C> x, T y) {
   return x * y;
 }
 
-template <typename T, int R, int C>
-constexpr matrix<float, R, C> atan2(matrix<T, R, C> y, matrix<T, R, C> x) {
-  return __builtin_elementwise_atan2((matrix<float, R, C>)y,
-                                     (matrix<float, R, C>)x);
-}
-
 } // namespace hlsl
 #endif //_HLSL_HLSL_INTRINSICS_H_
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index bb996d291675e..368a54b44c2d5 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3240,8 +3240,8 @@ static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
 
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)
-           << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0
-           << /* half or float */ 2 << PassedType;
+           << ArgOrdinal << /* scalar, vector or matrix of */ 5
+           << /* no int */ 0 << /* half or float */ 2 << PassedType;
   return false;
 }
 
diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index 0779ab2d13d7e..d36d2aebcf4c8 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -38,6 +38,15 @@ float4 test_atan2_double4 (double4 p0, double4 p1) {
   return atan2(p0, p1);
 }
 
+// CHECK: define [[FNATTRS]] <16 x float> @_Z20test_atan2_double4x4u11matrix_typeILj4ELj4EdES_(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_double4x4 (double4x4 p0, double4x4 p1) {
+  return atan2(p0, p1);
+}
+
 // CHECK: define [[FNATTRS]] float @_Z14test_atan2_intii(
 // CHECK:    [[CONVI:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK:    [[CONV1I:%.*]] = sitofp i32 %{{.*}} to float
@@ -182,7 +191,6 @@ float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
 
-
 // CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
 // CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
 // CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
@@ -191,3 +199,12 @@ float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
 float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) {
   return atan2(p0, p1);
 }
+
+// CHECK: define [[FNATTRS]] <16 x float> @_Z22test_atan2_uint64_t4x4u11matrix_typeILj4ELj4EmES_(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint64_t4x4 (uint64_t4x4 p0, uint64_t4x4 p1) {
+  return atan2(p0, p1);
+}

>From 7ccd15fcae5d9a517a9a154944503de1ed42f5e9 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Wed, 29 Apr 2026 16:55:02 -0700
Subject: [PATCH 04/12] make macro match other's

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  | 39 ++++++++++++++++---
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index e916228f1cc11..0c4963b495b51 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -230,11 +230,39 @@ namespace hlsl {
     return fn((float4)V1, (float4)V2, (float4)V3);                             \
   }
 
-#define _DXC_COMPAT_BINARY_MATRIX_OVERLOADS(fn, ty)                            \
+#define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
+  template <uint R, uint C>                                                    \
+  constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
+                                   matrix<double, R, C> x) {                   \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }
+
+#define _DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(fn)                        \
+  template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr matrix<float, R, C> fn(matrix<int, R, C> y, matrix<int, R, C> x) { \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }                                                                            \
+                                                                               \
+  template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr matrix<float, R, C> fn(matrix<uint, R, C> y,                       \
+                                   matrix<uint, R, C> x) {                     \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }                                                                            \
+                                                                               \
+  template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr matrix<float, R, C> fn(matrix<int64_t, R, C> y,                    \
+                                   matrix<int64_t, R, C> x) {                  \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  }                                                                            \
+                                                                               \
+  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  template <typename T, int R, int C>                                          \
-  constexpr matrix<ty, R, C> fn(matrix<T, R, C> y, matrix<T, R, C> x) {        \
-    return fn((matrix<ty, R, C>)y, (matrix<ty, R, C>)x);                       \
+  constexpr matrix<float, R, C> fn(matrix<uint64_t, R, C> y,                   \
+                                   matrix<uint64_t, R, C> x) {                 \
+    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
@@ -263,7 +291,8 @@ _DXC_COMPAT_UNARY_INTEGER_OVERLOADS(atan)
 
 _DXC_COMPAT_BINARY_DOUBLE_OVERLOADS(atan2)
 _DXC_COMPAT_BINARY_INTEGER_OVERLOADS(atan2)
-_DXC_COMPAT_BINARY_MATRIX_OVERLOADS(atan2, float)
+_DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(atan2)
+_DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(atan2)
 
 //===----------------------------------------------------------------------===//
 // ceil builtins overloads

>From fb3072ab020b058f6f21646aa3dd6af2f86c1803 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Wed, 29 Apr 2026 17:02:28 -0700
Subject: [PATCH 05/12] add more tests

---
 .../CodeGenHLSL/builtins/atan2-overloads.hlsl  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
index d36d2aebcf4c8..6ecad28850b00 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads.hlsl
@@ -191,6 +191,24 @@ float4 test_atan2_uint64_t4 (uint64_t4 p0, uint64_t4 p1) {
   return atan2(p0, p1);
 }
 
+// CHECK: define [[FNATTRS]] <16 x float> @_Z19test_atan2_int_t4x4u11matrix_typeILj4ELj4EiES_(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int_t4x4 (int4x4 p0, int4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_Z20test_atan2_uint_t4x4u11matrix_typeILj4ELj4EjES_(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint_t4x4 (uint4x4 p0, uint4x4 p1) {
+  return atan2(p0, p1);
+}
+
 // CHECK: define [[FNATTRS]] <16 x float> @_Z21test_atan2_int64_t4x4u11matrix_typeILj4ELj4ElES_(
 // CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
 // CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>

>From fe8cbb5342b9c8db8bd0f858ec533f115863057c Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 11:06:45 -0700
Subject: [PATCH 06/12] adding sema tests

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  |  1 +
 .../binary-compat-overload-warnings.hlsl      | 27 ++++++++++++++++++-
 .../BuiltIns/half-float-only-errors2.hlsl     |  6 +++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index 0c4963b495b51..cddd940d1083e 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -232,6 +232,7 @@ namespace hlsl {
 
 #define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
   template <uint R, uint C>                                                    \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
   constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
                                    matrix<double, R, C> x) {                   \
     return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
diff --git a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
index 27bb683825de8..7b93ea089d854 100644
--- a/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/binary-compat-overload-warnings.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=atan2 %s 2>&1 | FileCheck %s -DFUNC=atan2
+// RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=atan2 %s 2>&1 | FileCheck %s -DFUNC=atan2 --check-prefixes=CHECK,ATAN2
 // RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=pow %s 2>&1 | FileCheck %s -DFUNC=pow
 // RUN: %clang_cc1 -finclude-default-header -triple dxilv1.0-unknown-shadermodel6.0-compute -std=hlsl202x -emit-llvm-only -disable-llvm-passes -DFUNC=step %s 2>&1 | FileCheck %s -DFUNC=step
 
@@ -23,6 +23,11 @@ float4 test_binary_double4(double4 p0) {
   return FUNC(p0, p0);
 }
 
+float4x4 test_binary_double4x4(double4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x 64 bit API lowering for [[FUNC]] is deprecated. Explicitly cast parameters to 32 or 16 bit types.
+  return FUNC(p0, p0);
+}
+
 // binary integer overloads
 // only test scalar ones for brevity
 float test_binary_int(int p0) {
@@ -44,3 +49,23 @@ float test_binary_int(uint64_t p0) {
   // CHECK: warning: '[[FUNC]]' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
   return FUNC(p0, p0);
 }
+
+float4x4 test_binary_uint4x4(uint4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
+
+float4x4 test_binary_int4x4(int4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
+
+float4x4 test_binary_int64_t4x4(int64_t4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
+
+float4x4 test_binary_uint64_t4x4(uint64_t4x4 p0) {
+  // ATAN2: warning: '[[FUNC]]<4U, 4U>' is deprecated: In 202x int lowering for [[FUNC]] is deprecated. Explicitly cast parameters to float types.
+  return FUNC(p0, p0);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
index 9e10e1afa9385..d7a2f15f6baa8 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
@@ -11,3 +11,9 @@ double2 test_vec_double_builtin(double2 p0, double2 p1) {
     return TEST_FUNC(p0, p1);
   // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2' (aka 'vector<double, 2>'))}}
 }
+
+// Temporary matrix workarround until we have proper matrix support in the builtins.
+double2x2 test_vec_double_builtin(double2x2 p0, double2x2 p1) {
+    return __builtin_elementwise_atan2(p0, p1);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2x2' (aka 'matrix<double, 2, 2>'))}}
+}

>From 93a09ea2a85848ce27a56591f12a6749f408ad13 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 13:51:01 -0700
Subject: [PATCH 07/12] add matrix overloads

---
 clang/include/clang/Basic/HLSLIntrinsics.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/HLSLIntrinsics.td b/clang/include/clang/Basic/HLSLIntrinsics.td
index 144b27cab7398..cc1e30d684d3e 100644
--- a/clang/include/clang/Basic/HLSLIntrinsics.td
+++ b/clang/include/clang/Basic/HLSLIntrinsics.td
@@ -457,7 +457,7 @@ determine the correct quadrant.
 \param x The x-coordinate.
 }];
   let VaryingTypes = [HalfTy, FloatTy];
-  let VaryingMatDims = [];
+  let VaryingMatDims = AllMatDims;
 }
 
 // Returns the smallest integer value that is greater than or equal to the

>From e0ba50390bf11d79eda7e62cd99d4df18da50dec Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 14:12:15 -0700
Subject: [PATCH 08/12] address comments

---
 clang/include/clang/Basic/HLSLIntrinsics.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/include/clang/Basic/HLSLIntrinsics.td b/clang/include/clang/Basic/HLSLIntrinsics.td
index cc1e30d684d3e..6084a6f92180e 100644
--- a/clang/include/clang/Basic/HLSLIntrinsics.td
+++ b/clang/include/clang/Basic/HLSLIntrinsics.td
@@ -457,7 +457,6 @@ determine the correct quadrant.
 \param x The x-coordinate.
 }];
   let VaryingTypes = [HalfTy, FloatTy];
-  let VaryingMatDims = AllMatDims;
 }
 
 // Returns the smallest integer value that is greater than or equal to the

>From 7b10499001800a74940fd3eff0c5f7a8b7da24bc Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 15:13:57 -0700
Subject: [PATCH 09/12] add _mat tests

---
 clang/test/CodeGenHLSL/builtins/atan2.hlsl    |  27 ---
 .../test/CodeGenHLSL/builtins/atan2_mat.hlsl  | 215 ++++++++++++++++++
 .../test/SemaHLSL/BuiltIns/atan2-errors.hlsl  |   7 +
 .../BuiltIns/half-float-only-errors2.hlsl     |   6 -
 4 files changed, 222 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl

diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
index 986ddc75b4f8e..512b44a5780db 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
@@ -34,20 +34,6 @@ half4 test_atan2_half4 (half4 p0, half4 p1) {
   return atan2(p0, p1);
 }
 
-// CHECK-LABEL: test_atan2_half4x4
-// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <16 x half> @llvm.atan2.v16f16
-// NO_HALF: call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
-half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
-  return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_half2x3
-// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
-// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
-half2x3 test_atan2_half2x3 (half2x3 p0, half2x3 p1) {
-  return atan2(p0, p1);
-}
-
 // CHECK-LABEL: test_atan2_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.atan2.f32
 float test_atan2_float (float p0, float p1) {
@@ -71,16 +57,3 @@ float3 test_atan2_float3 (float3 p0, float3 p1) {
 float4 test_atan2_float4 (float4 p0, float4 p1) {
   return atan2(p0, p1);
 }
-
-// CHECK-LABEL: test_atan2_float4x4
-// CHECK:  call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
-float4x4 test_atan2_float4x4 (float4x4 p0, float4x4 p1) {
-  return atan2(p0, p1);
-}
-
-
-// CHECK-LABEL: test_atan2_float2x3
-// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
-float2x3 test_atan2_float2x3 (float2x3 p0, float2x3 p1) {
-  return atan2(p0, p1);
-}
diff --git a/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl b/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
new file mode 100644
index 0000000000000..db9439edfcc11
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/atan2_mat.hlsl
@@ -0,0 +1,215 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// CHECK-LABEL: test_atan2_half1x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <1 x half> @llvm.atan2.v1f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <1 x float> @llvm.atan2.v1f32
+half1x1 test_atan2_half1x1 (half1x1 p0, half1x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half1x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.atan2.v2f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+half1x2 test_atan2_half1x2 (half1x2 p0, half1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half1x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.atan2.v3f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+half1x3 test_atan2_half1x3 (half1x3 p0, half1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half1x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.atan2.v4f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+half1x4 test_atan2_half1x4 (half1x4 p0, half1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.atan2.v2f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+half2x1 test_atan2_half2x1 (half2x1 p0, half2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.atan2.v4f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+half2x2 test_atan2_half2x2 (half2x2 p0, half2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+half2x3 test_atan2_half2x3 (half2x3 p0, half2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half2x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <8 x half> @llvm.atan2.v8f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+half2x4 test_atan2_half2x4 (half2x4 p0, half2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.atan2.v3f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+half3x1 test_atan2_half3x1 (half3x1 p0, half3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <6 x half> @llvm.atan2.v6f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+half3x2 test_atan2_half3x2 (half3x2 p0, half3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <9 x half> @llvm.atan2.v9f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <9 x float> @llvm.atan2.v9f32
+half3x3 test_atan2_half3x3 (half3x3 p0, half3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half3x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <12 x half> @llvm.atan2.v12f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+half3x4 test_atan2_half3x4 (half3x4 p0, half3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x1
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.atan2.v4f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+half4x1 test_atan2_half4x1 (half4x1 p0, half4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x2
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <8 x half> @llvm.atan2.v8f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+half4x2 test_atan2_half4x2 (half4x2 p0, half4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x3
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <12 x half> @llvm.atan2.v12f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+half4x3 test_atan2_half4x3 (half4x3 p0, half4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_half4x4
+// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <16 x half> @llvm.atan2.v16f16
+// NO_HALF: call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+half4x4 test_atan2_half4x4 (half4x4 p0, half4x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <1 x float> @llvm.atan2.v1f32
+float1x1 test_atan2_float1x1 (float1x1 p0, float1x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+float1x2 test_atan2_float1x2 (float1x2 p0, float1x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+float1x3 test_atan2_float1x3 (float1x3 p0, float1x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float1x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+float1x4 test_atan2_float1x4 (float1x4 p0, float1x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.atan2.v2f32
+float2x1 test_atan2_float2x1 (float2x1 p0, float2x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+float2x2 test_atan2_float2x2 (float2x2 p0, float2x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+float2x3 test_atan2_float2x3 (float2x3 p0, float2x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float2x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+float2x4 test_atan2_float2x4 (float2x4 p0, float2x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.atan2.v3f32
+float3x1 test_atan2_float3x1 (float3x1 p0, float3x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <6 x float> @llvm.atan2.v6f32
+float3x2 test_atan2_float3x2 (float3x2 p0, float3x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <9 x float> @llvm.atan2.v9f32
+float3x3 test_atan2_float3x3 (float3x3 p0, float3x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float3x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+float3x4 test_atan2_float3x4 (float3x4 p0, float3x4 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x1
+// CHECK:  call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.atan2.v4f32
+float4x1 test_atan2_float4x1 (float4x1 p0, float4x1 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x2
+// CHECK:  call reassoc nnan ninf nsz arcp afn <8 x float> @llvm.atan2.v8f32
+float4x2 test_atan2_float4x2 (float4x2 p0, float4x2 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x3
+// CHECK:  call reassoc nnan ninf nsz arcp afn <12 x float> @llvm.atan2.v12f32
+float4x3 test_atan2_float4x3 (float4x3 p0, float4x3 p1) {
+  return atan2(p0, p1);
+}
+
+// CHECK-LABEL: test_atan2_float4x4
+// CHECK:  call reassoc nnan ninf nsz arcp afn <16 x float> @llvm.atan2.v16f32
+float4x4 test_atan2_float4x4 (float4x4 p0, float4x4 p1) {
+  return atan2(p0, p1);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl
new file mode 100644
index 0000000000000..19467d99d2292
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
+
+
+double2x2 test_vec_double_builtin(double2x2 p0, double2x2 p1) {
+    return __builtin_elementwise_atan2(p0, p1);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2x2' (aka 'matrix<double, 2, 2>'))}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
index d7a2f15f6baa8..9e10e1afa9385 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
@@ -11,9 +11,3 @@ double2 test_vec_double_builtin(double2 p0, double2 p1) {
     return TEST_FUNC(p0, p1);
   // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2' (aka 'vector<double, 2>'))}}
 }
-
-// Temporary matrix workarround until we have proper matrix support in the builtins.
-double2x2 test_vec_double_builtin(double2x2 p0, double2x2 p1) {
-    return __builtin_elementwise_atan2(p0, p1);
-  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double2x2' (aka 'matrix<double, 2, 2>'))}}
-}

>From c328726c2425824319b31bde3e14b1dbe9a8bd37 Mon Sep 17 00:00:00 2001
From: joaosaffran <joaosaffran at gmail.com>
Date: Thu, 30 Apr 2026 15:18:46 -0700
Subject: [PATCH 10/12] Apply suggestion from @Icohedron

Co-authored-by: Deric C. <cheung.deric at gmail.com>
---
 clang/lib/Headers/hlsl/hlsl_compat_overloads.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index cddd940d1083e..12223aee6a4e7 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -265,6 +265,7 @@ namespace hlsl {
                                    matrix<uint64_t, R, C> x) {                 \
     return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
   }
+
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//

>From 9c0a5bcef00b9400fd85e9eca2cf0afb24c5daaf Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Thu, 30 Apr 2026 15:20:44 -0700
Subject: [PATCH 11/12] clean up

---
 clang/lib/Sema/SemaHLSL.cpp                                   | 4 ++--
 .../BuiltIns/{atan2-errors.hlsl => atan2-errors_mat.hlsl}     | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename clang/test/SemaHLSL/BuiltIns/{atan2-errors.hlsl => atan2-errors_mat.hlsl} (100%)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 368a54b44c2d5..bb996d291675e 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3240,8 +3240,8 @@ static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc,
 
   if (!BaseType->isHalfType() && !BaseType->isFloat32Type())
     return S->Diag(Loc, diag::err_builtin_invalid_arg_type)
-           << ArgOrdinal << /* scalar, vector or matrix of */ 5
-           << /* no int */ 0 << /* half or float */ 2 << PassedType;
+           << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0
+           << /* half or float */ 2 << PassedType;
   return false;
 }
 
diff --git a/clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/atan2-errors_mat.hlsl
similarity index 100%
rename from clang/test/SemaHLSL/BuiltIns/atan2-errors.hlsl
rename to clang/test/SemaHLSL/BuiltIns/atan2-errors_mat.hlsl

>From a31a8fc0cc64a66631c9be680158b2944c3521b7 Mon Sep 17 00:00:00 2001
From: Joao Saffran <joaosaffranllvm at gmail.com>
Date: Fri, 1 May 2026 11:35:19 -0700
Subject: [PATCH 12/12] remove templates

---
 .../lib/Headers/hlsl/hlsl_compat_overloads.h  | 402 ++++++++++-
 .../builtins/atan2-overloads_mat.hlsl         | 679 ++++++++++++++++++
 2 files changed, 1061 insertions(+), 20 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl

diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index 12223aee6a4e7..08af61bed7b9a 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -231,41 +231,403 @@ namespace hlsl {
   }
 
 #define _DXC_COMPAT_BINARY_DOUBLE_MATRIX_OVERLOADS(fn)                         \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
-  constexpr matrix<float, R, C> fn(matrix<double, R, C> y,                     \
-                                   matrix<double, R, C> x) {                   \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float1x1 fn(double1x1 y, double1x1 x) {                            \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float1x2 fn(double1x2 y, double1x2 x) {                            \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float1x3 fn(double1x3 y, double1x3 x) {                            \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float1x4 fn(double1x4 y, double1x4 x) {                            \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x1 fn(double2x1 y, double2x1 x) {                            \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x2 fn(double2x2 y, double2x2 x) {                            \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x3 fn(double2x3 y, double2x3 x) {                            \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float2x4 fn(double2x4 y, double2x4 x) {                            \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x1 fn(double3x1 y, double3x1 x) {                            \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x2 fn(double3x2 y, double3x2 x) {                            \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x3 fn(double3x3 y, double3x3 x) {                            \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float3x4 fn(double3x4 y, double3x4 x) {                            \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x1 fn(double4x1 y, double4x1 x) {                            \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x2 fn(double4x2 y, double4x2 x) {                            \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x3 fn(double4x3 y, double4x3 x) {                            \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_64BIT_FN(fn)                                                 \
+  constexpr float4x4 fn(double4x4 y, double4x4 x) {                            \
+    return fn((float4x4)y, (float4x4)x);                                       \
   }
 
 #define _DXC_COMPAT_BINARY_INTEGER_MATRIX_OVERLOADS(fn)                        \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<int, R, C> y, matrix<int, R, C> x) { \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float1x1 fn(int1x1 y, int1x1 x) {                                  \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(int1x2 y, int1x2 x) {                                  \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(int1x3 y, int1x3 x) {                                  \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(int1x4 y, int1x4 x) {                                  \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(int2x1 y, int2x1 x) {                                  \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x2 fn(int2x2 y, int2x2 x) {                                  \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(int2x3 y, int2x3 x) {                                  \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(int2x4 y, int2x4 x) {                                  \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(int3x1 y, int3x1 x) {                                  \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(int3x2 y, int3x2 x) {                                  \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(int3x3 y, int3x3 x) {                                  \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(int3x4 y, int3x4 x) {                                  \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(int4x1 y, int4x1 x) {                                  \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(int4x2 y, int4x2 x) {                                  \
+    return fn((float4x2)y, (float4x2)x);                                       \
   }                                                                            \
                                                                                \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<uint, R, C> y,                       \
-                                   matrix<uint, R, C> x) {                     \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float4x3 fn(int4x3 y, int4x3 x) {                                  \
+    return fn((float4x3)y, (float4x3)x);                                       \
   }                                                                            \
                                                                                \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<int64_t, R, C> y,                    \
-                                   matrix<int64_t, R, C> x) {                  \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float4x4 fn(int4x4 y, int4x4 x) {                                  \
+    return fn((float4x4)y, (float4x4)x);                                       \
+  }                                                                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x1 fn(uint1x1 y, uint1x1 x) {                                \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(uint1x2 y, uint1x2 x) {                                \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(uint1x3 y, uint1x3 x) {                                \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(uint1x4 y, uint1x4 x) {                                \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(uint2x1 y, uint2x1 x) {                                \
+    return fn((float2x1)y, (float2x1)x);                                       \
   }                                                                            \
                                                                                \
-  template <uint R, uint C>                                                    \
   _DXC_DEPRECATED_INT_FN(fn)                                                   \
-  constexpr matrix<float, R, C> fn(matrix<uint64_t, R, C> y,                   \
-                                   matrix<uint64_t, R, C> x) {                 \
-    return fn((matrix<float, R, C>)y, (matrix<float, R, C>)x);                 \
+  constexpr float2x2 fn(uint2x2 y, uint2x2 x) {                                \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(uint2x3 y, uint2x3 x) {                                \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(uint2x4 y, uint2x4 x) {                                \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(uint3x1 y, uint3x1 x) {                                \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(uint3x2 y, uint3x2 x) {                                \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(uint3x3 y, uint3x3 x) {                                \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(uint3x4 y, uint3x4 x) {                                \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(uint4x1 y, uint4x1 x) {                                \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(uint4x2 y, uint4x2 x) {                                \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x3 fn(uint4x3 y, uint4x3 x) {                                \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x4 fn(uint4x4 y, uint4x4 x) {                                \
+    return fn((float4x4)y, (float4x4)x);                                       \
+  }                                                                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x1 fn(int64_t1x1 y, int64_t1x1 x) {                          \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(int64_t1x2 y, int64_t1x2 x) {                          \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(int64_t1x3 y, int64_t1x3 x) {                          \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(int64_t1x4 y, int64_t1x4 x) {                          \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(int64_t2x1 y, int64_t2x1 x) {                          \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x2 fn(int64_t2x2 y, int64_t2x2 x) {                          \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(int64_t2x3 y, int64_t2x3 x) {                          \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(int64_t2x4 y, int64_t2x4 x) {                          \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(int64_t3x1 y, int64_t3x1 x) {                          \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(int64_t3x2 y, int64_t3x2 x) {                          \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(int64_t3x3 y, int64_t3x3 x) {                          \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(int64_t3x4 y, int64_t3x4 x) {                          \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(int64_t4x1 y, int64_t4x1 x) {                          \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(int64_t4x2 y, int64_t4x2 x) {                          \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x3 fn(int64_t4x3 y, int64_t4x3 x) {                          \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x4 fn(int64_t4x4 y, int64_t4x4 x) {                          \
+    return fn((float4x4)y, (float4x4)x);                                       \
+  }                                                                            \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x1 fn(uint64_t1x1 y, uint64_t1x1 x) {                        \
+    return fn((float1x1)y, (float1x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x2 fn(uint64_t1x2 y, uint64_t1x2 x) {                        \
+    return fn((float1x2)y, (float1x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x3 fn(uint64_t1x3 y, uint64_t1x3 x) {                        \
+    return fn((float1x3)y, (float1x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float1x4 fn(uint64_t1x4 y, uint64_t1x4 x) {                        \
+    return fn((float1x4)y, (float1x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x1 fn(uint64_t2x1 y, uint64_t2x1 x) {                        \
+    return fn((float2x1)y, (float2x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x2 fn(uint64_t2x2 y, uint64_t2x2 x) {                        \
+    return fn((float2x2)y, (float2x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x3 fn(uint64_t2x3 y, uint64_t2x3 x) {                        \
+    return fn((float2x3)y, (float2x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float2x4 fn(uint64_t2x4 y, uint64_t2x4 x) {                        \
+    return fn((float2x4)y, (float2x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x1 fn(uint64_t3x1 y, uint64_t3x1 x) {                        \
+    return fn((float3x1)y, (float3x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x2 fn(uint64_t3x2 y, uint64_t3x2 x) {                        \
+    return fn((float3x2)y, (float3x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x3 fn(uint64_t3x3 y, uint64_t3x3 x) {                        \
+    return fn((float3x3)y, (float3x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float3x4 fn(uint64_t3x4 y, uint64_t3x4 x) {                        \
+    return fn((float3x4)y, (float3x4)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x1 fn(uint64_t4x1 y, uint64_t4x1 x) {                        \
+    return fn((float4x1)y, (float4x1)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x2 fn(uint64_t4x2 y, uint64_t4x2 x) {                        \
+    return fn((float4x2)y, (float4x2)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x3 fn(uint64_t4x3 y, uint64_t4x3 x) {                        \
+    return fn((float4x3)y, (float4x3)x);                                       \
+  }                                                                            \
+                                                                               \
+  _DXC_DEPRECATED_INT_FN(fn)                                                   \
+  constexpr float4x4 fn(uint64_t4x4 y, uint64_t4x4 x) {                        \
+    return fn((float4x4)y, (float4x4)x);                                       \
   }
-
 //===----------------------------------------------------------------------===//
 // acos builtins overloads
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl b/clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl
new file mode 100644
index 0000000000000..5cf473fa13d98
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/atan2-overloads_mat.hlsl
@@ -0,0 +1,679 @@
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm  \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_double1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_double1x2 (double1x2 p0, double1x2 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <2 x double> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_double1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_double1x3 (double1x3 p0, double1x3 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <3 x double> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_double1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_double1x4 (double1x4 p0, double1x4 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <4 x double> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_double2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <2 x double> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_double2x1 (double2x1 p0, double2x1 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <2 x double> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_double2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_double2x2 (double2x2 p0, double2x2 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <4 x double> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_double2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_double2x3 (double2x3 p0, double2x3 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <6 x double> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_double2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_double2x4 (double2x4 p0, double2x4 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <8 x double> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_double3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <3 x double> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_double3x1 (double3x1 p0, double3x1 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <3 x double> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_double3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <6 x double> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_double3x2 (double3x2 p0, double3x2 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <6 x double> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_double3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <9 x double> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <9 x double> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_double3x3 (double3x3 p0, double3x3 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <9 x double> args to <9 x float>, then @llvm.atan2.v9f32.
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_double3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_double3x4 (double3x4 p0, double3x4 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <12 x double> args to <12 x float>, then @llvm.atan2.v12f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_double4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <4 x double> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_double4x1 (double4x1 p0, double4x1 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <4 x double> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_double4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <8 x double> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_double4x2 (double4x2 p0, double4x2 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <8 x double> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_double4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <12 x double> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_double4x3 (double4x3 p0, double4x3 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <12 x double> args to <12 x float>, then @llvm.atan2.v12f32.
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_double4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = fptrunc {{.*}} <16 x double> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_double4x4 (double4x4 p0, double4x4 p1) {
+  return atan2(p0, p1); // Expected IR: fptrunc both <16 x double> args to <16 x float>, then @llvm.atan2.v16f32.
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_uint1x2 (uint1x2 p0, uint1x2 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <2 x i32> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_uint1x3 (uint1x3 p0, uint1x3 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <3 x i32> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_uint1x4 (uint1x4 p0, uint1x4 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <4 x i32> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_uint2x1 (uint2x1 p0, uint2x1 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <2 x i32> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_uint2x2 (uint2x2 p0, uint2x2 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <4 x i32> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_uint2x3 (uint2x3 p0, uint2x3 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <6 x i32> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_uint2x4 (uint2x4 p0, uint2x4 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <8 x i32> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_uint3x1 (uint3x1 p0, uint3x1 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <3 x i32> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_uint3x2 (uint3x2 p0, uint3x2 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <6 x i32> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_uint3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_uint3x3 (uint3x3 p0, uint3x3 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <9 x i32> args to <9 x float>, then @llvm.atan2.v9f32.
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_uint3x4 (uint3x4 p0, uint3x4 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <12 x i32> args to <12 x float>, then @llvm.atan2.v12f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_uint4x1 (uint4x1 p0, uint4x1 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <4 x i32> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_uint4x2 (uint4x2 p0, uint4x2 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <8 x i32> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_uint4x3 (uint4x3 p0, uint4x3 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <12 x i32> args to <12 x float>, then @llvm.atan2.v12f32.
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_uint4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint4x4 (uint4x4 p0, uint4x4 p1) {
+  return atan2(p0, p1); // Expected IR: uitofp both <16 x i32> args to <16 x float>, then @llvm.atan2.v16f32.
+}
+
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_int1x2 (int1x2 p0, int1x2 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <2 x i32> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_int1x3 (int1x3 p0, int1x3 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <3 x i32> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_int1x4 (int1x4 p0, int1x4 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <4 x i32> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_int2x1 (int2x1 p0, int2x1 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <2 x i32> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_int2x2 (int2x2 p0, int2x2 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <4 x i32> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_int2x3 (int2x3 p0, int2x3 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <6 x i32> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_int2x4 (int2x4 p0, int2x4 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <8 x i32> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_int3x1 (int3x1 p0, int3x1 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <3 x i32> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i32> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_int3x2 (int3x2 p0, int3x2 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <6 x i32> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_int3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <9 x i32> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_int3x3 (int3x3 p0, int3x3 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <9 x i32> args to <9 x float>, then @llvm.atan2.v9f32.
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_int3x4 (int3x4 p0, int3x4 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <12 x i32> args to <12 x float>, then @llvm.atan2.v12f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_int4x1 (int4x1 p0, int4x1 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <4 x i32> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i32> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_int4x2 (int4x2 p0, int4x2 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <8 x i32> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i32> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_int4x3 (int4x3 p0, int4x3 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <12 x i32> args to <12 x float>, then @llvm.atan2.v12f32.
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_int4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i32> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int4x4 (int4x4 p0, int4x4 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <16 x i32> args to <16 x float>, then @llvm.atan2.v16f32.
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int64_t1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_int64_t1x2 (int64_t1x2 p0, int64_t1x2 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <2 x i64> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int64_t1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_int64_t1x3 (int64_t1x3 p0, int64_t1x3 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <3 x i64> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int64_t1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_int64_t1x4 (int64_t1x4 p0, int64_t1x4 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <4 x i64> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_int64_t2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_int64_t2x1 (int64_t2x1 p0, int64_t2x1 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <2 x i64> args to <2 x float>, then @llvm.atan2.v2f32.
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int64_t2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_int64_t2x2 (int64_t2x2 p0, int64_t2x2 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <4 x i64> args to <4 x float>, then @llvm.atan2.v4f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int64_t2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_int64_t2x3 (int64_t2x3 p0, int64_t2x3 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <6 x i64> args to <6 x float>, then @llvm.atan2.v6f32.
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int64_t2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_int64_t2x4 (int64_t2x4 p0, int64_t2x4 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <8 x i64> args to <8 x float>, then @llvm.atan2.v8f32.
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_int64_t3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_int64_t3x1 (int64_t3x1 p0, int64_t3x1 p1) {
+  return atan2(p0, p1); // Expected IR: sitofp both <3 x i64> args to <3 x float>, then @llvm.atan2.v3f32.
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_int64_t3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_int64_t3x2 (int64_t3x2 p0, int64_t3x2 p1) { // Exercises atan2 with 3x2 matrices of signed 64-bit integers; the result is float3x2.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <6 x float>, then call llvm.atan2.v6f32
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_int64_t3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_int64_t3x3 (int64_t3x3 p0, int64_t3x3 p1) { // Exercises atan2 with 3x3 matrices of signed 64-bit integers; the result is float3x3.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <9 x float>, then call llvm.atan2.v9f32
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int64_t3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_int64_t3x4 (int64_t3x4 p0, int64_t3x4 p1) { // Exercises atan2 with 3x4 matrices of signed 64-bit integers; the result is float3x4.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <12 x float>, then call llvm.atan2.v12f32
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_int64_t4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_int64_t4x1 (int64_t4x1 p0, int64_t4x1 p1) { // Exercises atan2 with 4x1 matrices of signed 64-bit integers; the result is float4x1.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <4 x float>, then call llvm.atan2.v4f32
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_int64_t4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_int64_t4x2 (int64_t4x2 p0, int64_t4x2 p1) { // Exercises atan2 with 4x2 matrices of signed 64-bit integers; the result is float4x2.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <8 x float>, then call llvm.atan2.v8f32
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_int64_t4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_int64_t4x3 (int64_t4x3 p0, int64_t4x3 p1) { // Exercises atan2 with 4x3 matrices of signed 64-bit integers; the result is float4x3.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <12 x float>, then call llvm.atan2.v12f32
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_int64_t4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = sitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_int64_t4x4 (int64_t4x4 p0, int64_t4x4 p1) { // Exercises atan2 with 4x4 matrices of signed 64-bit integers; the result is float4x4.
+  return atan2(p0, p1); // expected lowering: sitofp each operand to <16 x float>, then call llvm.atan2.v16f32
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint64_t1x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float1x2 test_atan2_uint64_t1x2 (uint64_t1x2 p0, uint64_t1x2 p1) { // Exercises atan2 with 1x2 matrices of unsigned 64-bit integers; the result is float1x2.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <2 x float>, then call llvm.atan2.v2f32
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint64_t1x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float1x3 test_atan2_uint64_t1x3 (uint64_t1x3 p0, uint64_t1x3 p1) { // Exercises atan2 with 1x3 matrices of unsigned 64-bit integers; the result is float1x3.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <3 x float>, then call llvm.atan2.v3f32
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint64_t1x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float1x4 test_atan2_uint64_t1x4 (uint64_t1x4 p0, uint64_t1x4 p1) { // Exercises atan2 with 1x4 matrices of unsigned 64-bit integers; the result is float1x4.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <4 x float>, then call llvm.atan2.v4f32
+}
+
+// CHECK: define [[FNATTRS]] <2 x float> @_{{.*}}test_atan2_uint64_t2x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <2 x float> @llvm.atan2.v2f32(<2 x float> [[CONVI]], <2 x float> [[CONV1I]])
+// CHECK:    ret <2 x float> [[V5]]
+float2x1 test_atan2_uint64_t2x1 (uint64_t2x1 p0, uint64_t2x1 p1) { // Exercises atan2 with 2x1 matrices of unsigned 64-bit integers; the result is float2x1.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <2 x float>, then call llvm.atan2.v2f32
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint64_t2x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float2x2 test_atan2_uint64_t2x2 (uint64_t2x2 p0, uint64_t2x2 p1) { // Exercises atan2 with 2x2 matrices of unsigned 64-bit integers; the result is float2x2.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <4 x float>, then call llvm.atan2.v4f32
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint64_t2x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float2x3 test_atan2_uint64_t2x3 (uint64_t2x3 p0, uint64_t2x3 p1) { // Exercises atan2 with 2x3 matrices of unsigned 64-bit integers; the result is float2x3.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <6 x float>, then call llvm.atan2.v6f32
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint64_t2x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float2x4 test_atan2_uint64_t2x4 (uint64_t2x4 p0, uint64_t2x4 p1) { // Exercises atan2 with 2x4 matrices of unsigned 64-bit integers; the result is float2x4.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <8 x float>, then call llvm.atan2.v8f32
+}
+
+// CHECK: define [[FNATTRS]] <3 x float> @_{{.*}}test_atan2_uint64_t3x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <3 x float> @llvm.atan2.v3f32(<3 x float> [[CONVI]], <3 x float> [[CONV1I]])
+// CHECK:    ret <3 x float> [[V5]]
+float3x1 test_atan2_uint64_t3x1 (uint64_t3x1 p0, uint64_t3x1 p1) { // Exercises atan2 with 3x1 matrices of unsigned 64-bit integers; the result is float3x1.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <3 x float>, then call llvm.atan2.v3f32
+}
+
+// CHECK: define [[FNATTRS]] <6 x float> @_{{.*}}test_atan2_uint64_t3x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <6 x i64> %{{.*}} to <6 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <6 x float> @llvm.atan2.v6f32(<6 x float> [[CONVI]], <6 x float> [[CONV1I]])
+// CHECK:    ret <6 x float> [[V5]]
+float3x2 test_atan2_uint64_t3x2 (uint64_t3x2 p0, uint64_t3x2 p1) { // Exercises atan2 with 3x2 matrices of unsigned 64-bit integers; the result is float3x2.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <6 x float>, then call llvm.atan2.v6f32
+}
+
+// CHECK: define [[FNATTRS]] <9 x float> @_{{.*}}test_atan2_uint64_t3x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <9 x i64> %{{.*}} to <9 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <9 x float> @llvm.atan2.v9f32(<9 x float> [[CONVI]], <9 x float> [[CONV1I]])
+// CHECK:    ret <9 x float> [[V5]]
+float3x3 test_atan2_uint64_t3x3 (uint64_t3x3 p0, uint64_t3x3 p1) { // Exercises atan2 with 3x3 matrices of unsigned 64-bit integers; the result is float3x3.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <9 x float>, then call llvm.atan2.v9f32
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint64_t3x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float3x4 test_atan2_uint64_t3x4 (uint64_t3x4 p0, uint64_t3x4 p1) { // Exercises atan2 with 3x4 matrices of unsigned 64-bit integers; the result is float3x4.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <12 x float>, then call llvm.atan2.v12f32
+}
+
+// CHECK: define [[FNATTRS]] <4 x float> @_{{.*}}test_atan2_uint64_t4x1{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <4 x float> @llvm.atan2.v4f32(<4 x float> [[CONVI]], <4 x float> [[CONV1I]])
+// CHECK:    ret <4 x float> [[V5]]
+float4x1 test_atan2_uint64_t4x1 (uint64_t4x1 p0, uint64_t4x1 p1) { // Exercises atan2 with 4x1 matrices of unsigned 64-bit integers; the result is float4x1.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <4 x float>, then call llvm.atan2.v4f32
+}
+
+// CHECK: define [[FNATTRS]] <8 x float> @_{{.*}}test_atan2_uint64_t4x2{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <8 x i64> %{{.*}} to <8 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <8 x float> @llvm.atan2.v8f32(<8 x float> [[CONVI]], <8 x float> [[CONV1I]])
+// CHECK:    ret <8 x float> [[V5]]
+float4x2 test_atan2_uint64_t4x2 (uint64_t4x2 p0, uint64_t4x2 p1) { // Exercises atan2 with 4x2 matrices of unsigned 64-bit integers; the result is float4x2.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <8 x float>, then call llvm.atan2.v8f32
+}
+
+// CHECK: define [[FNATTRS]] <12 x float> @_{{.*}}test_atan2_uint64_t4x3{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <12 x i64> %{{.*}} to <12 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <12 x float> @llvm.atan2.v12f32(<12 x float> [[CONVI]], <12 x float> [[CONV1I]])
+// CHECK:    ret <12 x float> [[V5]]
+float4x3 test_atan2_uint64_t4x3 (uint64_t4x3 p0, uint64_t4x3 p1) { // Exercises atan2 with 4x3 matrices of unsigned 64-bit integers; the result is float4x3.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <12 x float>, then call llvm.atan2.v12f32
+}
+
+// CHECK: define [[FNATTRS]] <16 x float> @_{{.*}}test_atan2_uint64_t4x4{{.*}}(
+// CHECK:    [[CONVI:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[CONV1I:%.*]] = uitofp <16 x i64> %{{.*}} to <16 x float>
+// CHECK:    [[V5:%.*]] = call {{.*}} <16 x float> @llvm.atan2.v16f32(<16 x float> [[CONVI]], <16 x float> [[CONV1I]])
+// CHECK:    ret <16 x float> [[V5]]
+float4x4 test_atan2_uint64_t4x4 (uint64_t4x4 p0, uint64_t4x4 p1) { // Exercises atan2 with 4x4 matrices of unsigned 64-bit integers; the result is float4x4.
+  return atan2(p0, p1); // expected lowering: uitofp each operand to <16 x float>, then call llvm.atan2.v16f32
+}