[clang] 3323a62 - [Matrix] Add __builtin_matrix_transpose to Clang.

Tue Jun 9 02:15:38 PDT 2020

Author: Florian Hahn
Date: 2020-06-09T10:14:37+01:00
New Revision: 3323a628ec821b8b75d3b60bf1510931f97d3883

URL: https://github.com/llvm/llvm-project/commit/3323a628ec821b8b75d3b60bf1510931f97d3883
DIFF: https://github.com/llvm/llvm-project/commit/3323a628ec821b8b75d3b60bf1510931f97d3883.diff

LOG: [Matrix] Add __builtin_matrix_transpose to Clang.

This patch add __builtin_matrix_transpose to Clang, as described in
clang/docs/MatrixTypes.rst.

Reviewers: rjmccall, jfb, rsmith, Bigcheese

Reviewed By: rjmccall

Differential Revision: https://reviews.llvm.org/D72778

Added: 
    clang/test/CodeGen/matrix-type-builtins.c
    clang/test/CodeGenCXX/matrix-type-builtins.cpp
    clang/test/CodeGenObjC/matrix-type-builtins.m
    clang/test/Sema/matrix-type-builtins.c
    clang/test/SemaCXX/matrix-type-builtins.cpp
    clang/test/SemaObjC/matrix-type-builtins.m

Modified: 
    clang/include/clang/Basic/Builtins.def
    clang/include/clang/Basic/DiagnosticSemaKinds.td
    clang/include/clang/Sema/Sema.h
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Sema/SemaChecking.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 4c43d63ffec4..4deca85bb867 100644

--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -577,6 +577,8 @@ BUILTIN(__builtin_alloca, "v*z"   , "Fn")
 BUILTIN(__builtin_alloca_with_align, "v*zIz", "Fn")
 BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
 
+BUILTIN(__builtin_matrix_transpose, "v.", "nFt")
+
 // "Overloaded" Atomic operator builtins.  These are overloaded to support data
 // types of i8, i16, i32, i64, and i128.  The front-end sees calls to the
 // non-suffixed version of these (which has a bogus type) and transforms them to

diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index cce6dcc54c2f..84bcf66a148e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10779,6 +10779,9 @@ def warn_import_on_definition : Warning<
   "import %select{module|name}0 cannot be applied to a function with a definition">,
   InGroup<IgnoredAttributes>;
 
+def err_builtin_matrix_arg: Error<
+  "%select{first|second}0 argument must be a matrix">;
+
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
 def err_preserve_field_info_not_const: Error<

diff  --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 8df78a5a7aca..33be6c68b11a 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12120,6 +12120,11 @@ class Sema final {
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+
+  // Matrix builtin handling.
+  ExprResult SemaBuiltinMatrixTranspose(CallExpr *TheCall,
+                                        ExprResult CallResult);
+
 public:
   enum FormatStringType {
     FST_Scanf,

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c580720379b5..bfc78ce94892 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
@@ -2375,6 +2376,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
 
+  case Builtin::BI__builtin_matrix_transpose: {
+    const auto *MatrixTy = E->getArg(0)->getType()->getAs<ConstantMatrixType>();
+    Value *MatValue = EmitScalarExpr(E->getArg(0));
+    MatrixBuilder<CGBuilderTy> MB(Builder);
+    Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
+                                             MatrixTy->getNumColumns());
+    return RValue::get(Result);
+  }
+
   case Builtin::BIfinite:
   case Builtin::BI__finite:
   case Builtin::BIfinitef:

diff  --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 86b18e26eb8d..0e451d6e5867 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1896,7 +1896,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
       return ExprError();
     break;
   case Builtin::BI__builtin_frame_address:
-  case Builtin::BI__builtin_return_address:
+  case Builtin::BI__builtin_return_address: {
     if (SemaBuiltinConstantArgRange(TheCall, 0, 0, 0xFFFF))
       return ExprError();
 
@@ -1913,6 +1913,10 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
+  case Builtin::BI__builtin_matrix_transpose:
+    return SemaBuiltinMatrixTranspose(TheCall, TheCallResult);
+  }
+
   // Since the target specific builtins for each arch overlap, only check those
   // of the arch we are compiling for.
   if (Context.BuiltinInfo.isTSBuiltin(BuiltinID)) {
@@ -15033,3 +15037,32 @@ void Sema::CheckAddressOfPackedMember(Expr *rhs) {
       rhs, std::bind(&Sema::AddPotentialMisalignedMembers, std::ref(*this), _1,
                      _2, _3, _4));
 }
+
+ExprResult Sema::SemaBuiltinMatrixTranspose(CallExpr *TheCall,
+                                            ExprResult CallResult) {
+  if (checkArgCount(*this, TheCall, 1))
+    return ExprError();
+
+  ExprResult MatrixArg = DefaultLvalueConversion(TheCall->getArg(0));
+  if (MatrixArg.isInvalid())
+    return MatrixArg;
+  Expr *Matrix = MatrixArg.get();
+
+  auto *MType = Matrix->getType()->getAs<ConstantMatrixType>();
+  if (!MType) {
+    Diag(Matrix->getBeginLoc(), diag::err_builtin_matrix_arg) << 0;
+    return ExprError();
+  }
+
+  // Create returned matrix type by swapping rows and columns of the argument
+  // matrix type.
+  QualType ResultType = Context.getConstantMatrixType(
+      MType->getElementType(), MType->getNumColumns(), MType->getNumRows());
+
+  // Change the return type to the type of the returned matrix.
+  TheCall->setType(ResultType);
+
+  // Update call argument to use the possibly converted matrix argument.
+  TheCall->setArg(0, Matrix);
+  return CallResult;
+}

diff  --git a/clang/test/CodeGen/matrix-type-builtins.c b/clang/test/CodeGen/matrix-type-builtins.c
new file mode 100644
index 000000000000..5bd7239333ce
--- /dev/null
+++ b/clang/test/CodeGen/matrix-type-builtins.c
@@ -0,0 +1,98 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
+typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
+typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
+typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
+typedef unsigned ux6x1_t __attribute__((matrix_type(6, 1)));
+
+void transpose_double_5x5(dx5x5_t *a) {
+  // CHECK-LABEL: define void @transpose_double_5x5(
+  // CHECK:        [[A:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
+  // CHECK-NEXT:   [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5)
+  // CHECK-NEXT:   [[AT_ADDR:%.*]] = bitcast [25 x double]* %a_t to <25 x double>*
+  // CHECK-NEXT:   store <25 x double> [[TRANS]], <25 x double>* [[AT_ADDR]], align 8
+  dx5x5_t a_t = __builtin_matrix_transpose(*a);
+}
+
+void transpose_float_3x2(fx3x2_t *a) {
+  // CHECK-LABEL: define void @transpose_float_3x2(
+  // CHECK:        [[A:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
+  // CHECK-NEXT:   [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
+  // CHECK-NEXT:   [[AT_ADDR:%.*]] = bitcast [6 x float]* %a_t to <6 x float>*
+  // CHECK-NEXT:   store <6 x float> [[TRANS]], <6 x float>* [[AT_ADDR]], align 4
+
+  fx2x3_t a_t = __builtin_matrix_transpose(*a);
+}
+
+void transpose_int_20x4(ix20x4_t *a) {
+  // CHECK-LABEL: define void @transpose_int_20x4(
+  // CHECK:         [[A:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4)
+  // CHECK-NEXT:    [[AT_ADDR:%.*]] = bitcast [80 x i32]* %a_t to <80 x i32>*
+  // CHECK-NEXT:    store <80 x i32> [[TRANS]], <80 x i32>* [[AT_ADDR]], align 4
+
+  ix4x20_t a_t = __builtin_matrix_transpose(*a);
+}
+
+struct Foo {
+  ux1x6_t in;
+  ux6x1_t out;
+};
+
+void transpose_struct_member(struct Foo *F) {
+  // CHECK-LABEL: define void @transpose_struct_member(
+  // CHECK:         [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
+  // CHECK-NEXT:    [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8
+  // CHECK-NEXT:    [[OUT_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 1
+  // CHECK-NEXT:    [[OUT_PTR_C:%.*]] = bitcast [6 x i32]* [[OUT_PTR]] to <6 x i32>*
+  // CHECK-NEXT:    store <6 x i32> [[M_T]], <6 x i32>* [[OUT_PTR_C]], align 4
+
+  F->out = __builtin_matrix_transpose(F->in);
+}
+
+void transpose_transpose_struct_member(struct Foo *F) {
+  // CHECK-LABEL: define void @transpose_transpose_struct_member(
+  // CHECK:         [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
+  // CHECK-NEXT:    [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1)
+  // CHECK-NEXT:    [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8
+  // CHECK-NEXT:    [[IN_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 0
+  // CHECK-NEXT:    [[IN_PTR_C:%.*]] = bitcast [6 x i32]* [[IN_PTR]] to <6 x i32>*
+  // CHECK-NEXT:    store <6 x i32> [[M_T2]], <6 x i32>* [[IN_PTR_C]], align 4
+
+  F->in = __builtin_matrix_transpose(__builtin_matrix_transpose(F->in));
+}
+
+dx5x5_t get_matrix();
+
+void transpose_rvalue() {
+  // CHECK-LABEL: define void @transpose_rvalue()
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
+  // CHECK-NEXT:    [[CALL:%.*]] = call <25 x double> (...) @get_matrix()
+  // CHECK-NEXT:    [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[CALL]], i32 5, i32 5)
+  // CHECK-NEXT:    [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8
+
+  dx5x5_t m_t = __builtin_matrix_transpose(get_matrix());
+}
+
+const dx5x5_t global_matrix;
+
+void transpose_global() {
+  // CHECK-LABEL: define void @transpose_global()
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
+  // CHECK-NEXT:    [[GLOBAL_MATRIX:%.*]] = load <25 x double>, <25 x double>* bitcast ([25 x double]* @global_matrix to <25 x double>*), align 8
+  // CHECK-NEXT:    [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[GLOBAL_MATRIX]], i32 5, i32 5)
+  // CHECK-NEXT:    [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8
+
+  dx5x5_t m_t = __builtin_matrix_transpose(global_matrix);
+}

diff  --git a/clang/test/CodeGenCXX/matrix-type-builtins.cpp b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
new file mode 100644
index 000000000000..b2b9f7be33f9
--- /dev/null
+++ b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  matrix_t<EltTy, Rows, Columns> value;
+};
+
+template <typename T, unsigned R, unsigned C>
+MyMatrix<T, C, R> transpose(const MyMatrix<T, R, C> &M) {
+  MyMatrix<T, C, R> Res;
+  Res.value = __builtin_matrix_transpose(M.value);
+  return Res;
+}
+
+void test_transpose_template1() {
+  // CHECK-LABEL: define void @_Z24test_transpose_template1v()
+  // CHECK:         call void @_Z9transposeIiLj4ELj10EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.0* sret align 4 %M1_t, %struct.MyMatrix* nonnull align 4 dereferenceable(160) %M1)
+
+  // CHECK-LABEL: define linkonce_odr void @_Z9transposeIiLj4ELj10EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(
+  // CHECK:         [[M:%.*]] = load <40 x i32>, <40 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <40 x i32> @llvm.matrix.transpose.v40i32(<40 x i32> [[M]], i32 4, i32 10)
+
+  MyMatrix<int, 4, 10> M1;
+  MyMatrix<int, 10, 4> M1_t = transpose(M1);
+}
+
+void test_transpose_template2(MyMatrix<double, 7, 6> &M) {
+  // CHECK-LABEL: define void @_Z24test_transpose_template2R8MyMatrixIdLj7ELj6EE(
+  // CHECK:         call void @_Z9transposeIdLj7ELj6EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.2* sret align 8 %ref.tmp1, %struct.MyMatrix.1* nonnull align 8 dereferenceable(336) %0)
+  // CHECK-NEXT:    call void @_Z9transposeIdLj6ELj7EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.1* sret align 8 %ref.tmp, %struct.MyMatrix.2* nonnull align 8 dereferenceable(336) %ref.tmp1)
+  // CHECK-NEXT:    call void @_Z9transposeIdLj7ELj6EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.2* sret align 8 %M2_t, %struct.MyMatrix.1* nonnull align 8 dereferenceable(336) %ref.tmp)
+
+  // CHECK-LABEL: define linkonce_odr void @_Z9transposeIdLj7ELj6EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(
+  // CHECK:         [[M:%.*]] = load <42 x double>, <42 x double>* {{.*}}, align 8
+  // CHECK-NEXT:    [[M_T:%.*]] = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> [[M]], i32 7, i32 6)
+  // CHECK-NEXT:    [[RES_ADDR:%.*]] = getelementptr inbounds %struct.MyMatrix.2, %struct.MyMatrix.2* %agg.result, i32 0, i32 0
+  // CHECK-NEXT:    [[RES_ADDR_C:%.*]] = bitcast [42 x double]* [[RES_ADDR]] to <42 x double>*
+  // CHECK-NEXT:    store <42 x double> [[M_T]], <42 x double>* [[RES_ADDR_C]], align 8
+
+  // CHECK-LABEL: define linkonce_odr void @_Z9transposeIdLj6ELj7EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(
+  // CHECK:         [[M:%.*]] = load <42 x double>, <42 x double>* {{.*}}, align 8
+  // CHECK-NEXT:    [[M_T:%.*]] = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> [[M]], i32 6, i32 7)
+  // CHECK-NEXT:    [[RES_ADDR:%.*]] = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %agg.result, i32 0, i32 0
+  // CHECK-NEXT:    [[RES_ADDR_C:%.*]] = bitcast [42 x double]* [[RES_ADDR]] to <42 x double>*
+  // CHECK-NEXT:    store <42 x double> [[M_T]], <42 x double>* [[RES_ADDR_C]], align 8
+
+  MyMatrix<double, 6, 7> M2_t = transpose(transpose(transpose(M)));
+}
+
+matrix_t<float, 3, 3> get_matrix();
+
+void test_transpose_rvalue() {
+  // CHECK-LABEL: define void @_Z21test_transpose_rvaluev()
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    [[M_T_ADDR:%.*]] = alloca [9 x float], align 4
+  // CHECK-NEXT:    [[CALL_RES:%.*]] = call <9 x float> @_Z10get_matrixv()
+  // CHECK-NEXT:    [[ADD:%.*]] = fadd <9 x float> [[CALL_RES]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+  // CHECK-NEXT:    [[M_T:%.*]] = call <9 x float> @llvm.matrix.transpose.v9f32(<9 x float> [[ADD]], i32 3, i32 3)
+  // CHECK-NEXT:    [[M_T_ADDR_CAST:%.*]] = bitcast [9 x float]* [[M_T_ADDR]] to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> [[M_T]], <9 x float>* [[M_T_ADDR_CAST]], align 4
+  matrix_t<float, 3, 3> m_t = __builtin_matrix_transpose(get_matrix() + 2.0);
+}
+
+void test_transpose_const(const matrix_t<float, 3, 3> &m) {
+  // CHECK-LABEL:  define void @_Z20test_transpose_constRKU11matrix_typeLm3ELm3Ef(
+  // CHECK:         [[MATRIX:%.*]] = load <9 x float>, <9 x float>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <9 x float> @llvm.matrix.transpose.v9f32(<9 x float> [[MATRIX]], i32 3, i32 3)
+  // CHECK-NEXT:    [[M_T_ADDR:%.*]] = bitcast [9 x float]* %m_t to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> [[M_T]], <9 x float>* [[M_T_ADDR]], align 4
+  matrix_t<float, 3, 3> m_t = __builtin_matrix_transpose(m);
+}
+
+// TODO: Enable once initialization support is defined and implemented for
+//       matrix types.
+// void test_lvalue_conversion() {
+//  constexpr double4x4 m = {};
+//  [] { return __builtin_matrix_transpose(m); }
+//}

diff  --git a/clang/test/CodeGenObjC/matrix-type-builtins.m b/clang/test/CodeGenObjC/matrix-type-builtins.m
new file mode 100644
index 000000000000..37f24ff6b68e
--- /dev/null
+++ b/clang/test/CodeGenObjC/matrix-type-builtins.m
@@ -0,0 +1,42 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fenable-matrix -emit-llvm -disable-llvm-optzns -o - %s | FileCheck %s
+
+// Check that we correctly deal with placeholder expressions.
+
+typedef double double4x2 __attribute__((matrix_type(4, 2)));
+typedef double double2x4 __attribute__((matrix_type(2, 4)));
+
+__attribute__((objc_root_class))
+ at interface DoubleMatrixValue
+ at property double4x2 value;
+ at end
+
+void test_transpose_placeholder_get(DoubleMatrixValue *m, double2x4 *r) {
+  // CHECK-LABEL: define void @test_transpose_placeholder_get(
+  // CHECK:         [[MATRIX:%.*]] = call <8 x double> bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to <8 x double> (i8*, i8*)*)(
+  // CHECK-NEXT:   [[MT:%.*]] = call <8 x double> @llvm.matrix.transpose.v8f64(<8 x double> %call, i32 4, i32 2)
+  // CHECK-NEXT:   [[R_ADDR:%.*]] = load [8 x double]*, [8 x double]** %r.addr, align 8
+  // CHECK-NEXT:   [[R_ADDR_C:%.*]] = bitcast [8 x double]* [[R_ADDR]] to <8 x double>*
+  // CHECK-NEXT:    store <8 x double> [[MT]], <8 x double>* [[R_ADDR_C]], align 8
+  // CHECK-NEXT:    ret void
+
+  *r = __builtin_matrix_transpose(m.value);
+}
+
+typedef unsigned u3x4 __attribute__((matrix_type(3, 4)));
+typedef unsigned u4x3 __attribute__((matrix_type(4, 3)));
+
+__attribute__((objc_root_class))
+ at interface UnsignedMatrixValue
+ at property u3x4 value;
+ at end
+
+void test_transpose_placeholder_set(UnsignedMatrixValue *m, u4x3 *r) {
+  // CHECK-LABEL: define void @test_transpose_placeholder_set(
+  // CHECK:         [[MATRIX:%.*]] = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[MT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[MATRIX]], i32 4, i32 3)
+  // CHECK:         call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, <12 x i32>)*)(i8* {{.*}}, i8* {{.*}}, <12 x i32> [[MT]])
+  // CHECK-NEXT:    ret void
+
+  m.value = __builtin_matrix_transpose(*r);
+}

diff  --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c
new file mode 100644
index 000000000000..ac23e562be03
--- /dev/null
+++ b/clang/test/Sema/matrix-type-builtins.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+typedef int ix3x2_t __attribute__((matrix_type(3, 2)));
+typedef double dx3x3 __attribute__((matrix_type(3, 3)));
+typedef unsigned ix3x3 __attribute__((matrix_type(3, 3)));
+
+void transpose(sx5x10_t a, ix3x2_t b, dx3x3 c, int *d, int e) {
+  a = __builtin_matrix_transpose(b);
+  // expected-error at -1 {{assigning to 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  b = __builtin_matrix_transpose(b);
+  // expected-error at -1 {{assigning to 'ix3x2_t' (aka 'int __attribute__((matrix_type(3, 2)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  __builtin_matrix_transpose(d);
+  // expected-error at -1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose(e);
+  // expected-error at -1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose("test");
+  // expected-error at -1 {{first argument must be a matrix}}
+
+  ix3x3 m = __builtin_matrix_transpose(c);
+  // expected-error at -1 {{initializing 'ix3x3' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') with an expression of incompatible type 'double __attribute__((matrix_type(3, 3)))'}}
+}

diff  --git a/clang/test/SemaCXX/matrix-type-builtins.cpp b/clang/test/SemaCXX/matrix-type-builtins.cpp
new file mode 100644
index 000000000000..b4311b357fa6
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-type-builtins.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1>
+typename MyMatrix<EltTy1, R1, C1>::matrix_t transpose(MyMatrix<EltTy0, R0, C0> &A) {
+  char *v1 = __builtin_matrix_transpose(A.value);
+  // expected-error at -1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error at -2 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+  // expected-error at -3 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+
+  __builtin_matrix_transpose(A);
+  // expected-error at -1 {{first argument must be a matrix}}
+  // expected-error at -2 {{first argument must be a matrix}}
+  // expected-error at -3 {{first argument must be a matrix}}
+
+  return __builtin_matrix_transpose(A.value);
+  // expected-error at -1 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error at -2 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+  // expected-error at -3 {{cannot initialize return object of type 'typename MyMatrix<float, 3U, 3U>::matrix_t' (aka 'float __attribute__((matrix_type(3, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+}
+
+void test_transpose_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 3> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  Mat1.value = transpose<unsigned, 2, 3, unsigned, 2, 3>(Mat1);
+  // expected-note at -1 {{in instantiation of function template specialization 'transpose<unsigned int, 2, 3, unsigned int, 2, 3>' requested here}}
+
+  Mat1.value = transpose<unsigned, 3, 3, unsigned, 2, 3>(Mat2);
+  // expected-note at -1 {{in instantiation of function template specialization 'transpose<unsigned int, 3, 3, unsigned int, 2, 3>' requested here}}
+
+  MyMatrix<float, 3, 3> Mat3;
+  Mat3.value = transpose<unsigned, 3, 3, float, 3, 3>(Mat2);
+  // expected-note at -1 {{in instantiation of function template specialization 'transpose<unsigned int, 3, 3, float, 3, 3>' requested here}}
+}

diff  --git a/clang/test/SemaObjC/matrix-type-builtins.m b/clang/test/SemaObjC/matrix-type-builtins.m
new file mode 100644
index 000000000000..bac7008f74e2
--- /dev/null
+++ b/clang/test/SemaObjC/matrix-type-builtins.m
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -fenable-matrix %s
+
+typedef double double4x4 __attribute__((matrix_type(4, 4)));
+typedef unsigned u4x4 __attribute__((matrix_type(4, 4)));
+
+__attribute__((objc_root_class))
+ at interface MatrixValue
+ at property double4x4 value;
+ at end
+
+void test_element_type_mismatch(u4x4 m, MatrixValue *mv) {
+  m = __builtin_matrix_transpose(mv.value);
+  // expected-error at -1 {{assigning to 'u4x4' (aka 'unsigned int __attribute__((matrix_type(4, 4)))') from incompatible type 'double __attribute__((matrix_type(4, 4)))'}}
+}
+
+typedef double double3x3 __attribute__((matrix_type(3, 3)));
+
+double test_dimension_mismatch(double3x3 m, MatrixValue *mv) {
+  m = __builtin_matrix_transpose(mv.value);
+  // expected-error at -1 {{assigning to 'double3x3' (aka 'double __attribute__((matrix_type(3, 3)))') from incompatible type 'double __attribute__((matrix_type(4, 4)))'}}
+}