[llvm] 832cd74 - [AArch64] Armv8.6-a Matrix Mult Assembly + Intrinsics

Luke Geeson via llvm-commits <llvm-commits at lists.llvm.org>
Fri Apr 24 07:54:25 PDT 2020


Author: Luke Geeson
Date: 2020-04-24T15:54:06+01:00
New Revision: 832cd749131b1fa59d12486325f19e16eb392a42

URL: https://github.com/llvm/llvm-project/commit/832cd749131b1fa59d12486325f19e16eb392a42
DIFF: https://github.com/llvm/llvm-project/commit/832cd749131b1fa59d12486325f19e16eb392a42.diff

LOG: [AArch64] Armv8.6-a Matrix Mult Assembly + Intrinsics

This patch upstreams support for the Armv8.6-a Matrix Multiplication
Extension. A summary of the features can be found here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

This patch includes:

- Assembly support for AArch64 only (no SVE or Neon)
- Intrinsics support for the AArch64 Armv8.6-a matrix multiplication instructions (no bfloat16 matrix multiplication)

No new IR types or C types are needed for this extension.
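
As an illustration only (not part of the patch), here is a minimal C sketch
of the new intrinsics, assuming a compiler that carries this patch and a
target with the i8mm feature enabled; the wrapper function names are
placeholders:

  #include <arm_neon.h>

  #ifdef __ARM_FEATURE_MATMUL_INT8
  /* SMMLA: multiply the 2x8 matrix of signed bytes in a by the 8x2 matrix
     of signed bytes in b, accumulating the 2x2 i32 result into acc. */
  int32x4_t mmla_s8(int32x4_t acc, int8x16_t a, int8x16_t b) {
    return vmmlaq_s32(acc, a, b);
  }

  /* USDOT: per 32-bit lane, dot product of four unsigned bytes from a with
     four signed bytes from b, added to acc. */
  int32x2_t usdot_us8(int32x2_t acc, uint8x8_t a, int8x8_t b) {
    return vusdot_s32(acc, a, b);
  }
  #endif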

This is part of a patch series that starts with BFloat16 support and covers
the other components of the Armv8.6-a extension (see the earlier patches
linked in Phabricator).
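
One detail worth noting from the changes below: there is no separate sudot
intrinsic in IR; the vsudot_lane/vsudot_laneq forms splat the indexed lane
and reuse the usdot intrinsic with its operands swapped. Continuing the
sketch above (again a hedged example, not from the patch):

  #ifdef __ARM_FEATURE_MATMUL_INT8
  /* SUDOT (by element): per 32-bit lane, dot product of four signed bytes
     from a with the four unsigned bytes selected by lane 0 of b. */
  int32x2_t sudot_lane0(int32x2_t acc, int8x8_t a, uint8x8_t b) {
    return vsudot_lane_s32(acc, a, b, 0);
  }
  #endif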

Based on work by:
- Luke Geeson
- Oliver Stannard
- Luke Cheeseman

Reviewers: ostannard, t.p.northover, rengolin, kmclaughlin

Reviewed By: kmclaughlin

Subscribers: kmclaughlin, kristof.beyls, hiraditya, danielkiss,
cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77871

Added: 
    clang/test/CodeGen/aarch64-matmul.cpp
    clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
    llvm/test/CodeGen/AArch64/aarch64-matmul.ll
    llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s
    llvm/test/MC/AArch64/armv8.6a-simd-matmul.s
    llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/lib/Basic/Targets/AArch64.cpp
    clang/lib/Basic/Targets/AArch64.h
    clang/lib/CodeGen/CGBuiltin.cpp
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64.td
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/AArch64Subtarget.h

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index f949edc378fc..82e44aaec69b 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -221,6 +221,21 @@ def OP_FMLAL_LN_Hi  : Op<(call "vfmlal_high", $p0, $p1,
 def OP_FMLSL_LN_Hi  : Op<(call "vfmlsl_high", $p0, $p1,
                            (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
 
+def OP_USDOT_LN
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>;
+def OP_USDOT_LNQ
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>;
+
+// sudot splats the second vector and then calls vusdot
+def OP_SUDOT_LN
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;
+def OP_SUDOT_LNQ
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;
+
 //===----------------------------------------------------------------------===//
 // Auxiliary Instructions
 //===----------------------------------------------------------------------===//
@@ -1792,6 +1807,23 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
   }
 }
 
+let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in {
+  def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
+  def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
+
+  def VUSDOT  : SInst<"vusdot", "..(<<U)(<<)", "iQi">;
+
+  def VUSDOT_LANE  : SOpInst<"vusdot_lane", "..(<<U)(<<q)I", "iQi", OP_USDOT_LN>;
+  def VSUDOT_LANE  : SOpInst<"vsudot_lane", "..(<<)(<<qU)I", "iQi", OP_SUDOT_LN>;
+
+  let ArchGuard = "defined(__aarch64__)" in {
+    let isLaneQ = 1 in {
+      def VUSDOT_LANEQ  : SOpInst<"vusdot_laneq", "..(<<U)(<<Q)I", "iQi", OP_USDOT_LNQ>;
+      def VSUDOT_LANEQ  : SOpInst<"vsudot_laneq", "..(<<)(<<QU)I", "iQi", OP_SUDOT_LNQ>;
+    }
+  }
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
@@ -1808,4 +1840,4 @@ let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
-}
\ No newline at end of file
+}

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 8ceb7f2b515e..5357d31ee64c 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -280,6 +280,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasTME)
     Builder.defineMacro("__ARM_FEATURE_TME", "1");
 
+  if (HasMatMul)
+    Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1");
+
   if ((FPU & NeonMode) && HasFP16FML)
     Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
 
@@ -356,6 +359,7 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
   HasFP16FML = false;
   HasMTE = false;
   HasTME = false;
+  HasMatMul = false;
   ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   for (const auto &Feature : Features) {
@@ -391,6 +395,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasMTE = true;
     if (Feature == "+tme")
       HasTME = true;
+    if (Feature == "+i8mm")
+      HasMatMul = true;
   }
 
   setDataLayout();

diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index bc73d195b7d4..39763b5b5609 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -36,6 +36,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasFP16FML;
   bool HasMTE;
   bool HasTME;
+  bool HasMatMul;
 
   llvm::AArch64::ArchKind ArchKind;
 

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3048c6441ba2..80b27b629d1e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5009,6 +5009,7 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
   NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
   NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
+  NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0),
   NEONMAP0(vmovl_v),
   NEONMAP0(vmovn_v),
   NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
@@ -5091,6 +5092,9 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP0(vsubhn_v),
   NEONMAP0(vtst_v),
   NEONMAP0(vtstq_v),
+  NEONMAP1(vusdot_v, aarch64_neon_usdot, 0),
+  NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0),
+  NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0),
 };
 
 static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
@@ -6076,6 +6080,26 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
   }
+  case NEON::BI__builtin_neon_vmmlaq_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla");
+  }
+  case NEON::BI__builtin_neon_vusmmlaq_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
+  }
+  case NEON::BI__builtin_neon_vusdot_v:
+  case NEON::BI__builtin_neon_vusdotq_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
+  }
   }
 
   assert(Int && "Expected valid intrinsic number");

diff --git a/clang/test/CodeGen/aarch64-matmul.cpp b/clang/test/CodeGen/aarch64-matmul.cpp
new file mode 100644
index 000000000000..f9206e2d048e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-matmul.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1             -triple aarch64-eabi -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+extern "C" void arm_feature_matmulint8_defined() {}
+#endif
+// CHECK: define void @arm_feature_matmulint8_defined()
+
+

diff --git a/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
new file mode 100644
index 000000000000..f876a01460cf
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
@@ -0,0 +1,147 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg -sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
+  return vmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vmmlaq_u32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
+  return vmmlaq_u32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdot_s32
+// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
+// CHECK: ret <2 x i32> [[VAL]]
+int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
+  return vusdot_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdot_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
+  return vusdot_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudot_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %0 to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> %1 to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
+  return vsudot_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vusdot_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
+  return vusdot_laneq_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudot_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
+  return vsudot_laneq_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vusdotq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusdotq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdotq_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
+  return vusdotq_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudotq_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
+  return vsudotq_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vusdotq_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusdotq_laneq_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudotq_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
+  return vsudotq_laneq_s32(r, a, b, 0);
+}

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index fd2e3739e62d..4a23d3ff77f7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -173,6 +173,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
+
+  class AdvSIMD_MatMul_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -449,6 +454,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
 
+// v8.6-A Matrix Multiply Intrinsics
+  def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
+
   // v8.2-A FP16 Fused Multiply-Add Long
   def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
   def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 4bb88a957a9b..d2ab0ff898aa 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -373,6 +373,15 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
 def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
     "true", "Enable BFloat16 Extension" >;
 
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+    "true", "Enable Matrix Multiply Int8 Extension">;
+
+def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
+    "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
+
+def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
+    "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+
 def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
     "true", "Enable fine grained virtualization traps extension">;
 
@@ -380,7 +389,6 @@ def FeatureEnhancedCounterVirtualization :
       SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
       "true", "Enable enhanced counter virtualization extension">;
 
-
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -413,7 +421,7 @@ def HasV8_6aOps : SubtargetFeature<
   "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
 
   [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
-   FeatureEnhancedCounterVirtualization]>;
+   FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1743b0c68eb7..757f2cde49ca 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5550,11 +5550,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
 
 // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
 // bytes from S-sized elements.
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
                                  string kind2, RegisterOperand RegType,
                                  ValueType AccumType, ValueType InputType,
                                  SDPatternOperator OpNode> :
-        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+        BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
         [(set (AccumType RegType:$dst),
               (OpNode (AccumType RegType:$Rd),
                       (InputType RegType:$Rn),
@@ -5562,10 +5562,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
   let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
 }
 
-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
                                          v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
                                          v4i32, v16i8, OpNode>;
 }
 
@@ -7903,13 +7903,26 @@ class BF16ToSinglePrecision<string asm>
 }
 } // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
 
+//----------------------------------------------------------------------------
+// Armv8.6 Matrix Multiply Extension
+//----------------------------------------------------------------------------
+
+class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
+  : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
+              [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+                                               (v16i8 V128:$Rn),
+                                               (v16i8 V128:$Rm)))]> {
+  let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
+}
+
+//----------------------------------------------------------------------------
 // ARMv8.2-A Dot Product Instructions (Indexed)
-class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
-                                      string lhs_kind, string rhs_kind,
+class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
+                                      string dst_kind, string lhs_kind, string rhs_kind,
                                       RegisterOperand RegType,
                                       ValueType AccumType, ValueType InputType,
                                       SDPatternOperator OpNode> :
-        BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+        BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
                             VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
         [(set (AccumType RegType:$dst),
               (AccumType (OpNode (AccumType RegType:$Rd),
@@ -7922,11 +7935,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
   let Inst{11}    = idx{1};  // H
 }
 
-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
                                        SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
                                               V64, v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
                                               V128, v4i32, v16i8, OpNode>;
 }
 

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f634642a3b59..91af6c79a427 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -146,6 +146,12 @@ def HasTRBE          : Predicate<"Subtarget->hasTRBE()">,
                        AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
 def HasBF16          : Predicate<"Subtarget->hasBF16()">,
                        AssemblerPredicate<(all_of FeatureBF16), "bf16">;
+def HasMatMulInt8    : Predicate<"Subtarget->hasMatMulInt8()">,
+                       AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
+def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">,
+                       AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
+def HasMatMulFP64    : Predicate<"Subtarget->hasMatMulFP64()">,
+                       AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;
@@ -745,10 +751,10 @@ def TSB   : CRmSystemI<barrier_op, 0b010, "tsb", []> {
 
 // ARMv8.2-A Dot Product
 let Predicates = [HasDotProd] in {
-defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
-defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
+defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
 }
 
 // ARMv8.6-A BFloat
@@ -765,6 +771,40 @@ def BFCVTN2      : SIMD_BFCVTN2;
 def BFCVT        : BF16ToSinglePrecision<"bfcvt">;
 }
 
+// ARMv8.6A AArch64 matrix multiplication
+let Predicates = [HasMatMulInt8] in {
+def  SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
+def  UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
+def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
+defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
+defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
+
+// sudot lane has a pattern where usdot is expected (there is no sudot).
+// The second operand is used in the dup operation to repeat the indexed
+// element.
+class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
+                         string rhs_kind, RegisterOperand RegType,
+                         ValueType AccumType, ValueType InputType>
+      : BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind,
+                                        lhs_kind, rhs_kind, RegType, AccumType,
+                                        InputType, null_frag> {
+  let Pattern = [(set (AccumType RegType:$dst),
+                      (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
+                                 (InputType (bitconvert (AccumType
+                                    (AArch64duplane32 (v4i32 V128:$Rm),
+                                        VectorIndexS:$idx)))),
+                                 (InputType RegType:$Rn))))];
+}
+
+multiclass SIMDSUDOTIndex {
+  def v8i8  : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
+  def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
+}
+
+defm SUDOTlane : SIMDSUDOTIndex;
+
+}
+
 // ARMv8.2-A FP16 Fused Multiply-Add Long
 let Predicates = [HasNEON, HasFP16FML] in {
 defm FMLAL      : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index c18afcc1140c..dab5fe3626d9 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -148,6 +148,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
 
   // Armv8.6-A Extensions
   bool HasBF16 = false;
+  bool HasMatMulInt8 = false;
+  bool HasMatMulFP32 = false;
+  bool HasMatMulFP64 = false;
   bool HasAMVS = false;
   bool HasFineGrainedTraps = false;
   bool HasEnhancedCounterVirtualization = false;
@@ -417,6 +420,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool hasSVE2SM4() const { return HasSVE2SM4; }
   bool hasSVE2SHA3() const { return HasSVE2SHA3; }
   bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
+  bool hasMatMulInt8() const { return HasMatMulInt8; }
+  bool hasMatMulFP32() const { return HasMatMulFP32; }
+  bool hasMatMulFP64() const { return HasMatMulFP64; }
 
   // Armv8.6-A Extensions
   bool hasBF16() const { return HasBF16; }

diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
new file mode 100644
index 000000000000..649d0a9bfcab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -0,0 +1,136 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+
+define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: smmla.v4i32.v16i8
+; CHECK: smmla   v0.4s, v1.16b, v2.16b
+  %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x i32> %vmmla1.i
+}
+
+define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: ummla.v4i32.v16i8
+; CHECK: ummla   v0.4s, v1.16b, v2.16b
+  %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x i32> %vmmla1.i
+}
+
+define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usmmla.v4i32.v16i8
+; CHECK: usmmla   v0.4s, v1.16b, v2.16b
+  %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x i32> %vusmmla1.i
+}
+
+define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot.v2i32.v8i8
+; CHECK: usdot   v0.2s, v1.8b, v2.8b
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v2i32.v8i8
+; CHECK: usdot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_lane.v2i32.v8i8
+; CHECK: sudot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v2i32.v16i8
+; CHECK: usdot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_lane.v2i32.v16i8
+; CHECK: sudot   v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3
+  ret <2 x i32> %vusdot1.i
+}
+
+define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot.v4i32.v16i8
+; CHECK: usdot   v0.4s, v1.16b, v2.16b
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v4i32.v16i8
+; CHECK: usdot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_lane.v4i32.v16i8
+; CHECK: sudot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_laneq.v4i32.v16i8
+; CHECK: usdot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: sudot_laneq.v4i32.v16i8
+; CHECK: sudot   v0.4s, v1.16b, v2.4b[0]
+  %0 = bitcast <16 x i8> %b to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3
+  ret <4 x i32> %vusdot1.i
+}
+
+declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
+declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+

diff --git a/llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s b/llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s
new file mode 100644
index 000000000000..7a1e9a4de483
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s
@@ -0,0 +1,34 @@
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm       < %s 2>&1 | FileCheck %s
+
+// No interesting edge cases for [US]MMLA, except for the fact that the data
+// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not.
+smmla  v1.2s, v16.8b, v31.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+summla v1.4s, v16.16b, v31.16b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla?
+
+// USDOT (vector) has two valid data type combinations, others are rejected.
+usdot v3.4s, v15.8b, v30.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+usdot v3.2s, v15.16b, v30.16b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types)
+usdot v31.2s, v1.8b,  v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+usdot v31.4s, v1.16b, v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+sudot v31.2s, v1.8b,  v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+sudot v31.4s, v1.16b, v2.4b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+
+// The arrangement specifiers of the first two operands must match.
+usdot v31.4s, v1.8b,  v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+usdot v31.2s, v1.16b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+sudot v31.4s, v1.8b,  v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+sudot v31.2s, v1.16b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

diff --git a/llvm/test/MC/AArch64/armv8.6a-simd-matmul.s b/llvm/test/MC/AArch64/armv8.6a-simd-matmul.s
new file mode 100644
index 000000000000..dab4ff3e14f5
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv8.6a-simd-matmul.s
@@ -0,0 +1,43 @@
+// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm       < %s      | FileCheck %s
+// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a      < %s      | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL
+
+smmla  v1.4s, v16.16b, v31.16b
+ummla  v1.4s, v16.16b, v31.16b
+usmmla v1.4s, v16.16b, v31.16b
+// CHECK: smmla   v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e]
+// CHECK: ummla   v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e]
+// CHECK: usmmla  v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: smmla  v1.4s, v16.16b, v31.16b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: ummla  v1.4s, v16.16b, v31.16b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usmmla  v1.4s, v16.16b, v31.16b
+
+usdot v3.2s, v15.8b, v30.8b
+usdot v3.4s, v15.16b, v30.16b
+// CHECK: usdot   v3.2s, v15.8b, v30.8b   // encoding: [0xe3,0x9d,0x9e,0x0e]
+// CHECK: usdot   v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b
+
+usdot v31.2s, v1.8b,  v2.4b[3]
+usdot v31.4s, v1.16b, v2.4b[3]
+// CHECK: usdot   v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f]
+// CHECK: usdot   v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot   v31.2s, v1.8b, v2.4b[3]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot   v31.4s, v1.16b, v2.4b[3]
+
+sudot v31.2s, v1.8b,  v2.4b[3]
+sudot v31.4s, v1.16b, v2.4b[3]
+// CHECK: sudot   v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f]
+// CHECK: sudot   v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: sudot   v31.2s, v1.8b, v2.4b[3]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: sudot   v31.4s, v1.16b, v2.4b[3]

diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt b/llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
new file mode 100644
index 000000000000..2e14fa649f4b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
@@ -0,0 +1,34 @@
+# RUN:     llvm-mc -triple=aarch64  -mattr=+i8mm -disassemble < %s       | FileCheck %s
+# RUN:     llvm-mc -triple=aarch64  -mattr=+v8.6a -disassemble < %s      | FileCheck %s
+# RUN: not llvm-mc -triple=aarch64  -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM
+
+[0x01,0xa6,0x9f,0x4e]
+[0x01,0xa6,0x9f,0x6e]
+[0x01,0xae,0x9f,0x4e]
+# CHECK: smmla   v1.4s, v16.16b, v31.16b
+# CHECK: ummla   v1.4s, v16.16b, v31.16b
+# CHECK: usmmla  v1.4s, v16.16b, v31.16b
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+
+[0xe3,0x9d,0x9e,0x0e]
+[0xe3,0x9d,0x9e,0x4e]
+# CHECK: usdot   v3.2s, v15.8b, v30.8b
+# CHECK: usdot   v3.4s, v15.16b, v30.16b
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+
+[0x3f,0xf8,0xa2,0x0f]
+[0x3f,0xf8,0xa2,0x4f]
+# CHECK: usdot   v31.2s, v1.8b, v2.4b[3]
+# CHECK: usdot   v31.4s, v1.16b, v2.4b[3]
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+
+[0x3f,0xf8,0x22,0x0f]
+[0x3f,0xf8,0x22,0x4f]
+# CHECK: sudot   v31.2s, v1.8b, v2.4b[3]
+# CHECK: sudot   v31.4s, v1.16b, v2.4b[3]
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding

