[llvm] [RISCV] Support LLVM IR intrinsics for XAndesVDot (PR #140223)
Jim Lin via llvm-commits
llvm-commits at lists.llvm.org
Sun May 18 19:21:48 PDT 2025
https://github.com/tclin914 updated https://github.com/llvm/llvm-project/pull/140223
From d145d561a32abe74d2b2a00ec463059547994065 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Fri, 16 May 2025 13:51:13 +0800
Subject: [PATCH 1/3] [RISCV] Support LLVM IR intrinsics for XAndesVDot
This patch adds LLVM IR intrinsic support for XAndesVDot, similar to #139860.
The documentation for the intrinsics can be found at:
https://github.com/andestech/andes-vector-intrinsic-doc/blob/ast-v5_4_0-release-v5/auto-generated/andes-v5/intrinsic_funcs.adoc#andes-vector-dot-product-extensionxandesvdot
and the policy variants at:
https://github.com/andestech/andes-vector-intrinsic-doc/blob/ast-v5_4_0-release-v5/auto-generated/andes-v5/policy_funcs/intrinsic_funcs.adoc#andes-vector-dot-product-extensionxandesvdot
The Clang part will be added in a later patch.
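For reviewers' convenience, here is a minimal IR sketch of how one of the new
unmasked intrinsics is called. It assumes rv64 (iXLen = i64, matching the sed
substitution in the RUN lines); the function name @example and the value names
%acc, %a, %b and %vl are illustrative and not part of this patch, while the
intrinsic name, types and operand order are taken from the tests below:

  declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dots.nxv1i32.nxv4i8.nxv4i8(
    <vscale x 1 x i32>, <vscale x 4 x i8>, <vscale x 4 x i8>, i64, i64)

  define <vscale x 1 x i32> @example(<vscale x 1 x i32> %acc, <vscale x 4 x i8> %a,
                                     <vscale x 4 x i8> %b, i64 %vl) {
    ; Operands: accumulator, source a, source b, AVL, policy. The tests pass
    ; policy 2, which the CHECK lines show lowering to a "tu, ma" vsetvli.
    %r = call <vscale x 1 x i32> @llvm.riscv.nds.vd4dots.nxv1i32.nxv4i8.nxv4i8(
        <vscale x 1 x i32> %acc, <vscale x 4 x i8> %a, <vscale x 4 x i8> %b,
        i64 %vl, i64 2)
    ret <vscale x 1 x i32> %r
  }

The masked variants additionally take a mask operand (e.g. <vscale x 1 x i1>)
before the AVL, as in the .mask declarations in the tests.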
Co-authored-by: Tony Chuan-Yue Yuan <yuan593 at andestech.com>
---
llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td | 5 +
llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 50 +++
.../CodeGen/RISCV/rvv/xandesvdot-vd4dots.ll | 405 ++++++++++++++++++
.../CodeGen/RISCV/rvv/xandesvdot-vd4dotsu.ll | 405 ++++++++++++++++++
.../CodeGen/RISCV/rvv/xandesvdot-vd4dotu.ll | 405 ++++++++++++++++++
5 files changed, 1270 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dots.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotsu.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotu.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td b/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td
index d90fe2cd0e6f3..270066f815d8b 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td
@@ -14,4 +14,9 @@ let TargetPrefix = "riscv" in {
// Andes Vector Packed FP16 Extension
defm nds_vfpmadt : RISCVBinaryAAXRoundingMode;
defm nds_vfpmadb : RISCVBinaryAAXRoundingMode;
+
+ // Andes Vector Dot Product Extension
+ defm nds_vd4dots : RISCVTernaryWide;
+ defm nds_vd4dotu : RISCVTernaryWide;
+ defm nds_vd4dotsu : RISCVTernaryWide;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 6afe88b805d35..4e24a2e062635 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -388,6 +388,21 @@ multiclass VPatVFPMADBinaryV_VX_RM<string intrinsic, string instruction,
}
}
+def VD4DOT_M1 : LMULInfo<0b000, 8, VR, VR, VR, VR, VR, "M1">;
+def VD4DOT_M2 : LMULInfo<0b001, 16, VRM2, VRM2, VR, VR, VR, "M2">;
+def VD4DOT_M4 : LMULInfo<0b010, 32, VRM4, VRM4, VRM2, VR, VR, "M4">;
+def VD4DOT_M8 : LMULInfo<0b011, 64, VRM8, VRM8, VRM4, VRM2, VR, "M8">;
+
+defvar MxListVD4DOT = [V_MF2, VD4DOT_M1, VD4DOT_M2, VD4DOT_M4, VD4DOT_M8];
+
+multiclass VPseudoVD4DOT_VV {
+ foreach m = MxListVD4DOT in {
+ defm "" : VPseudoBinaryV_VV<m>,
+ SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
+ forcePassthruRead=true>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// XAndesPerf
//===----------------------------------------------------------------------===//
@@ -499,3 +514,38 @@ defm PseudoNDS_VFPMADB : VPseudoVFPMAD_VF_RM;
defm : VPatVFPMADBinaryV_VX_RM<"int_riscv_nds_vfpmadt", "PseudoNDS_VFPMADT", AllFP16Vectors>;
defm : VPatVFPMADBinaryV_VX_RM<"int_riscv_nds_vfpmadb", "PseudoNDS_VFPMADB", AllFP16Vectors>;
+
+let Predicates = [HasVendorXAndesVDot] in {
+defm PseudoNDS_VD4DOTS : VPseudoVD4DOT_VV;
+defm PseudoNDS_VD4DOTU : VPseudoVD4DOT_VV;
+defm PseudoNDS_VD4DOTSU : VPseudoVD4DOT_VV;
+}
+
+defset list<VTypeInfoToWide> AllQuadWidenableVD4DOTVectors = {
+ def : VTypeInfoToWide<VI8MF2, VI32MF2>;
+ def : VTypeInfoToWide<VI8M1, VI32M1>;
+ def : VTypeInfoToWide<VI8M2, VI32M2>;
+ def : VTypeInfoToWide<VI8M4, VI32M4>;
+ def : VTypeInfoToWide<VI8M8, VI32M8>;
+ def : VTypeInfoToWide<VI16M1, VI64M1>;
+ def : VTypeInfoToWide<VI16M2, VI64M2>;
+ def : VTypeInfoToWide<VI16M4, VI64M4>;
+ def : VTypeInfoToWide<VI16M8, VI64M8>;
+}
+
+multiclass VPatTernaryVD4DOT_VV<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach vtiToWti = vtilist in {
+ defvar vti = vtiToWti.Vti;
+ defvar wti = vtiToWti.Wti;
+ let Predicates = GetVTypePredicates<wti>.Predicates in
+ defm : VPatTernaryWithPolicy<intrinsic, instruction, "VV",
+ wti.Vector, vti.Vector, vti.Vector,
+ wti.Mask, wti.Log2SEW, vti.LMul,
+ wti.RegClass, vti.RegClass, vti.RegClass>;
+ }
+}
+
+defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dots", "PseudoNDS_VD4DOTS", AllQuadWidenableVD4DOTVectors>;
+defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotu", "PseudoNDS_VD4DOTU", AllQuadWidenableVD4DOTVectors>;
+defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotsu", "PseudoNDS_VD4DOTSU", AllQuadWidenableVD4DOTVectors>;
diff --git a/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dots.ll b/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dots.ll
new file mode 100644
index 0000000000000..bc839899854b5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dots.ll
@@ -0,0 +1,405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64x,+xandesvdot \
+; RUN: -verify-machineinstrs -target-abi=ilp32 | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+xandesvdot \
+; RUN: -verify-machineinstrs -target-abi=lp64 | FileCheck %s
+
+declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dots.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vd4dots_vv_nxv1i32_nxv4i8_nxv4i8(<vscale x 1 x i32> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv1i32_nxv4i8_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i32> @llvm.riscv.nds.vd4dots.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dots.mask.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i8>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vd4dots_mask_vv_nxv1i32_nxv4i8_nxv4i8(<vscale x 1 x i32> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv1i32_nxv4i8_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i32> @llvm.riscv.nds.vd4dots.mask.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i8> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.nds.vd4dots.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vd4dots_vv_nxv2i32_nxv8i8_nxv8i8(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv2i32_nxv8i8_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i32> @llvm.riscv.nds.vd4dots.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.nds.vd4dots.mask.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i8>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vd4dots_mask_vv_nxv2i32_nxv8i8_nxv8i8(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv2i32_nxv8i8_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i32> @llvm.riscv.nds.vd4dots.mask.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i8> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.nds.vd4dots.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vd4dots_vv_nxv4i32_nxv16i8_nxv16i8(<vscale x 4 x i32> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv4i32_nxv16i8_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v10, v12
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i32> @llvm.riscv.nds.vd4dots.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.nds.vd4dots.mask.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i8>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vd4dots_mask_vv_nxv4i32_nxv16i8_nxv16i8(<vscale x 4 x i32> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv4i32_nxv16i8_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i32> @llvm.riscv.nds.vd4dots.mask.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i8> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.nds.vd4dots.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vd4dots_vv_nxv8i32_nxv32i8_nxv32i8(<vscale x 8 x i32> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv8i32_nxv32i8_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v12, v16
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i32> @llvm.riscv.nds.vd4dots.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.nds.vd4dots.mask.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i8>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vd4dots_mask_vv_nxv8i32_nxv32i8_nxv32i8(<vscale x 8 x i32> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv8i32_nxv32i8_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i32> @llvm.riscv.nds.vd4dots.mask.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i8> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.nds.vd4dots.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32>,
+ <vscale x 64 x i8>,
+ <vscale x 64 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vd4dots_vv_nxv16i32_nxv64i8_nxv64i8(<vscale x 16 x i32> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv16i32_nxv64i8_nxv64i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v16, v24
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 16 x i32> @llvm.riscv.nds.vd4dots.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32> %0,
+ <vscale x 64 x i8> %1,
+ <vscale x 64 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.nds.vd4dots.mask.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32>,
+ <vscale x 64 x i8>,
+ <vscale x 64 x i8>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vd4dots_mask_vv_nxv16i32_nxv64i8_nxv64i8(<vscale x 16 x i32> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv16i32_nxv64i8_nxv64i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 16 x i32> @llvm.riscv.nds.vd4dots.mask.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32> %0,
+ <vscale x 64 x i8> %1,
+ <vscale x 64 x i8> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.nds.vd4dots.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64>,
+ <vscale x 4 x i16>,
+ <vscale x 4 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vd4dots_vv_nxv1i64_nxv4i16_nxv4i16(<vscale x 1 x i64> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv1i64_nxv4i16_nxv4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i64> @llvm.riscv.nds.vd4dots.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64> %0,
+ <vscale x 4 x i16> %1,
+ <vscale x 4 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.nds.vd4dots.mask.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64>,
+ <vscale x 4 x i16>,
+ <vscale x 4 x i16>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vd4dots_mask_vv_nxv1i64_nxv4i16_nxv4i16(<vscale x 1 x i64> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv1i64_nxv4i16_nxv4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i64> @llvm.riscv.nds.vd4dots.mask.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64> %0,
+ <vscale x 4 x i16> %1,
+ <vscale x 4 x i16> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.nds.vd4dots.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64>,
+ <vscale x 8 x i16>,
+ <vscale x 8 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vd4dots_vv_nxv2i64_nxv8i16_nxv8i16(<vscale x 2 x i64> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv2i64_nxv8i16_nxv8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v10, v12
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i64> @llvm.riscv.nds.vd4dots.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64> %0,
+ <vscale x 8 x i16> %1,
+ <vscale x 8 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.nds.vd4dots.mask.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64>,
+ <vscale x 8 x i16>,
+ <vscale x 8 x i16>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vd4dots_mask_vv_nxv2i64_nxv8i16_nxv8i16(<vscale x 2 x i64> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv2i64_nxv8i16_nxv8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i64> @llvm.riscv.nds.vd4dots.mask.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64> %0,
+ <vscale x 8 x i16> %1,
+ <vscale x 8 x i16> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.nds.vd4dots.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64>,
+ <vscale x 16 x i16>,
+ <vscale x 16 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vd4dots_vv_nxv4i64_nxv16i16_nxv16i16(<vscale x 4 x i64> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv4i64_nxv16i16_nxv16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v12, v16
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i64> @llvm.riscv.nds.vd4dots.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64> %0,
+ <vscale x 16 x i16> %1,
+ <vscale x 16 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.nds.vd4dots.mask.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64>,
+ <vscale x 16 x i16>,
+ <vscale x 16 x i16>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vd4dots_mask_vv_nxv4i64_nxv16i16_nxv16i16(<vscale x 4 x i64> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv4i64_nxv16i16_nxv16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i64> @llvm.riscv.nds.vd4dots.mask.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64> %0,
+ <vscale x 16 x i16> %1,
+ <vscale x 16 x i16> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.nds.vd4dots.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64>,
+ <vscale x 32 x i16>,
+ <vscale x 32 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vd4dots_vv_nxv8i64_nxv32i16_nxv32i16(<vscale x 8 x i64> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_vv_nxv8i64_nxv32i16_nxv32i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v16, v24
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i64> @llvm.riscv.nds.vd4dots.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64> %0,
+ <vscale x 32 x i16> %1,
+ <vscale x 32 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.nds.vd4dots.mask.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64>,
+ <vscale x 32 x i16>,
+ <vscale x 32 x i16>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vd4dots_mask_vv_nxv8i64_nxv32i16_nxv32i16(<vscale x 8 x i64> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dots_mask_vv_nxv8i64_nxv32i16_nxv32i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
+; CHECK-NEXT: nds.vd4dots.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i64> @llvm.riscv.nds.vd4dots.mask.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64> %0,
+ <vscale x 32 x i16> %1,
+ <vscale x 32 x i16> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 8 x i64> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotsu.ll b/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotsu.ll
new file mode 100644
index 0000000000000..88eb4f297b7af
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotsu.ll
@@ -0,0 +1,405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64x,+xandesvdot \
+; RUN: -verify-machineinstrs -target-abi=ilp32 | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+xandesvdot \
+; RUN: -verify-machineinstrs -target-abi=lp64 | FileCheck %s
+
+declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dotsu.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vd4dotsu_vv_nxv1i32_nxv4i8_nxv4i8(<vscale x 1 x i32> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv1i32_nxv4i8_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i32> @llvm.riscv.nds.vd4dotsu.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i8>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vd4dotsu_mask_vv_nxv1i32_nxv4i8_nxv4i8(<vscale x 1 x i32> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv1i32_nxv4i8_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i8> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.nds.vd4dotsu.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vd4dotsu_vv_nxv2i32_nxv8i8_nxv8i8(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv2i32_nxv8i8_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i32> @llvm.riscv.nds.vd4dotsu.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i8>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vd4dotsu_mask_vv_nxv2i32_nxv8i8_nxv8i8(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv2i32_nxv8i8_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i8> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.nds.vd4dotsu.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vd4dotsu_vv_nxv4i32_nxv16i8_nxv16i8(<vscale x 4 x i32> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv4i32_nxv16i8_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v10, v12
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i32> @llvm.riscv.nds.vd4dotsu.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i8>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vd4dotsu_mask_vv_nxv4i32_nxv16i8_nxv16i8(<vscale x 4 x i32> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv4i32_nxv16i8_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i8> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.nds.vd4dotsu.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vd4dotsu_vv_nxv8i32_nxv32i8_nxv32i8(<vscale x 8 x i32> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv8i32_nxv32i8_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v12, v16
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i32> @llvm.riscv.nds.vd4dotsu.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i8>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vd4dotsu_mask_vv_nxv8i32_nxv32i8_nxv32i8(<vscale x 8 x i32> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv8i32_nxv32i8_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i8> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.nds.vd4dotsu.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32>,
+ <vscale x 64 x i8>,
+ <vscale x 64 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vd4dotsu_vv_nxv16i32_nxv64i8_nxv64i8(<vscale x 16 x i32> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv16i32_nxv64i8_nxv64i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v16, v24
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 16 x i32> @llvm.riscv.nds.vd4dotsu.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32> %0,
+ <vscale x 64 x i8> %1,
+ <vscale x 64 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32>,
+ <vscale x 64 x i8>,
+ <vscale x 64 x i8>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vd4dotsu_mask_vv_nxv16i32_nxv64i8_nxv64i8(<vscale x 16 x i32> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv16i32_nxv64i8_nxv64i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 16 x i32> @llvm.riscv.nds.vd4dotsu.mask.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32> %0,
+ <vscale x 64 x i8> %1,
+ <vscale x 64 x i8> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.nds.vd4dotsu.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64>,
+ <vscale x 4 x i16>,
+ <vscale x 4 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vd4dotsu_vv_nxv1i64_nxv4i16_nxv4i16(<vscale x 1 x i64> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv1i64_nxv4i16_nxv4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i64> @llvm.riscv.nds.vd4dotsu.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64> %0,
+ <vscale x 4 x i16> %1,
+ <vscale x 4 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64>,
+ <vscale x 4 x i16>,
+ <vscale x 4 x i16>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vd4dotsu_mask_vv_nxv1i64_nxv4i16_nxv4i16(<vscale x 1 x i64> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv1i64_nxv4i16_nxv4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64> %0,
+ <vscale x 4 x i16> %1,
+ <vscale x 4 x i16> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.nds.vd4dotsu.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64>,
+ <vscale x 8 x i16>,
+ <vscale x 8 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vd4dotsu_vv_nxv2i64_nxv8i16_nxv8i16(<vscale x 2 x i64> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv2i64_nxv8i16_nxv8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v10, v12
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i64> @llvm.riscv.nds.vd4dotsu.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64> %0,
+ <vscale x 8 x i16> %1,
+ <vscale x 8 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64>,
+ <vscale x 8 x i16>,
+ <vscale x 8 x i16>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vd4dotsu_mask_vv_nxv2i64_nxv8i16_nxv8i16(<vscale x 2 x i64> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv2i64_nxv8i16_nxv8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64> %0,
+ <vscale x 8 x i16> %1,
+ <vscale x 8 x i16> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.nds.vd4dotsu.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64>,
+ <vscale x 16 x i16>,
+ <vscale x 16 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vd4dotsu_vv_nxv4i64_nxv16i16_nxv16i16(<vscale x 4 x i64> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv4i64_nxv16i16_nxv16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v12, v16
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i64> @llvm.riscv.nds.vd4dotsu.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64> %0,
+ <vscale x 16 x i16> %1,
+ <vscale x 16 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64>,
+ <vscale x 16 x i16>,
+ <vscale x 16 x i16>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vd4dotsu_mask_vv_nxv4i64_nxv16i16_nxv16i16(<vscale x 4 x i64> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv4i64_nxv16i16_nxv16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64> %0,
+ <vscale x 16 x i16> %1,
+ <vscale x 16 x i16> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.nds.vd4dotsu.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64>,
+ <vscale x 32 x i16>,
+ <vscale x 32 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vd4dotsu_vv_nxv8i64_nxv32i16_nxv32i16(<vscale x 8 x i64> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_vv_nxv8i64_nxv32i16_nxv32i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v16, v24
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i64> @llvm.riscv.nds.vd4dotsu.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64> %0,
+ <vscale x 32 x i16> %1,
+ <vscale x 32 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64>,
+ <vscale x 32 x i16>,
+ <vscale x 32 x i16>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vd4dotsu_mask_vv_nxv8i64_nxv32i16_nxv32i16(<vscale x 8 x i64> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotsu_mask_vv_nxv8i64_nxv32i16_nxv32i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotsu.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i64> @llvm.riscv.nds.vd4dotsu.mask.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64> %0,
+ <vscale x 32 x i16> %1,
+ <vscale x 32 x i16> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 8 x i64> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotu.ll b/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotu.ll
new file mode 100644
index 0000000000000..3cb5e64ecc85c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/xandesvdot-vd4dotu.ll
@@ -0,0 +1,405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64x,+xandesvdot \
+; RUN: -verify-machineinstrs -target-abi=ilp32 | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+xandesvdot \
+; RUN: -verify-machineinstrs -target-abi=lp64 | FileCheck %s
+
+declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dotu.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vd4dotu_vv_nxv1i32_nxv4i8_nxv4i8(<vscale x 1 x i32> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv1i32_nxv4i8_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i32> @llvm.riscv.nds.vd4dotu.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i8>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vd4dotu_mask_vv_nxv1i32_nxv4i8_nxv4i8(<vscale x 1 x i32> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv1i32_nxv4i8_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv1i32.nxv4i8.nxv4i8(
+ <vscale x 1 x i32> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i8> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.nds.vd4dotu.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vd4dotu_vv_nxv2i32_nxv8i8_nxv8i8(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv2i32_nxv8i8_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i32> @llvm.riscv.nds.vd4dotu.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i8>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vd4dotu_mask_vv_nxv2i32_nxv8i8_nxv8i8(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv2i32_nxv8i8_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv2i32.nxv8i8.nxv8i8(
+ <vscale x 2 x i32> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i8> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.nds.vd4dotu.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vd4dotu_vv_nxv4i32_nxv16i8_nxv16i8(<vscale x 4 x i32> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv4i32_nxv16i8_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v10, v12
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i32> @llvm.riscv.nds.vd4dotu.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i8>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vd4dotu_mask_vv_nxv4i32_nxv16i8_nxv16i8(<vscale x 4 x i32> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv4i32_nxv16i8_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv4i32.nxv16i8.nxv16i8(
+ <vscale x 4 x i32> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i8> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.nds.vd4dotu.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vd4dotu_vv_nxv8i32_nxv32i8_nxv32i8(<vscale x 8 x i32> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv8i32_nxv32i8_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v12, v16
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i32> @llvm.riscv.nds.vd4dotu.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i8>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vd4dotu_mask_vv_nxv8i32_nxv32i8_nxv32i8(<vscale x 8 x i32> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv8i32_nxv32i8_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv8i32.nxv32i8.nxv32i8(
+ <vscale x 8 x i32> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i8> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.nds.vd4dotu.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32>,
+ <vscale x 64 x i8>,
+ <vscale x 64 x i8>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vd4dotu_vv_nxv16i32_nxv64i8_nxv64i8(<vscale x 16 x i32> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv16i32_nxv64i8_nxv64i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v16, v24
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 16 x i32> @llvm.riscv.nds.vd4dotu.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32> %0,
+ <vscale x 64 x i8> %1,
+ <vscale x 64 x i8> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32>,
+ <vscale x 64 x i8>,
+ <vscale x 64 x i8>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vd4dotu_mask_vv_nxv16i32_nxv64i8_nxv64i8(<vscale x 16 x i32> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv16i32_nxv64i8_nxv64i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8r.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 16 x i32> @llvm.riscv.nds.vd4dotu.mask.nxv16i32.nxv64i8.nxv64i8(
+ <vscale x 16 x i32> %0,
+ <vscale x 64 x i8> %1,
+ <vscale x 64 x i8> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.nds.vd4dotu.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64>,
+ <vscale x 4 x i16>,
+ <vscale x 4 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vd4dotu_vv_nxv1i64_nxv4i16_nxv4i16(<vscale x 1 x i64> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv1i64_nxv4i16_nxv4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i64> @llvm.riscv.nds.vd4dotu.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64> %0,
+ <vscale x 4 x i16> %1,
+ <vscale x 4 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64>,
+ <vscale x 4 x i16>,
+ <vscale x 4 x i16>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vd4dotu_mask_vv_nxv1i64_nxv4i16_nxv4i16(<vscale x 1 x i64> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv1i64_nxv4i16_nxv4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 1 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv1i64.nxv4i16.nxv4i16(
+ <vscale x 1 x i64> %0,
+ <vscale x 4 x i16> %1,
+ <vscale x 4 x i16> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.nds.vd4dotu.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64>,
+ <vscale x 8 x i16>,
+ <vscale x 8 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vd4dotu_vv_nxv2i64_nxv8i16_nxv8i16(<vscale x 2 x i64> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv2i64_nxv8i16_nxv8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v10, v12
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i64> @llvm.riscv.nds.vd4dotu.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64> %0,
+ <vscale x 8 x i16> %1,
+ <vscale x 8 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64>,
+ <vscale x 8 x i16>,
+ <vscale x 8 x i16>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vd4dotu_mask_vv_nxv2i64_nxv8i16_nxv8i16(<vscale x 2 x i64> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv2i64_nxv8i16_nxv8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 2 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv2i64.nxv8i16.nxv8i16(
+ <vscale x 2 x i64> %0,
+ <vscale x 8 x i16> %1,
+ <vscale x 8 x i16> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.nds.vd4dotu.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64>,
+ <vscale x 16 x i16>,
+ <vscale x 16 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vd4dotu_vv_nxv4i64_nxv16i16_nxv16i16(<vscale x 4 x i64> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv4i64_nxv16i16_nxv16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v12, v16
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i64> @llvm.riscv.nds.vd4dotu.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64> %0,
+ <vscale x 16 x i16> %1,
+ <vscale x 16 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64>,
+ <vscale x 16 x i16>,
+ <vscale x 16 x i16>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vd4dotu_mask_vv_nxv4i64_nxv16i16_nxv16i16(<vscale x 4 x i64> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv4i64_nxv16i16_nxv16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 4 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv4i64.nxv16i16.nxv16i16(
+ <vscale x 4 x i64> %0,
+ <vscale x 16 x i16> %1,
+ <vscale x 16 x i16> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.nds.vd4dotu.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64>,
+ <vscale x 32 x i16>,
+ <vscale x 32 x i16>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vd4dotu_vv_nxv8i64_nxv32i16_nxv32i16(<vscale x 8 x i64> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_vv_nxv8i64_nxv32i16_nxv32i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v16, v24
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i64> @llvm.riscv.nds.vd4dotu.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64> %0,
+ <vscale x 32 x i16> %1,
+ <vscale x 32 x i16> %2,
+ iXLen %3, iXLen 2)
+ ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64>,
+ <vscale x 32 x i16>,
+ <vscale x 32 x i16>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vd4dotu_mask_vv_nxv8i64_nxv32i16_nxv32i16(<vscale x 8 x i64> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vd4dotu_mask_vv_nxv8i64_nxv32i16_nxv32i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
+; CHECK-NEXT: nds.vd4dotu.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = tail call <vscale x 8 x i64> @llvm.riscv.nds.vd4dotu.mask.nxv8i64.nxv32i16.nxv32i16(
+ <vscale x 8 x i64> %0,
+ <vscale x 32 x i16> %1,
+ <vscale x 32 x i16> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 2)
+ ret <vscale x 8 x i64> %a
+}
From 77fd88bd4a69718330a41e6b4a0d7aac9d39a138 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 19 May 2025 10:05:43 +0800
Subject: [PATCH 2/3] Move VPatTernaryVD4DOT_VV to the multiclass section
---
llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 26 +++++++++----------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 4e24a2e062635..345576db1d7c7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -403,6 +403,19 @@ multiclass VPseudoVD4DOT_VV {
}
}
+multiclass VPatTernaryVD4DOT_VV<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach vtiToWti = vtilist in {
+ defvar vti = vtiToWti.Vti;
+ defvar wti = vtiToWti.Wti;
+ let Predicates = GetVTypePredicates<wti>.Predicates in
+ defm : VPatTernaryWithPolicy<intrinsic, instruction, "VV",
+ wti.Vector, vti.Vector, vti.Vector,
+ wti.Mask, wti.Log2SEW, vti.LMul,
+ wti.RegClass, vti.RegClass, vti.RegClass>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// XAndesPerf
//===----------------------------------------------------------------------===//
@@ -533,19 +546,6 @@ defset list<VTypeInfoToWide> AllQuadWidenableVD4DOTVectors = {
def : VTypeInfoToWide<VI16M8, VI64M8>;
}
-multiclass VPatTernaryVD4DOT_VV<string intrinsic, string instruction,
- list<VTypeInfoToWide> vtilist> {
- foreach vtiToWti = vtilist in {
- defvar vti = vtiToWti.Vti;
- defvar wti = vtiToWti.Wti;
- let Predicates = GetVTypePredicates<wti>.Predicates in
- defm : VPatTernaryWithPolicy<intrinsic, instruction, "VV",
- wti.Vector, vti.Vector, vti.Vector,
- wti.Mask, wti.Log2SEW, vti.LMul,
- wti.RegClass, vti.RegClass, vti.RegClass>;
- }
-}
-
defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dots", "PseudoNDS_VD4DOTS", AllQuadWidenableVD4DOTVectors>;
defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotu", "PseudoNDS_VD4DOTU", AllQuadWidenableVD4DOTVectors>;
defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotsu", "PseudoNDS_VD4DOTSU", AllQuadWidenableVD4DOTVectors>;
From 9a32ed390fc67a8f1a3e2d119ab1d1b71b9c2150 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 19 May 2025 10:06:50 +0800
Subject: [PATCH 3/3] Remove unnecessary LMULInfos
---
llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 345576db1d7c7..158b62fb00659 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -388,15 +388,8 @@ multiclass VPatVFPMADBinaryV_VX_RM<string intrinsic, string instruction,
}
}
-def VD4DOT_M1 : LMULInfo<0b000, 8, VR, VR, VR, VR, VR, "M1">;
-def VD4DOT_M2 : LMULInfo<0b001, 16, VRM2, VRM2, VR, VR, VR, "M2">;
-def VD4DOT_M4 : LMULInfo<0b010, 32, VRM4, VRM4, VRM2, VR, VR, "M4">;
-def VD4DOT_M8 : LMULInfo<0b011, 64, VRM8, VRM8, VRM4, VRM2, VR, "M8">;
-
-defvar MxListVD4DOT = [V_MF2, VD4DOT_M1, VD4DOT_M2, VD4DOT_M4, VD4DOT_M8];
-
multiclass VPseudoVD4DOT_VV {
- foreach m = MxListVD4DOT in {
+ foreach m = [V_MF2, V_M1, V_M2, V_M4, V_M8] in {
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
forcePassthruRead=true>;