[llvm] d457608 - [AArch64][SVE] Add remaining SVE2 intrinsics for widening DSP operations
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 02:36:04 PST 2020
Author: Kerry McLaughlin
Date: 2020-02-18T10:28:00Z
New Revision: d4576080da72fdedff6842a1256fb6f11cee6b3b
URL: https://github.com/llvm/llvm-project/commit/d4576080da72fdedff6842a1256fb6f11cee6b3b
DIFF: https://github.com/llvm/llvm-project/commit/d4576080da72fdedff6842a1256fb6f11cee6b3b.diff
LOG: [AArch64][SVE] Add remaining SVE2 intrinsics for widening DSP operations
Summary:
Implements the following intrinsics:
- llvm.aarch64.sve.[s|u]mullb_lane
- llvm.aarch64.sve.[s|u]mullt_lane
- llvm.aarch64.sve.sqdmullb_lane
- llvm.aarch64.sve.sqdmullt_lane
- llvm.aarch64.sve.[s|u]addwb
- llvm.aarch64.sve.[s|u]addwt
- llvm.aarch64.sve.[s|u]shllb
- llvm.aarch64.sve.[s|u]shllt
- llvm.aarch64.sve.[s|u]subwb
- llvm.aarch64.sve.[s|u]subwt
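The unpredicated wide forms take one wide and one narrow vector operand, while the shift-left-long forms take a single narrow vector plus an immediate shift amount. A minimal LLVM IR sketch for two of these, using declarations that appear in the test diff below (%a and %b are placeholder values):

  declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
  declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16>, i32)

  ; Sign-extend the even (bottom) i16 elements of %b and add them to %a;
  ; lowers to: saddwb z0.s, z0.s, z1.h
  %sum = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32> %a,
                                                                  <vscale x 8 x i16> %b)

  ; Sign-extend the even (bottom) i16 elements of %b, then shift left by an
  ; immediate (the shift amount is an ImmArg, so it must be a constant);
  ; lowers to: sshllb z0.s, z0.h, #1
  %ext = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16> %b, i32 1)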
Reviewers: sdesmalen, dancgr, efriedma, c-rhodes, rengolin
Reviewed By: sdesmalen
Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73903
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ba1cba2a8c4d..f5c202910859 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1053,12 +1053,31 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;
+ class SVE2_1VectorArg_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMSubdivide2VectorType<0>,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<1>]>;
+
class SVE2_2VectorArg_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>,
LLVMSubdivide2VectorType<0>],
[IntrNoMem]>;
+ class SVE2_2VectorArgIndexed_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMSubdivide2VectorType<0>,
+ LLVMSubdivide2VectorType<0>,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<2>]>;
+
+ class SVE2_2VectorArg_Wide_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMSubdivide2VectorType<0>],
+ [IntrNoMem]>;
+
class SVE2_2VectorArg_Pred_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1766,28 +1785,34 @@ def int_aarch64_sve_usra : AdvSIMD_2VectorArgIndexed_Intrinsic;
// SVE2 - Widening DSP operations
//
-def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ushllt : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic;
//
// SVE2 - Non-widening pairwise arithmetic
@@ -1933,10 +1958,16 @@ def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
+def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
+def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
// SVE2 MLA Unpredicated.
def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic;
@@ -1947,11 +1978,17 @@ def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e188fa4e2fce..1fa7b827941a 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1475,14 +1475,14 @@ let Predicates = [HasSVE2] in {
defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;
// SVE2 integer multiply long (indexed)
- defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
- defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
- defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
- defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;
+ defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
+ defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
+ defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
+ defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;
// SVE2 saturating multiply (indexed)
- defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
- defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;
+ defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
+ defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;
// SVE2 integer multiply-add long (indexed)
defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
@@ -1593,14 +1593,14 @@ let Predicates = [HasSVE2] in {
defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;
// SVE2 integer add/subtract wide
- defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
- defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
- defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
- defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
- defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
- defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
- defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
- defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;
+ defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
+ defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
+ defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
+ defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
+ defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
+ defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
+ defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
+ defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
@@ -1693,10 +1693,10 @@ let Predicates = [HasSVE2] in {
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;
// SVE2 bitwise shift left long
- defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
- defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
- defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
- defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;
+ defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
+ defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
+ defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
+ defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;
// SVE2 integer add/subtract interleaved long
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index fe388c48b6ee..c2d4b1b5f533 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2731,9 +2731,10 @@ multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm,
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
+multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
- ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
@@ -2741,13 +2742,16 @@ multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
let Inst{11} = iop{0};
}
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
- ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2894,10 +2898,15 @@ multiclass sve2_wide_int_arith_long<bits<5> opc, string asm,
def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
+multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_pmul_long<bits<1> opc, string asm> {
@@ -2992,7 +3001,8 @@ class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
+multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
@@ -3003,6 +3013,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
+ def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
index 0c98614b7c41..b90c19e02dd4 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
@@ -192,6 +192,69 @@ define <vscale x 2 x i64> @saddlt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SADDWB
+;
+
+define <vscale x 8 x i16> @saddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: saddwb_b:
+; CHECK: saddwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @saddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: saddwb_h:
+; CHECK: saddwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @saddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: saddwb_s:
+; CHECK: saddwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SADDWT
+;
+
+define <vscale x 8 x i16> @saddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: saddwt_b:
+; CHECK: saddwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @saddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: saddwt_h:
+; CHECK: saddwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @saddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: saddwt_s:
+; CHECK: saddwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+
;
; SMULLB (Vectors)
;
@@ -223,6 +286,30 @@ define <vscale x 2 x i64> @smullb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SMULLB (Indexed)
+;
+
+define <vscale x 4 x i32> @smullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: smullb_lane_h:
+; CHECK: smullb z0.s, z0.h, z1.h[4]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 4)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: smullb_lane_s:
+; CHECK: smullb z0.d, z0.s, z1.s[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 3)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SMULLT (Vectors)
;
@@ -254,6 +341,30 @@ define <vscale x 2 x i64> @smullt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SMULLT (Indexed)
+;
+
+define <vscale x 4 x i32> @smullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: smullt_lane_h:
+; CHECK: smullt z0.s, z0.h, z1.h[5]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 5)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: smullt_lane_s:
+; CHECK: smullt z0.d, z0.s, z1.s[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 2)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SQDMULLB (Vectors)
;
@@ -285,6 +396,30 @@ define <vscale x 2 x i64> @sqdmullb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32>
ret <vscale x 2 x i64> %out
}
+;
+; SQDMULLB (Indexed)
+;
+
+define <vscale x 4 x i32> @sqdmullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqdmullb_lane_h:
+; CHECK: sqdmullb z0.s, z0.h, z1.h[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 2)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sqdmullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqdmullb_lane_s:
+; CHECK: sqdmullb z0.d, z0.s, z1.s[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 1)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SQDMULLT (Vectors)
;
@@ -316,6 +451,30 @@ define <vscale x 2 x i64> @sqdmullt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32>
ret <vscale x 2 x i64> %out
}
+;
+; SQDMULLT (Indexed)
+;
+
+define <vscale x 4 x i32> @sqdmullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqdmullt_lane_h:
+; CHECK: sqdmullt z0.s, z0.h, z1.h[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 3)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sqdmullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqdmullt_lane_s:
+; CHECK: sqdmullt z0.d, z0.s, z1.s[0]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 0)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SSUBLB
;
@@ -347,6 +506,62 @@ define <vscale x 2 x i64> @ssublb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SSHLLB
+;
+
+define <vscale x 8 x i16> @sshllb_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: sshllb_b:
+; CHECK: sshllb z0.h, z0.b, #0
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8> %a, i32 0)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sshllb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sshllb_h:
+; CHECK: sshllb z0.s, z0.h, #1
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16> %a, i32 1)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sshllb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sshllb_s:
+; CHECK: sshllb z0.d, z0.s, #2
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32> %a, i32 2)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SSHLLT
+;
+
+define <vscale x 8 x i16> @sshllt_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: sshllt_b:
+; CHECK: sshllt z0.h, z0.b, #3
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8> %a, i32 3)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sshllt_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sshllt_h:
+; CHECK: sshllt z0.s, z0.h, #4
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16> %a, i32 4)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sshllt_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sshllt_s:
+; CHECK: sshllt z0.d, z0.s, #5
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32> %a, i32 5)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SSUBLT
;
@@ -378,6 +593,68 @@ define <vscale x 2 x i64> @ssublt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SSUBWB
+;
+
+define <vscale x 8 x i16> @ssubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: ssubwb_b:
+; CHECK: ssubwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ssubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: ssubwb_h:
+; CHECK: ssubwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ssubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: ssubwb_s:
+; CHECK: ssubwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SSUBWT
+;
+
+define <vscale x 8 x i16> @ssubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: ssubwt_b:
+; CHECK: ssubwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ssubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: ssubwt_h:
+; CHECK: ssubwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ssubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: ssubwt_s:
+; CHECK: ssubwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
;
; UABALB
;
@@ -570,6 +847,68 @@ define <vscale x 2 x i64> @uaddlt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; UADDWB
+;
+
+define <vscale x 8 x i16> @uaddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uaddwb_b:
+; CHECK: uaddwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uaddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uaddwb_h:
+; CHECK: uaddwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uaddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uaddwb_s:
+; CHECK: uaddwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; UADDWT
+;
+
+define <vscale x 8 x i16> @uaddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uaddwt_b:
+; CHECK: uaddwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uaddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uaddwt_h:
+; CHECK: uaddwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uaddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uaddwt_s:
+; CHECK: uaddwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
;
; UMULLB (Vectors)
;
@@ -601,6 +940,31 @@ define <vscale x 2 x i64> @umullb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; UMULLB (Indexed)
+;
+
+define <vscale x 4 x i32> @umullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: umullb_lane_h:
+; CHECK: umullb z0.s, z0.h, z1.h[0]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 0)
+ ret <vscale x 4 x i32> %out
+}
+
+
+define <vscale x 2 x i64> @umullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: umullb_lane_s:
+; CHECK: umullb z0.d, z0.s, z1.s[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 3)
+ ret <vscale x 2 x i64> %out
+}
+
;
; UMULLT (Vectors)
;
@@ -632,6 +996,86 @@ define <vscale x 2 x i64> @umullt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; UMULLT (Indexed)
+;
+
+define <vscale x 4 x i32> @umullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: umullt_lane_h:
+; CHECK: umullt z0.s, z0.h, z1.h[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 1)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @umullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: umullt_lane_s:
+; CHECK: umullt z0.d, z0.s, z1.s[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 2)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; USHLLB
+;
+
+define <vscale x 8 x i16> @ushllb_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: ushllb_b:
+; CHECK: ushllb z0.h, z0.b, #6
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8> %a, i32 6)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ushllb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: ushllb_h:
+; CHECK: ushllb z0.s, z0.h, #7
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16> %a, i32 7)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ushllb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: ushllb_s:
+; CHECK: ushllb z0.d, z0.s, #8
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32> %a, i32 8)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; USHLLT
+;
+
+define <vscale x 8 x i16> @ushllt_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: ushllt_b:
+; CHECK: ushllt z0.h, z0.b, #7
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8> %a, i32 7)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ushllt_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: ushllt_h:
+; CHECK: ushllt z0.s, z0.h, #15
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16> %a, i32 15)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ushllt_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: ushllt_s:
+; CHECK: ushllt z0.d, z0.s, #31
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32> %a, i32 31)
+ ret <vscale x 2 x i64> %out
+}
+
;
; USUBLB
;
@@ -694,6 +1138,68 @@ define <vscale x 2 x i64> @usublt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; USUBWB
+;
+
+define <vscale x 8 x i16> @usubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: usubwb_b:
+; CHECK: usubwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @usubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: usubwb_h:
+; CHECK: usubwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @usubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: usubwb_s:
+; CHECK: usubwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; USUBWT
+;
+
+define <vscale x 8 x i16> @usubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: usubwt_b:
+; CHECK: usubwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @usubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: usubwt_h:
+; CHECK: usubwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @usubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: usubwt_s:
+; CHECK: usubwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.sabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -718,22 +1224,50 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.saddlt.nxv8i16(<vscale x 16 x i8>,
declare <vscale x 4 x i32> @llvm.aarch64.sve.saddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.saddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.smullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.smullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.ssublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -742,6 +1276,14 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.ssublt.nxv8i16(<vscale x 16 x i8>,
declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.uabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.uabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -766,14 +1308,36 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddlt.nxv8i16(<vscale x 16 x i8>,
declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.umullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.umullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.usublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.usublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.usublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -781,3 +1345,11 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.usublb.nxv2i64(<vscale x 4 x i32>,
declare <vscale x 8 x i16> @llvm.aarch64.sve.usublt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.usublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.usublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)