[llvm] d457608 - [AArch64][SVE] Add remaining SVE2 intrinsics for widening DSP operations
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 02:36:04 PST 2020
Author: Kerry McLaughlin
Date: 2020-02-18T10:28:00Z
New Revision: d4576080da72fdedff6842a1256fb6f11cee6b3b
URL: https://github.com/llvm/llvm-project/commit/d4576080da72fdedff6842a1256fb6f11cee6b3b
DIFF: https://github.com/llvm/llvm-project/commit/d4576080da72fdedff6842a1256fb6f11cee6b3b.diff
LOG: [AArch64][SVE] Add remaining SVE2 intrinsics for widening DSP operations
Summary:
Implements the following intrinsics:
- llvm.aarch64.sve.[s|u]mullb_lane
- llvm.aarch64.sve.[s|u]mullt_lane
- llvm.aarch64.sve.sqdmullb_lane
- llvm.aarch64.sve.sqdmullt_lane
- llvm.aarch64.sve.[s|u]addwb
- llvm.aarch64.sve.[s|u]addwt
- llvm.aarch64.sve.[s|u]shllb
- llvm.aarch64.sve.[s|u]shllt
- llvm.aarch64.sve.[s|u]subwb
- llvm.aarch64.sve.[s|u]subwt
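The unpredicated wide forms take one wide and one narrow vector operand, while the shift-left-long forms take a single narrow vector plus an immediate shift amount. A minimal LLVM IR sketch for two of these, using declarations that appear in the test diff below (%a and %b are placeholder values):

  declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
  declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16>, i32)

  ; Sign-extend the even (bottom) i16 elements of %b and add them to %a;
  ; lowers to: saddwb z0.s, z0.s, z1.h
  %sum = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32> %a,
                                                                  <vscale x 8 x i16> %b)

  ; Sign-extend the even (bottom) i16 elements of %b, then shift left by an
  ; immediate (the shift amount is an ImmArg, so it must be a constant);
  ; lowers to: sshllb z0.s, z0.h, #1
  %ext = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16> %b, i32 1)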
Reviewers: sdesmalen, dancgr, efriedma, c-rhodes, rengolin
Reviewed By: sdesmalen
Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73903
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ba1cba2a8c4d..f5c202910859 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1053,12 +1053,31 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;
+ class SVE2_1VectorArg_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMSubdivide2VectorType<0>,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<1>]>;
+
class SVE2_2VectorArg_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>,
LLVMSubdivide2VectorType<0>],
[IntrNoMem]>;
+ class SVE2_2VectorArgIndexed_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMSubdivide2VectorType<0>,
+ LLVMSubdivide2VectorType<0>,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<2>]>;
+
+ class SVE2_2VectorArg_Wide_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMSubdivide2VectorType<0>],
+ [IntrNoMem]>;
+
class SVE2_2VectorArg_Pred_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1766,28 +1785,34 @@ def int_aarch64_sve_usra : AdvSIMD_2VectorArgIndexed_Intrinsic;
// SVE2 - Widening DSP operations
//
-def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_ushllt : SVE2_1VectorArg_Long_Intrinsic;
+def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic;
+def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic;
//
// SVE2 - Non-widening pairwise arithmetic
@@ -1933,10 +1958,16 @@ def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
+def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic;
def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic;
+def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
+def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic;
// SVE2 MLA Unpredicated.
def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic;
@@ -1947,11 +1978,17 @@ def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic;
def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e188fa4e2fce..1fa7b827941a 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1475,14 +1475,14 @@ let Predicates = [HasSVE2] in {
defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;
// SVE2 integer multiply long (indexed)
- defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
- defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
- defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
- defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;
+ defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
+ defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
+ defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
+ defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;
// SVE2 saturating multiply (indexed)
- defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
- defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;
+ defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
+ defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;
// SVE2 integer multiply-add long (indexed)
defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
@@ -1593,14 +1593,14 @@ let Predicates = [HasSVE2] in {
defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;
// SVE2 integer add/subtract wide
- defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
- defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
- defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
- defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
- defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
- defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
- defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
- defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;
+ defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
+ defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
+ defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
+ defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
+ defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
+ defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
+ defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
+ defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
@@ -1693,10 +1693,10 @@ let Predicates = [HasSVE2] in {
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;
// SVE2 bitwise shift left long
- defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
- defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
- defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
- defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;
+ defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
+ defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
+ defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
+ defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;
// SVE2 integer add/subtract interleaved long
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index fe388c48b6ee..c2d4b1b5f533 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2731,9 +2731,10 @@ multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm,
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
+multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
- ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
@@ -2741,13 +2742,16 @@ multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
let Inst{11} = iop{0};
}
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
- ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2894,10 +2898,15 @@ multiclass sve2_wide_int_arith_long<bits<5> opc, string asm,
def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
+multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_pmul_long<bits<1> opc, string asm> {
@@ -2992,7 +3001,8 @@ class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
+multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
@@ -3003,6 +3013,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
+ def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
index 0c98614b7c41..b90c19e02dd4 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
@@ -192,6 +192,69 @@ define <vscale x 2 x i64> @saddlt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SADDWB
+;
+
+define <vscale x 8 x i16> @saddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: saddwb_b:
+; CHECK: saddwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @saddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: saddwb_h:
+; CHECK: saddwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @saddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: saddwb_s:
+; CHECK: saddwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SADDWT
+;
+
+define <vscale x 8 x i16> @saddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: saddwt_b:
+; CHECK: saddwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @saddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: saddwt_h:
+; CHECK: saddwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @saddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: saddwt_s:
+; CHECK: saddwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+
;
; SMULLB (Vectors)
;
@@ -223,6 +286,30 @@ define <vscale x 2 x i64> @smullb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SMULLB (Indexed)
+;
+
+define <vscale x 4 x i32> @smullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: smullb_lane_h:
+; CHECK: smullb z0.s, z0.h, z1.h[4]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 4)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: smullb_lane_s:
+; CHECK: smullb z0.d, z0.s, z1.s[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 3)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SMULLT (Vectors)
;
@@ -254,6 +341,30 @@ define <vscale x 2 x i64> @smullt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SMULLT (Indexed)
+;
+
+define <vscale x 4 x i32> @smullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: smullt_lane_h:
+; CHECK: smullt z0.s, z0.h, z1.h[5]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 5)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @smullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: smullt_lane_s:
+; CHECK: smullt z0.d, z0.s, z1.s[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 2)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SQDMULLB (Vectors)
;
@@ -285,6 +396,30 @@ define <vscale x 2 x i64> @sqdmullb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32>
ret <vscale x 2 x i64> %out
}
+;
+; SQDMULLB (Indexed)
+;
+
+define <vscale x 4 x i32> @sqdmullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqdmullb_lane_h:
+; CHECK: sqdmullb z0.s, z0.h, z1.h[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 2)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sqdmullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqdmullb_lane_s:
+; CHECK: sqdmullb z0.d, z0.s, z1.s[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 1)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SQDMULLT (Vectors)
;
@@ -316,6 +451,30 @@ define <vscale x 2 x i64> @sqdmullt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32>
ret <vscale x 2 x i64> %out
}
+;
+; SQDMULLT (Indexed)
+;
+
+define <vscale x 4 x i32> @sqdmullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqdmullt_lane_h:
+; CHECK: sqdmullt z0.s, z0.h, z1.h[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 3)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sqdmullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqdmullt_lane_s:
+; CHECK: sqdmullt z0.d, z0.s, z1.s[0]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 0)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SSUBLB
;
@@ -347,6 +506,62 @@ define <vscale x 2 x i64> @ssublb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SSHLLB
+;
+
+define <vscale x 8 x i16> @sshllb_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: sshllb_b:
+; CHECK: sshllb z0.h, z0.b, #0
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8> %a, i32 0)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sshllb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sshllb_h:
+; CHECK: sshllb z0.s, z0.h, #1
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16> %a, i32 1)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sshllb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sshllb_s:
+; CHECK: sshllb z0.d, z0.s, #2
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32> %a, i32 2)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SSHLLT
+;
+
+define <vscale x 8 x i16> @sshllt_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: sshllt_b:
+; CHECK: sshllt z0.h, z0.b, #3
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8> %a, i32 3)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sshllt_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sshllt_h:
+; CHECK: sshllt z0.s, z0.h, #4
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16> %a, i32 4)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sshllt_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sshllt_s:
+; CHECK: sshllt z0.d, z0.s, #5
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32> %a, i32 5)
+ ret <vscale x 2 x i64> %out
+}
+
;
; SSUBLT
;
@@ -378,6 +593,68 @@ define <vscale x 2 x i64> @ssublt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; SSUBWB
+;
+
+define <vscale x 8 x i16> @ssubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: ssubwb_b:
+; CHECK: ssubwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ssubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: ssubwb_h:
+; CHECK: ssubwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ssubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: ssubwb_s:
+; CHECK: ssubwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; SSUBWT
+;
+
+define <vscale x 8 x i16> @ssubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: ssubwt_b:
+; CHECK: ssubwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ssubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: ssubwt_h:
+; CHECK: ssubwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ssubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: ssubwt_s:
+; CHECK: ssubwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
;
; UABALB
;
@@ -570,6 +847,68 @@ define <vscale x 2 x i64> @uaddlt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; UADDWB
+;
+
+define <vscale x 8 x i16> @uaddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uaddwb_b:
+; CHECK: uaddwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uaddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uaddwb_h:
+; CHECK: uaddwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uaddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uaddwb_s:
+; CHECK: uaddwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; UADDWT
+;
+
+define <vscale x 8 x i16> @uaddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uaddwt_b:
+; CHECK: uaddwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uaddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uaddwt_h:
+; CHECK: uaddwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uaddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uaddwt_s:
+; CHECK: uaddwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
;
; UMULLB (Vectors)
;
@@ -601,6 +940,31 @@ define <vscale x 2 x i64> @umullb_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; UMULLB (Indexed)
+;
+
+define <vscale x 4 x i32> @umullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: umullb_lane_h:
+; CHECK: umullb z0.s, z0.h, z1.h[0]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 0)
+ ret <vscale x 4 x i32> %out
+}
+
+
+define <vscale x 2 x i64> @umullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: umullb_lane_s:
+; CHECK: umullb z0.d, z0.s, z1.s[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 3)
+ ret <vscale x 2 x i64> %out
+}
+
;
; UMULLT (Vectors)
;
@@ -632,6 +996,86 @@ define <vscale x 2 x i64> @umullt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; UMULLT (Indexed)
+;
+
+define <vscale x 4 x i32> @umullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: umullt_lane_h:
+; CHECK: umullt z0.s, z0.h, z1.h[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ i32 1)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @umullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: umullt_lane_s:
+; CHECK: umullt z0.d, z0.s, z1.s[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32> %a,
+ <vscale x 4 x i32> %b,
+ i32 2)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; USHLLB
+;
+
+define <vscale x 8 x i16> @ushllb_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: ushllb_b:
+; CHECK: ushllb z0.h, z0.b, #6
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8> %a, i32 6)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ushllb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: ushllb_h:
+; CHECK: ushllb z0.s, z0.h, #7
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16> %a, i32 7)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ushllb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: ushllb_s:
+; CHECK: ushllb z0.d, z0.s, #8
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32> %a, i32 8)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; USHLLT
+;
+
+define <vscale x 8 x i16> @ushllt_b(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: ushllt_b:
+; CHECK: ushllt z0.h, z0.b, #7
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8> %a, i32 7)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @ushllt_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: ushllt_h:
+; CHECK: ushllt z0.s, z0.h, #15
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16> %a, i32 15)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @ushllt_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: ushllt_s:
+; CHECK: ushllt z0.d, z0.s, #31
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32> %a, i32 31)
+ ret <vscale x 2 x i64> %out
+}
+
;
; USUBLB
;
@@ -694,6 +1138,68 @@ define <vscale x 2 x i64> @usublt_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b
ret <vscale x 2 x i64> %out
}
+;
+; USUBWB
+;
+
+define <vscale x 8 x i16> @usubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: usubwb_b:
+; CHECK: usubwb z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @usubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: usubwb_h:
+; CHECK: usubwb z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @usubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: usubwb_s:
+; CHECK: usubwb z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
+;
+; USUBWT
+;
+
+define <vscale x 8 x i16> @usubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: usubwt_b:
+; CHECK: usubwt z0.h, z0.h, z1.b
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16> %a,
+ <vscale x 16 x i8> %b)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @usubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: usubwt_h:
+; CHECK: usubwt z0.s, z0.s, z1.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @usubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: usubwt_s:
+; CHECK: usubwt z0.d, z0.d, z1.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64> %a,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 2 x i64> %out
+}
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.sabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -718,22 +1224,50 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.saddlt.nxv8i16(<vscale x 16 x i8>,
declare <vscale x 4 x i32> @llvm.aarch64.sve.saddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.saddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.smullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.smullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.ssublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -742,6 +1276,14 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.ssublt.nxv8i16(<vscale x 16 x i8>,
declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.uabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.uabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -766,14 +1308,36 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddlt.nxv8i16(<vscale x 16 x i8>,
declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.umullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.umullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32>, i32)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16>, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32>, i32)
+
declare <vscale x 8 x i16> @llvm.aarch64.sve.usublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.usublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.usublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -781,3 +1345,11 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.usublb.nxv2i64(<vscale x 4 x i32>,
declare <vscale x 8 x i16> @llvm.aarch64.sve.usublt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.usublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.usublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>)