[llvm] e42ee2d - [AArch64][SVE] Add support for using reverse forms of SVE2 shifts

Bradley Smith via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 4 04:58:35 PDT 2021


Author: Bradley Smith
Date: 2021-06-04T12:56:53+01:00
New Revision: e42ee2d50963eb12e4d2dc0a20f36b1cb2af8543

URL: https://github.com/llvm/llvm-project/commit/e42ee2d50963eb12e4d2dc0a20f36b1cb2af8543
DIFF: https://github.com/llvm/llvm-project/commit/e42ee2d50963eb12e4d2dc0a20f36b1cb2af8543.diff

LOG: [AArch64][SVE] Add support for using reverse forms of SVE2 shifts

When using an ACLE intrinsic for an SVE2 shift, if the predicate passed
has all relevant lanes active, then use the reversed form of the
instruction when doing so is beneficial (for example, to avoid an extra
register move).
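
As an illustrative sketch (mirroring the new tests below, and assuming %a
arrives in z0 and %b in z1), IR whose intrinsic operands arrive in the
order that would otherwise force a move into the destination register can
now select the reverse instruction directly when the predicate is an
all-active ptrue:

  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshl.nxv16i8(<vscale x 16 x i1> %pg,
                                                                  <vscale x 16 x i8> %b,
                                                                  <vscale x 16 x i8> %a)
  ; With the all-active predicate this can be selected as:
  ;   ptrue   p0.b
  ;   sqrshlr z0.b, p0/m, z0.b, z1.b
  ; instead of "sqrshl z1.b, p0/m, z1.b, z0.b" followed by a mov of z1
  ; into z0 (see the *_noptrue tests for the non-reversed sequence).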

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td
    llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 585fa50433fa..198260d7c472 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2518,18 +2518,25 @@ let Predicates = [HasSVE2] in {
   defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;
 
   // SVE2 saturating/rounding bitwise shift left (predicated)
-  defm SRSHL_ZPmZ   : sve2_int_arith_pred<0b000100, "srshl",   int_aarch64_sve_srshl>;
-  defm URSHL_ZPmZ   : sve2_int_arith_pred<0b000110, "urshl",   int_aarch64_sve_urshl>;
-  defm SRSHLR_ZPmZ  : sve2_int_arith_pred<0b001100, "srshlr",  null_frag>;
-  defm URSHLR_ZPmZ  : sve2_int_arith_pred<0b001110, "urshlr",  null_frag>;
-  defm SQSHL_ZPmZ   : sve2_int_arith_pred<0b010000, "sqshl",   int_aarch64_sve_sqshl>;
-  defm UQSHL_ZPmZ   : sve2_int_arith_pred<0b010010, "uqshl",   int_aarch64_sve_uqshl>;
-  defm SQRSHL_ZPmZ  : sve2_int_arith_pred<0b010100, "sqrshl",  int_aarch64_sve_sqrshl>;
-  defm UQRSHL_ZPmZ  : sve2_int_arith_pred<0b010110, "uqrshl",  int_aarch64_sve_uqrshl>;
-  defm SQSHLR_ZPmZ  : sve2_int_arith_pred<0b011000, "sqshlr",  null_frag>;
-  defm UQSHLR_ZPmZ  : sve2_int_arith_pred<0b011010, "uqshlr",  null_frag>;
-  defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
-  defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
+  defm SRSHL_ZPmZ   : sve2_int_arith_pred<0b000100, "srshl",   int_aarch64_sve_srshl,  "SRSHL_ZPZZ",   DestructiveBinaryCommWithRev, "SRSHLR_ZPmZ">;
+  defm URSHL_ZPmZ   : sve2_int_arith_pred<0b000110, "urshl",   int_aarch64_sve_urshl,  "URSHL_ZPZZ",   DestructiveBinaryCommWithRev, "URSHLR_ZPmZ">;
+  defm SRSHLR_ZPmZ  : sve2_int_arith_pred<0b001100, "srshlr",  null_frag,              "SRSHLR_ZPZZ",  DestructiveBinaryCommWithRev, "SRSHL_ZPmZ", /*isReverseInstr*/ 1>;
+  defm URSHLR_ZPmZ  : sve2_int_arith_pred<0b001110, "urshlr",  null_frag,              "URSHLR_ZPZZ",  DestructiveBinaryCommWithRev, "URSHL_ZPmZ", /*isReverseInstr*/ 1>;
+  defm SQSHL_ZPmZ   : sve2_int_arith_pred<0b010000, "sqshl",   int_aarch64_sve_sqshl,  "SQSHL_ZPZZ",   DestructiveBinaryCommWithRev, "SQSHLR_ZPmZ">;
+  defm UQSHL_ZPmZ   : sve2_int_arith_pred<0b010010, "uqshl",   int_aarch64_sve_uqshl,  "UQSHL_ZPZZ",   DestructiveBinaryCommWithRev, "UQSHLR_ZPmZ">;
+  defm SQRSHL_ZPmZ  : sve2_int_arith_pred<0b010100, "sqrshl",  int_aarch64_sve_sqrshl, "SQRSHL_ZPZZ",  DestructiveBinaryCommWithRev, "SQRSHLR_ZPmZ">;
+  defm UQRSHL_ZPmZ  : sve2_int_arith_pred<0b010110, "uqrshl",  int_aarch64_sve_uqrshl, "UQRSHL_ZPZZ",  DestructiveBinaryCommWithRev, "UQRSHLR_ZPmZ">;
+  defm SQSHLR_ZPmZ  : sve2_int_arith_pred<0b011000, "sqshlr",  null_frag,              "SQSHLR_ZPZZ",  DestructiveBinaryCommWithRev, "SQSHL_ZPmZ", /*isReverseInstr*/ 1>;
+  defm UQSHLR_ZPmZ  : sve2_int_arith_pred<0b011010, "uqshlr",  null_frag,              "UQSHLR_ZPZZ",  DestructiveBinaryCommWithRev, "UQSHL_ZPmZ", /*isReverseInstr*/ 1>;
+  defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag,              "SQRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SQRSHL_ZPmZ", /*isReverseInstr*/ 1>;
+  defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag,              "UQRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "UQRSHL_ZPmZ", /*isReverseInstr*/ 1>;
+
+  defm SRSHL_ZPZZ   : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_srshl>;
+  defm URSHL_ZPZZ   : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_urshl>;
+  defm SQSHL_ZPZZ   : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqshl>;
+  defm UQSHL_ZPZZ   : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqshl>;
+  defm SQRSHL_ZPZZ  : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqrshl>;
+  defm UQRSHL_ZPZZ  : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqrshl>;
 
   let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
     defm SQSHL_ZPZI  : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index d0523c44cc49..02ac788b16e5 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -372,6 +372,12 @@ class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
 : Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
       (inst $Op1, $Op2)>;
 
+class SVE_2_Op_Pred_All_Active_Pt<ValueType vtd, SDPatternOperator op,
+                                  ValueType pt, ValueType vt1, ValueType vt2,
+                                  Instruction inst>
+: Pat<(vtd (op (pt (SVEAllActive:$Op1)), vt1:$Op2, vt2:$Op3)),
+      (inst $Op1, $Op2, $Op3)>;
+
 class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                    ValueType vt2, ValueType vt3, Instruction inst>
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
@@ -3167,11 +3173,20 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
   let ElementSize = zprty.ElementSize;
 }
 
-multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
-  def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
-  def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
-  def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
-  def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
+multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op,
+                               string Ps = "",
+                               DestructiveInstTypeEnum flags=DestructiveOther,
+                               string revname="", bit isReverseInstr=0> {
+  let DestructiveInstType = flags in {
+  def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>,
+             SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+  def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>,
+             SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+  def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>,
+             SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+  def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>,
+             SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+  }
 
   def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
   def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1,  nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -8139,3 +8154,15 @@ multiclass sve_int_shift_pred_bhsd<SDPatternOperator op,
   def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1,  i32, imm_s, !cast<Instruction>(NAME # _UNDEF_S)>;
   def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1,  i64, imm_d, !cast<Instruction>(NAME # _UNDEF_D)>;
 }
+
+multiclass sve_int_bin_pred_all_active_bhsd<SDPatternOperator op> {
+  def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+  def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+  def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv8i16, op, nxv8i1,  nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv4i32, op, nxv4i1,  nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv2i64, op, nxv2i1,  nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}

diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp.ll
index 30e521cc75f6..9afdb48c053e 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp.ll
@@ -706,6 +706,69 @@ define <vscale x 2 x i64> @sqrshl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64>
   ret <vscale x 2 x i64> %out
 }
 
+;
+; SQRSHLR
+;
+
+define <vscale x 16 x i8> @sqrshlr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sqrshlr_i8:
+; CHECK: ptrue p0.b
+; CHECK-NEXT: sqrshlr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshl.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %b,
+                                                                 <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqrshlr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqrshlr_i16:
+; CHECK: ptrue p0.h
+; CHECK-NEXT: sqrshlr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshl.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqrshlr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqrshlr_i32:
+; CHECK: ptrue p0.s
+; CHECK-NEXT: sqrshlr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqrshl.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sqrshlr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqrshlr_i64:
+; CHECK: ptrue p0.d
+; CHECK-NEXT: sqrshlr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqrshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @sqrshlr_i64_noptrue(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqrshlr_i64_noptrue:
+; CHECK: sqrshl z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqrshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; SQSHL (Vectors)
 ;
@@ -750,6 +813,69 @@ define <vscale x 2 x i64> @sqshl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %
   ret <vscale x 2 x i64> %out
 }
 
+;
+; SQSHLR
+;
+
+define <vscale x 16 x i8> @sqshlr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sqshlr_i8:
+; CHECK: ptrue p0.b
+; CHECK-NEXT: sqshlr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqshl.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %b,
+                                                                 <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqshlr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqshlr_i16:
+; CHECK: ptrue p0.h
+; CHECK-NEXT: sqshlr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqshl.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqshlr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqshlr_i32:
+; CHECK: ptrue p0.s
+; CHECK-NEXT: sqshlr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqshl.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @sqshlr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqshlr_i64:
+; CHECK: ptrue p0.d
+; CHECK-NEXT: sqshlr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @sqshlr_i64_noptrue(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqshlr_i64_noptrue:
+; CHECK: sqshl z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; SQSHL (Scalar)
 ;
@@ -1110,6 +1236,69 @@ define <vscale x 2 x i64> @srshl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %
   ret <vscale x 2 x i64> %out
 }
 
+;
+; SRSHLR
+;
+
+define <vscale x 16 x i8> @srshlr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: srshlr_i8:
+; CHECK: ptrue p0.b
+; CHECK-NEXT: srshlr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.srshl.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %b,
+                                                                 <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @srshlr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: srshlr_i16:
+; CHECK: ptrue p0.h
+; CHECK-NEXT: srshlr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.srshl.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @srshlr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: srshlr_i32:
+; CHECK: ptrue p0.s
+; CHECK-NEXT: srshlr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.srshl.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @srshlr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: srshlr_i64:
+; CHECK: ptrue p0.d
+; CHECK-NEXT: srshlr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.srshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @srshlr_i64_noptrue(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: srshlr_i64_noptrue:
+; CHECK: srshl z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.srshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; SRSHR
 ;
@@ -1550,6 +1739,69 @@ define <vscale x 2 x i64> @uqrshl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64>
   ret <vscale x 2 x i64> %out
 }
 
+;
+; UQRSHLR
+;
+
+define <vscale x 16 x i8> @uqrshlr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uqrshlr_i8:
+; CHECK: ptrue p0.b
+; CHECK-NEXT: uqrshlr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uqrshl.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %b,
+                                                                 <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uqrshlr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uqrshlr_i16:
+; CHECK: ptrue p0.h
+; CHECK-NEXT: uqrshlr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uqrshl.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uqrshlr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uqrshlr_i32:
+; CHECK: ptrue p0.s
+; CHECK-NEXT: uqrshlr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqrshl.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uqrshlr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uqrshlr_i64:
+; CHECK: ptrue p0.d
+; CHECK-NEXT: uqrshlr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqrshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @uqrshlr_i64_noptrue(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uqrshlr_i64_noptrue:
+; CHECK: uqrshl z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqrshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; UQSHL (Vectors)
 ;
@@ -1594,6 +1846,69 @@ define <vscale x 2 x i64> @uqshl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %
   ret <vscale x 2 x i64> %out
 }
 
+;
+; UQSHLR
+;
+
+define <vscale x 16 x i8> @uqshlr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uqshlr_i8:
+; CHECK: ptrue p0.b
+; CHECK-NEXT: uqshlr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uqshl.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %b,
+                                                                 <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uqshlr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uqshlr_i16:
+; CHECK: ptrue p0.h
+; CHECK-NEXT: uqshlr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uqshl.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uqshlr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uqshlr_i32:
+; CHECK: ptrue p0.s
+; CHECK-NEXT: uqshlr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqshl.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uqshlr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uqshlr_i64:
+; CHECK: ptrue p0.d
+; CHECK-NEXT: uqshlr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @uqshlr_i64_noptrue(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uqshlr_i64_noptrue:
+; CHECK: uqshl z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uqshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; UQSHL (Scalar)
 ;
@@ -1880,6 +2195,69 @@ define <vscale x 2 x i64> @urshl_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %
   ret <vscale x 2 x i64> %out
 }
 
+;
+; URSHLR
+;
+
+define <vscale x 16 x i8> @urshlr_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: urshlr_i8:
+; CHECK: ptrue p0.b
+; CHECK-NEXT: urshlr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT: ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.urshl.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                 <vscale x 16 x i8> %b,
+                                                                 <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @urshlr_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: urshlr_i16:
+; CHECK: ptrue p0.h
+; CHECK-NEXT: urshlr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.urshl.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @urshlr_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: urshlr_i32:
+; CHECK: ptrue p0.s
+; CHECK-NEXT: urshlr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.urshl.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @urshlr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: urshlr_i64:
+; CHECK: ptrue p0.d
+; CHECK-NEXT: urshlr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.urshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @urshlr_i64_noptrue(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: urshlr_i64_noptrue:
+; CHECK: urshl z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.urshl.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; URSHR
 ;
@@ -2289,3 +2667,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.usra.nxv16i8(<vscale x 16 x i8>, <v
 declare <vscale x 8 x i16> @llvm.aarch64.sve.usra.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.usra.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.usra.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
