[llvm] [AArch64][SVE2] Generate signed/unsigned rounding shift rights (PR #78374)
Usman Nadeem via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 16 17:04:09 PST 2024
https://github.com/UsmanNadeem created https://github.com/llvm/llvm-project/pull/78374
The matching code is similar to that for rshrnb, except that the immediate
shift value has a larger range and signed shifts are supported. rshrnb
now uses the new AArch64ISD node for uniform rounding.
Change-Id: Idbb811f318d33c7637371cf7bb00285d20e1771d
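For readers skimming the patch, here is a minimal standalone IR sketch of the
pattern the new combine targets (the function name is illustrative; the
constants and splat syntax mirror the urshr_i64 test added in sve2-rsh.ll).
With -mattr=+sve2 the add/lshr pair is expected to lower to a predicated
urshr; with plain +sve the add and lsr are kept, as the SVE check lines in the
new test show.

; Illustrative only -- adding 1 << (shift - 1) before a right shift by
; 'shift' is a rounding shift, here with shift == 6 so the bias is 32.
; Expected SVE2 codegen:
;   ptrue p0.d
;   urshr z0.d, p0/m, z0.d, #6
define <vscale x 2 x i64> @round_shift_example(<vscale x 2 x i64> %x) {
  %add = add <vscale x 2 x i64> %x, splat (i64 32)
  %sh = lshr <vscale x 2 x i64> %add, splat (i64 6)
  ret <vscale x 2 x i64> %sh
}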
From b33d0002a29f095e5dc9ef352359e01184b61795 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Tue, 16 Jan 2024 17:02:01 -0800
Subject: [PATCH] [AArch64][SVE2] Generate signed/unsigned rounding shift
rights
The matching code is similar to that for rshrnb, except that the immediate
shift value has a larger range and signed shifts are supported. rshrnb
now uses the new AArch64ISD node for uniform rounding.
Change-Id: Idbb811f318d33c7637371cf7bb00285d20e1771d
---
.../Target/AArch64/AArch64ISelLowering.cpp | 81 +++++--
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 10 +-
.../AArch64/sve2-intrinsics-combine-rshrnb.ll | 17 +-
llvm/test/CodeGen/AArch64/sve2-rsh.ll | 203 ++++++++++++++++++
5 files changed, 276 insertions(+), 37 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2-rsh.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 91b36161ab46e89..d1731fcaabf8664 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2649,6 +2649,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::MSRR)
MAKE_CASE(AArch64ISD::RSHRNB_I)
MAKE_CASE(AArch64ISD::CTTZ_ELTS)
+ MAKE_CASE(AArch64ISD::SRSHR_I_PRED)
+ MAKE_CASE(AArch64ISD::URSHR_I_PRED)
}
#undef MAKE_CASE
return nullptr;
@@ -2933,6 +2935,7 @@ static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFixedMaskToScalableVector(SDValue Mask,
SelectionDAG &DAG);
+static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
EVT VT);
@@ -13713,6 +13716,42 @@ SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
return SDValue();
}
+static SDValue tryLowerToRoundingShiftRightByImm(SDValue Shift,
+ SelectionDAG &DAG) {
+ if (Shift->getOpcode() != ISD::SRL && Shift->getOpcode() != ISD::SRA)
+ return SDValue();
+
+ EVT ResVT = Shift.getValueType();
+ assert(ResVT.isScalableVT());
+
+ auto ShiftOp1 =
+ dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
+ if (!ShiftOp1)
+ return SDValue();
+ unsigned ShiftValue = ShiftOp1->getZExtValue();
+
+ if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
+ return SDValue();
+
+ SDValue Add = Shift->getOperand(0);
+ if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
+ return SDValue();
+ auto AddOp1 =
+ dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
+ if (!AddOp1)
+ return SDValue();
+ uint64_t AddValue = AddOp1->getZExtValue();
+ if (AddValue != 1ULL << (ShiftValue - 1))
+ return SDValue();
+
+ SDLoc DL(Shift);
+ unsigned Opc = Shift->getOpcode() == ISD::SRA ? AArch64ISD::SRSHR_I_PRED
+ : AArch64ISD::URSHR_I_PRED;
+ return DAG.getNode(Opc, DL, ResVT, getPredicateForVector(DAG, DL, ResVT),
+ Add->getOperand(0),
+ DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
+}
+
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -13738,6 +13777,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
+ if (VT.isScalableVector() && Subtarget->hasSVE2orSME())
+ if (SDValue RSH = tryLowerToRoundingShiftRightByImm(Op, DAG))
+ return RSH;
+
if (VT.isScalableVector() ||
useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
@@ -20025,6 +20068,12 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_sve_uqsub_x:
return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_sve_srshr:
+ return DAG.getNode(AArch64ISD::SRSHR_I_PRED, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_urshr:
+ return DAG.getNode(AArch64ISD::URSHR_I_PRED, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_asrd:
return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
@@ -20652,12 +20701,13 @@ static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
// a uzp1 or a truncating store.
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
- EVT VT = Srl->getValueType(0);
+ if (Srl->getOpcode() != AArch64ISD::URSHR_I_PRED)
+ return SDValue();
- if (!VT.isScalableVector() || !Subtarget->hasSVE2() ||
- Srl->getOpcode() != ISD::SRL)
+ if (!isAllActivePredicate(DAG, Srl.getOperand(0)))
return SDValue();
+ EVT VT = Srl->getValueType(0);
EVT ResVT;
if (VT == MVT::nxv8i16)
ResVT = MVT::nxv16i8;
@@ -20668,29 +20718,14 @@ static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
else
return SDValue();
- auto SrlOp1 =
- dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Srl->getOperand(1)));
- if (!SrlOp1)
- return SDValue();
- unsigned ShiftValue = SrlOp1->getZExtValue();
- if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
- return SDValue();
-
- SDValue Add = Srl->getOperand(0);
- if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
- return SDValue();
- auto AddOp1 =
- dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
- if (!AddOp1)
- return SDValue();
- uint64_t AddValue = AddOp1->getZExtValue();
- if (AddValue != 1ULL << (ShiftValue - 1))
+ unsigned ShiftValue =
+ cast<ConstantSDNode>(Srl->getOperand(2))->getZExtValue();
+ if (ShiftValue > ResVT.getScalarSizeInBits())
return SDValue();
SDLoc DL(Srl);
- SDValue Rshrnb = DAG.getNode(
- AArch64ISD::RSHRNB_I, DL, ResVT,
- {Add->getOperand(0), DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
+ SDValue Rshrnb = DAG.getNode(AArch64ISD::RSHRNB_I, DL, ResVT,
+ {Srl->getOperand(1), Srl->getOperand(2)});
return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6ddbcd41dcb7696..e1ecd3f4e36be03 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -210,7 +210,9 @@ enum NodeType : unsigned {
UQSHL_I,
SQSHLU_I,
SRSHR_I,
+ SRSHR_I_PRED,
URSHR_I,
+ URSHR_I_PRED,
// Vector narrowing shift by immediate (bottom)
RSHRNB_I,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c4d69232c9e30ea..516ab36464379dd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -232,6 +232,8 @@ def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [
]>;
def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>;
+def AArch64urshri_p : SDNode<"AArch64ISD::URSHR_I_PRED", SDT_AArch64Arith_Imm>;
+def AArch64srshri_p : SDNode<"AArch64ISD::SRSHR_I_PRED", SDT_AArch64Arith_Imm>;
def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>,
@@ -3538,8 +3540,8 @@ let Predicates = [HasSVE2orSME] in {
// SVE2 predicated shifts
defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>;
defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1100, "srshr", "SRSHR_ZPZI", AArch64srshri_p>;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1101, "urshr", "URSHR_ZPZI", AArch64urshri_p>;
defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// SVE2 integer add/subtract long
@@ -3583,8 +3585,8 @@ let Predicates = [HasSVE2orSME] in {
// SVE2 bitwise shift right and accumulate
defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", AArch64ssra>;
defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", AArch64usra>;
- defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra, int_aarch64_sve_srshr>;
- defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra, int_aarch64_sve_urshr>;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra, AArch64srshri_p>;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra, AArch64urshri_p>;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
index 0afd11d098a0009..58ef846a3172381 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
@@ -184,16 +184,14 @@ define void @wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, <vscale x 4 x i6
define void @neg_wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, <vscale x 4 x i64> %arg1){
; CHECK-LABEL: neg_wide_add_shift_add_rshrnb_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.d, #0x800000000000
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: lsr z1.d, z1.d, #48
-; CHECK-NEXT: lsr z0.d, z0.d, #48
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: urshr z1.d, p0/m, z1.d, #48
+; CHECK-NEXT: urshr z0.d, p0/m, z0.d, #48
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x0, x1, lsl #2]
; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: st1w { z0.s }, p1, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%1 = add <vscale x 4 x i64> %arg1, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 140737488355328, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
%2 = lshr <vscale x 4 x i64> %1, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 48, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -286,8 +284,7 @@ define void @neg_add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: add z0.d, z0.d, #32 // =0x20
-; CHECK-NEXT: lsr z0.d, z0.d, #6
+; CHECK-NEXT: urshr z0.d, p0/m, z0.d, #6
; CHECK-NEXT: st1h { z0.d }, p0, [x1, x2, lsl #1]
; CHECK-NEXT: ret
%load = load <vscale x 2 x i64>, ptr %ptr, align 2
diff --git a/llvm/test/CodeGen/AArch64/sve2-rsh.ll b/llvm/test/CodeGen/AArch64/sve2-rsh.ll
new file mode 100644
index 000000000000000..2bdfc1931cdc2f3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-rsh.ll
@@ -0,0 +1,203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=CHECK,SVE %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefixes=CHECK,SVE2 %s
+
+; Wrong add/shift amount. Should be 32 for shift of 6.
+define <vscale x 2 x i64> @neg_urshr_1(<vscale x 2 x i64> %x) {
+; CHECK-LABEL: neg_urshr_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add z0.d, z0.d, #16 // =0x10
+; CHECK-NEXT: lsr z0.d, z0.d, #6
+; CHECK-NEXT: ret
+ %add = add <vscale x 2 x i64> %x, splat (i64 16)
+ %sh = lshr <vscale x 2 x i64> %add, splat (i64 6)
+ ret <vscale x 2 x i64> %sh
+}
+
+; Vector Shift.
+define <vscale x 2 x i64> @neg_urshr_2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: neg_urshr_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add z0.d, z0.d, #32 // =0x20
+; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+ %add = add <vscale x 2 x i64> %x, splat (i64 32)
+ %sh = lshr <vscale x 2 x i64> %add, %y
+ ret <vscale x 2 x i64> %sh
+}
+
+; Vector Add.
+define <vscale x 2 x i64> @neg_urshr_3(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: neg_urshr_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: lsr z0.d, z0.d, #6
+; CHECK-NEXT: ret
+ %add = add <vscale x 2 x i64> %x, %y
+ %sh = lshr <vscale x 2 x i64> %add, splat (i64 6)
+ ret <vscale x 2 x i64> %sh
+}
+
+; Add has two uses.
+define <vscale x 2 x i64> @neg_urshr_4(<vscale x 2 x i64> %x) {
+; CHECK-LABEL: neg_urshr_4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: add z0.d, z0.d, #32 // =0x20
+; CHECK-NEXT: lsr z8.d, z0.d, #6
+; CHECK-NEXT: bl use
+; CHECK-NEXT: mov z0.d, z8.d
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %add = add <vscale x 2 x i64> %x, splat (i64 32)
+ %sh = lshr <vscale x 2 x i64> %add, splat (i64 6)
+ call void @use(<vscale x 2 x i64> %add)
+ ret <vscale x 2 x i64> %sh
+}
+
+define <vscale x 16 x i8> @urshr_i8(<vscale x 16 x i8> %x) {
+; SVE-LABEL: urshr_i8:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.b, z0.b, #32 // =0x20
+; SVE-NEXT: lsr z0.b, z0.b, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: urshr_i8:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.b
+; SVE2-NEXT: urshr z0.b, p0/m, z0.b, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 16 x i8> %x, splat (i8 32)
+ %sh = lshr <vscale x 16 x i8> %add, splat (i8 6)
+ ret <vscale x 16 x i8> %sh
+}
+
+define <vscale x 8 x i16> @urshr_i16(<vscale x 8 x i16> %x) {
+; SVE-LABEL: urshr_i16:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.h, z0.h, #32 // =0x20
+; SVE-NEXT: lsr z0.h, z0.h, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: urshr_i16:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.h
+; SVE2-NEXT: urshr z0.h, p0/m, z0.h, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 8 x i16> %x, splat (i16 32)
+ %sh = lshr <vscale x 8 x i16> %add, splat (i16 6)
+ ret <vscale x 8 x i16> %sh
+}
+
+define <vscale x 4 x i32> @urshr_i32(<vscale x 4 x i32> %x) {
+; SVE-LABEL: urshr_i32:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.s, z0.s, #32 // =0x20
+; SVE-NEXT: lsr z0.s, z0.s, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: urshr_i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: urshr z0.s, p0/m, z0.s, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 4 x i32> %x, splat (i32 32)
+ %sh = lshr <vscale x 4 x i32> %add, splat (i32 6)
+ ret <vscale x 4 x i32> %sh
+}
+
+define <vscale x 2 x i64> @urshr_i64(<vscale x 2 x i64> %x) {
+; SVE-LABEL: urshr_i64:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.d, z0.d, #32 // =0x20
+; SVE-NEXT: lsr z0.d, z0.d, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: urshr_i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.d
+; SVE2-NEXT: urshr z0.d, p0/m, z0.d, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 2 x i64> %x, splat (i64 32)
+ %sh = lshr <vscale x 2 x i64> %add, splat (i64 6)
+ ret <vscale x 2 x i64> %sh
+}
+
+define <vscale x 16 x i8> @srshr_i8(<vscale x 16 x i8> %x) {
+; SVE-LABEL: srshr_i8:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.b, z0.b, #32 // =0x20
+; SVE-NEXT: asr z0.b, z0.b, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: srshr_i8:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.b
+; SVE2-NEXT: srshr z0.b, p0/m, z0.b, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 16 x i8> %x, splat (i8 32)
+ %sh = ashr <vscale x 16 x i8> %add, splat (i8 6)
+ ret <vscale x 16 x i8> %sh
+}
+
+define <vscale x 8 x i16> @srshr_i16(<vscale x 8 x i16> %x) {
+; SVE-LABEL: srshr_i16:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.h, z0.h, #32 // =0x20
+; SVE-NEXT: asr z0.h, z0.h, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: srshr_i16:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.h
+; SVE2-NEXT: srshr z0.h, p0/m, z0.h, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 8 x i16> %x, splat (i16 32)
+ %sh = ashr <vscale x 8 x i16> %add, splat (i16 6)
+ ret <vscale x 8 x i16> %sh
+}
+
+define <vscale x 4 x i32> @srshr_i32(<vscale x 4 x i32> %x) {
+; SVE-LABEL: srshr_i32:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.s, z0.s, #32 // =0x20
+; SVE-NEXT: asr z0.s, z0.s, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: srshr_i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: srshr z0.s, p0/m, z0.s, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 4 x i32> %x, splat (i32 32)
+ %sh = ashr <vscale x 4 x i32> %add, splat (i32 6)
+ ret <vscale x 4 x i32> %sh
+}
+
+define <vscale x 2 x i64> @srshr_i64(<vscale x 2 x i64> %x) {
+; SVE-LABEL: srshr_i64:
+; SVE: // %bb.0:
+; SVE-NEXT: add z0.d, z0.d, #32 // =0x20
+; SVE-NEXT: asr z0.d, z0.d, #6
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: srshr_i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: ptrue p0.d
+; SVE2-NEXT: srshr z0.d, p0/m, z0.d, #6
+; SVE2-NEXT: ret
+ %add = add <vscale x 2 x i64> %x, splat (i64 32)
+ %sh = ashr <vscale x 2 x i64> %add, splat (i64 6)
+ ret <vscale x 2 x i64> %sh
+}
+
+declare void @use(<vscale x 2 x i64>)