[llvm] f87f23c - [AArch64][SVE] Add the SVE dupq_lane intrinsic
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 06:08:34 PST 2020
Author: Kerry McLaughlin
Date: 2020-02-24T13:59:47Z
New Revision: f87f23c81caeb0b0b7b8e795023b7273a13115d2
URL: https://github.com/llvm/llvm-project/commit/f87f23c81caeb0b0b7b8e795023b7273a13115d2
DIFF: https://github.com/llvm/llvm-project/commit/f87f23c81caeb0b0b7b8e795023b7273a13115d2.diff
LOG: [AArch64][SVE] Add the SVE dupq_lane intrinsic
Summary:
Implements the @llvm.aarch64.sve.dupq.lane intrinsic.
As specified in the ACLE, the behaviour of:
  svdupq_lane_u64(data, index)
...is identical to:
  svtbl(data, svadd_x(svptrue_b64(),
                      svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
                      index * 2))
If the index is in the range [0,3], the operation is equivalent
to a single DUP (.q) instruction.
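For reference, a minimal IR sketch of the new intrinsic (the function name here is illustrative; the call mirrors the dupq_i64 test added below). With a constant index in the range [0,3] this lowers to a single DUP (.q); otherwise it is expanded into the index/and/add/tbl sequence described above:
  ; Expected codegen for the constant-index case: mov z0.q, z0.q[3]
  define <vscale x 2 x i64> @dupq_example(<vscale x 2 x i64> %a) {
    %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 3)
    ret <vscale x 2 x i64> %out
  }
  declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)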
Reviewers: sdesmalen, c-rhodes, cameron.mcinally, efriedma, dancgr, rengolin
Reviewed By: sdesmalen
Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cfe-commits, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74734
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 72eb8ef6bc04..99a8926472c9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -961,6 +961,12 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMVectorElementType<0>],
[IntrNoMem]>;
+ class AdvSIMD_SVE_DUPQ_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_i64_ty],
+ [IntrNoMem]>;
+
class AdvSIMD_SVE_EXPA_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMVectorOfBitcastsToInt<0>],
@@ -1474,6 +1480,7 @@ def int_aarch64_sve_clasta_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic;
def int_aarch64_sve_clastb : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_clastb_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic;
def int_aarch64_sve_compact : AdvSIMD_Pred1VectorArg_Intrinsic;
+def int_aarch64_sve_dupq_lane : AdvSIMD_SVE_DUPQ_Intrinsic;
def int_aarch64_sve_ext : AdvSIMD_2VectorArgIndexed_Intrinsic;
def int_aarch64_sve_lasta : AdvSIMD_SVE_Reduce_Intrinsic;
def int_aarch64_sve_lastb : AdvSIMD_SVE_Reduce_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6d040a4057d6..32d594450ca6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3036,6 +3036,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_dupq_lane:
+ return LowerDUPQLane(Op, DAG);
case Intrinsic::aarch64_sve_insr: {
SDValue Scalar = Op.getOperand(2);
@@ -7512,6 +7514,54 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
}
+SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ EVT VT = Op.getValueType();
+ if (!isTypeLegal(VT) || !VT.isScalableVector())
+ return SDValue();
+
+ // Current lowering only supports the SVE-ACLE types.
+ if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // The DUPQ operation is independent of element type, so normalise to i64s.
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+ SDValue Idx128 = Op.getOperand(2);
+
+ // DUPQ can be used when idx is in range.
+ auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
+ if (CIdx && (CIdx->getZExtValue() <= 3)) {
+ SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
+ SDNode *DUPQ =
+ DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
+ return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+ }
+
+ // The ACLE says this must produce the same result as:
+ // svtbl(data, svadd_x(svptrue_b64(),
+ // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
+ // index * 2))
+ SDValue One = DAG.getConstant(1, DL, MVT::i64);
+ SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
+
+ // Create the vector 0,1,0,1,...
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
+                          DL, MVT::nxv2i64, Zero, One);
+ SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
+
+ // Create the vector idx64,idx64+1,idx64,idx64+1,...
+ SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
+ SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
+ SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
+
+ // Create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
+}
+
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6eee456b25c2..4923d3b77ae3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -745,6 +745,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index dae5adbd4303..eb1ed651256a 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -296,6 +296,179 @@ define double @clastb_n_f64(<vscale x 2 x i1> %pg, double %a, <vscale x 2 x doub
ret double %out
}
+;
+; DUPQ
+;
+
+define <vscale x 16 x i8> @dupq_i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: dupq_i8:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %a, i64 0)
+ ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @dupq_i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: dupq_i16:
+; CHECK: mov z0.q, z0.q[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %a, i64 1)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @dupq_i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: dupq_i32:
+; CHECK: mov z0.q, z0.q[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %a, i64 2)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupq_i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: dupq_i64:
+; CHECK: mov z0.q, z0.q[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 3)
+ ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @dupq_f16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: dupq_f16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %a, i64 0)
+ ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: dupq_f32:
+; CHECK: mov z0.q, z0.q[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %a, i64 1)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dupq_f64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: dupq_f64:
+; CHECK: mov z0.q, z0.q[2]
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %a, i64 2)
+ ret <vscale x 2 x double> %out
+}
+
+;
+; DUPQ_LANE
+;
+
+define <vscale x 16 x i8> @dupq_lane_i8(<vscale x 16 x i8> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i8:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK-NEXT: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %a, i64 %idx)
+ ret <vscale x 16 x i8> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x i16> @dupq_lane_i16(<vscale x 8 x i16> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %a, i64 %idx)
+ ret <vscale x 8 x i16> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 4 x i32> @dupq_lane_i32(<vscale x 4 x i32> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i32:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %a, i64 %idx)
+ ret <vscale x 4 x i32> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 2 x i64> @dupq_lane_i64(<vscale x 2 x i64> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i64:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 %idx)
+ ret <vscale x 2 x i64> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %a, i64 %idx)
+ ret <vscale x 8 x half> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f32:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %a, i64 %idx)
+ ret <vscale x 4 x float> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 2 x double> @dupq_lane_f64(<vscale x 2 x double> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f64:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %a, i64 %idx)
+ ret <vscale x 2 x double> %out
+}
+
+; NOTE: Index out of range (0-3)
+define <vscale x 2 x i64> @dupq_i64_range(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: dupq_i64_range:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[Z3:z[0-9]+]].d, [[Z2]].d, #8
+; CHECK: tbl z0.d, { z0.d }, [[Z3]].d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 4)
+ ret <vscale x 2 x i64> %out
+}
+
;
; EXT
;
@@ -1616,6 +1789,14 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.compact.nxv2i64(<vscale x 2 x i1>,
declare <vscale x 4 x float> @llvm.aarch64.sve.compact.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.aarch64.sve.compact.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.ext.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ext.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ext.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)