[llvm] ff2dd8a - [AArch64][SVE] Fold vector ZExt/SExt into gather loads where possible
Joe Ellis via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 16 08:10:22 PDT 2021
Author: Joe Ellis
Date: 2021-03-16T15:09:46Z
New Revision: ff2dd8a21251ba0e6d284c9823ff1118a23b59ae
URL: https://github.com/llvm/llvm-project/commit/ff2dd8a21251ba0e6d284c9823ff1118a23b59ae
DIFF: https://github.com/llvm/llvm-project/commit/ff2dd8a21251ba0e6d284c9823ff1118a23b59ae.diff
LOG: [AArch64][SVE] Fold vector ZExt/SExt into gather loads where possible
This commit adds a DAGCombine optimization that folds sxtw'd or uxtw'd
vector offsets into gather loads where possible.
As an example, the following code:
  #include <arm_sve.h>

  svuint64_t func(svbool_t pred, const int32_t *base, svint64_t offsets) {
    return svld1sw_gather_s64offset_u64(
        pred, base, svextw_s64_x(pred, offsets)
    );
  }
would previously lower to the following assembly:
sxtw z0.d, p0/m, z0.d
ld1sw { z0.d }, p0/z, [x0, z0.d]
ret
but now lowers to:
ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
ret
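
At the LLVM IR level, the pattern the new combine targets is an offset
produced by the llvm.aarch64.sve.sxtw (or .uxtw) intrinsic feeding one of
the SVE gather-load intrinsics. As a rough sketch (the function name @func
is illustrative; the shape mirrors the gld1sw_d_sxtw test added below), the
C example above corresponds to IR along these lines:

  ; Sign-extend the 64-bit offsets from their low 32 bits, then gather.
  ; The combine folds the extension into the load, selecting the sxtw
  ; addressing mode instead of emitting a separate SXTW instruction.
  define <vscale x 2 x i64> @func(<vscale x 2 x i1> %pred, i32* %base, <vscale x 2 x i64> %offsets) {
    %ext = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
                                                                  <vscale x 2 x i1> %pred,
                                                                  <vscale x 2 x i64> %offsets)
    %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pred,
                                                                         i32* %base,
                                                                         <vscale x 2 x i64> %ext)
    %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
    ret <vscale x 2 x i64> %res
  }

  declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
  declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)

The fold only applies when the extension's predicate matches the load's
predicate and the extension is from 32 bits, as checked in
performGLD1Combine below.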
Differential Revision: https://reviews.llvm.org/D97858
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 613895dd3625..e61a6edac34c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14400,6 +14400,63 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+
+ assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
+ Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
+ (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
+ Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
+ "Invalid opcode.");
+
+ const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
+
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Pg = N->getOperand(1);
+ SDValue Base = N->getOperand(2);
+ SDValue Offset = N->getOperand(3);
+ SDValue Ty = N->getOperand(4);
+
+ EVT ResVT = N->getValueType(0);
+
+ const auto OffsetOpc = Offset.getOpcode();
+ const bool OffsetIsZExt =
+ OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
+ const bool OffsetIsSExt =
+ OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
+
+ // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
+ if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
+ SDValue ExtPg = Offset.getOperand(0);
+ VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
+ EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
+
+ // If the predicate for the sign- or zero-extended offset is the
+ // same as the predicate used for this load and the sign-/zero-extension
+ // was from a 32-bit value, the extension can be folded into the load.
+ if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
+ SDValue UnextendedOffset = Offset.getOperand(1);
+
+ unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
+ if (Signed)
+ NewOpc = getSignExtendedGatherOpcode(NewOpc);
+
+ return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
+ {Chain, Pg, Base, UnextendedOffset, Ty});
+ }
+ }
+
+ return SDValue();
+}
+
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
@@ -15777,6 +15834,21 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case AArch64ISD::UZP1:
return performUzpCombine(N, DAG);
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLD1S_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
+ return performGLD1Combine(N, DAG);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
index 64cb89edd679..57778847b545 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
@@ -78,7 +78,194 @@ define <vscale x 2 x i64> @gld1sw_index(<vscale x 2 x i1> %pg, i32* %base, <vsca
ret <vscale x 2 x i64> %res
}
+;
+; LD1H, LD1W, LD1D: base + 64-bit sxtw'd scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_sxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_sxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit sxtw'd scaled offset
+; e.g. ld1sh z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_sxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_sxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1H, LD1W, LD1D: base + 64-bit uxtw'd scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_uxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_uxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit uxtw'd scaled offset
+; e.g. ld1sh z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_uxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_uxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
index 7cf641a26427..21c08d152ef3 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
@@ -100,8 +100,251 @@ define <vscale x 2 x i64> @gld1sw_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x
ret <vscale x 2 x i64> %res
}
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit sxtw'd unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_sxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_sxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_sxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SW, LD1SH: base + 64-bit sxtw'd unscaled offset
+; e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_sxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_sxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_sxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit uxtw'd unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_uxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_uxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_uxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SW, LD1SH: base + 64-bit uxtw'd unscaled offset
+; e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_uxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_uxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_uxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)