[llvm-branch-commits] [llvm] 111f559 - [SVE][CodeGen] Call refineIndexType & refineUniformBase from visitMGATHER
Kerry McLaughlin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 7 05:31:17 PST 2020
Author: Kerry McLaughlin
Date: 2020-12-07T13:20:19Z
New Revision: 111f559bbd12c59b0ac450ea2feb8f6981705647
URL: https://github.com/llvm/llvm-project/commit/111f559bbd12c59b0ac450ea2feb8f6981705647
DIFF: https://github.com/llvm/llvm-project/commit/111f559bbd12c59b0ac450ea2feb8f6981705647.diff
LOG: [SVE][CodeGen] Call refineIndexType & refineUniformBase from visitMGATHER
The refineIndexType & refineUniformBase functions added by D90942 can also be used to
improve CodeGen of masked gathers.
These changes were split out from D91092.
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D92319
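
The effect shows up in the AArch64 tests below: when a masked gather forms its pointers from 32-bit offsets that are sign- or zero-extended, the extend is now folded into the gather's index type, so the explicit ptrue/sxtw (or "and ... #0xffffffff") instructions disappear and the extending ld1* addressing mode is used directly. A minimal IR sketch of such a gather, adapted from sve-masked-gather-32b-signed-scaled.ll (the function name is illustrative):

define <vscale x 2 x i64> @gather_sext_index(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; getelementptr sign-extends the 32-bit offsets to pointer width
  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)

With this patch the gather above lowers to a single "ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]". Likewise, refineUniformBase lets the X86 test14 changes below fold a splatted scalar base into the vgatherqps/vgatherdps base operand instead of adding it to every element of the index vector.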
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
llvm/test/CodeGen/X86/masked_gather_scatter.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5481c52a5b12..96baaabdb813 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9410,13 +9410,13 @@ bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) {
}
// Fold sext/zext of index into index type.
-bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
- SelectionDAG &DAG) {
+bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
+ bool Scaled, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Index.getOpcode() == ISD::ZERO_EXTEND) {
SDValue Op = Index.getOperand(0);
- MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
+ MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
Index = Op;
return true;
@@ -9425,7 +9425,7 @@ bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
if (Index.getOpcode() == ISD::SIGN_EXTEND) {
SDValue Op = Index.getOperand(0);
- MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
+ MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
Index = Op;
return true;
@@ -9494,11 +9494,30 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
SDValue Mask = MGT->getMask();
+ SDValue Chain = MGT->getChain();
+ SDValue Index = MGT->getIndex();
+ SDValue Scale = MGT->getScale();
+ SDValue PassThru = MGT->getPassThru();
+ SDValue BasePtr = MGT->getBasePtr();
SDLoc DL(N);
// Zap gathers with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
- return CombineTo(N, MGT->getPassThru(), MGT->getChain());
+ return CombineTo(N, PassThru, MGT->getChain());
+
+ if (refineUniformBase(BasePtr, Index, DAG)) {
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+ PassThru.getValueType(), DL, Ops,
+ MGT->getMemOperand(), MGT->getIndexType());
+ }
+
+ if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+ PassThru.getValueType(), DL, Ops,
+ MGT->getMemOperand(), MGT->getIndexType());
+ }
return SDValue();
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d729252c92d9..517f5e965157 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3894,6 +3894,9 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
+ if (getGatherScatterIndexIsExtended(Index))
+ Index = Index.getOperand(0);
+
SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
VTs, Ops);
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
index 747468ae3cf4..32dca0d26cdc 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -8,8 +8,6 @@
define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
@@ -22,8 +20,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
@@ -36,8 +32,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT: ret
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
@@ -48,8 +42,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32>
define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
@@ -60,8 +52,6 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32
define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
%ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
@@ -72,8 +62,6 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i
define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT: ret
%ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
@@ -84,10 +72,9 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -98,10 +85,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
index b214fcf15911..1fc048a3adf7 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -8,8 +8,6 @@
define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
@@ -22,12 +20,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
@@ -40,12 +33,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
@@ -58,12 +46,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
@@ -74,12 +57,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
@@ -90,12 +68,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
@@ -106,12 +79,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
@@ -122,10 +90,9 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: sxtb z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -136,13 +103,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -154,13 +117,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -188,18 +147,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
@@ -212,18 +160,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
@@ -234,18 +171,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
@@ -256,18 +182,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
@@ -291,19 +206,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
index d938567beb04..ada49b7fecbc 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -8,8 +8,7 @@
define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
@@ -22,8 +21,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
@@ -36,8 +34,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets.zext
@@ -48,8 +45,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32>
define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets.zext
@@ -60,8 +56,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32
define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets.zext
@@ -72,8 +67,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i
define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets.zext
@@ -84,8 +78,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
@@ -99,8 +92,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
@@ -118,14 +110,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32>
define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
@@ -138,14 +123,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %offsets.zext
@@ -156,14 +134,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32>
define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %offsets.zext
@@ -174,14 +145,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32
define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %offsets.zext
@@ -192,15 +156,8 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
index 7a47311484f8..61b8e3e53e23 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -8,8 +8,7 @@
define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
@@ -22,11 +21,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
@@ -40,11 +35,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
@@ -58,11 +49,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -74,11 +61,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -90,11 +73,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -106,11 +85,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -122,8 +97,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ret
@@ -137,11 +111,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
@@ -156,11 +126,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
@@ -179,14 +145,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32>
define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
@@ -199,18 +158,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
@@ -224,18 +172,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -247,18 +184,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -270,18 +196,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -293,15 +208,8 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
@@ -314,19 +222,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
index be8909201a83..3f4f54c5d839 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -16,10 +16,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %o
define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -32,10 +29,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %
define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -48,10 +42,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %
define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
@@ -62,10 +53,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %
define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
@@ -76,10 +64,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64>
define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
@@ -90,10 +75,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64>
define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
@@ -117,10 +99,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
@@ -134,10 +113,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 948928099d38..995e39f56355 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -765,45 +765,41 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
-; KNL_64-NEXT: vmovd %esi, %xmm1
-; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vmovq %xmm0, %rax
+; KNL_64-NEXT: vmovd %esi, %xmm0
+; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test14:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; KNL_32-NEXT: vmovd %xmm0, %eax
; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test14:
; SKX: # %bb.0:
-; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT: vpbroadcastd %esi, %ymm1
-; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
-; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovq %xmm0, %rax
+; SKX-NEXT: vpbroadcastd %esi, %ymm0
+; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
+; SKX-NEXT: vpsllq $2, %zmm0, %zmm0
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test14:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; SKX_32-NEXT: vmovd %xmm0, %eax
; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1