[llvm] 258dac4 - [SVE] Enable use of 32bit gather/scatter indices for fixed length vectors

Paul Walker via llvm-commits <llvm-commits at lists.llvm.org>
Sun May 22 05:09:36 PDT 2022


Author: Paul Walker
Date: 2022-05-22T12:32:30+01:00
New Revision: 258dac43d669bea0116124030b2dadf8ff9b922f

URL: https://github.com/llvm/llvm-project/commit/258dac43d669bea0116124030b2dadf8ff9b922f
DIFF: https://github.com/llvm/llvm-project/commit/258dac43d669bea0116124030b2dadf8ff9b922f.diff

LOG: [SVE] Enable use of 32bit gather/scatter indices for fixed length vectors

Differential Revision: https://reviews.llvm.org/D125193
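
For orientation, a minimal sketch (assumed names, not from the patch) of the
kind of source-level pattern this change affects:

#include <cstdint>

// Hypothetical scalar pattern, loosely mirroring the f16 tests below;
// uint16_t stands in for half. When vectorized for SVE fixed-length
// vectors, the guarded loads become a masked gather whose indices are
// <32 x i32>, which the middle end sign-extends to 64 bits before the
// combine below can strip the extension again.
void gather_f16_like(uint16_t *data, const uint16_t *base,
                     const int32_t *idxs, int n) {
  for (int i = 0; i < n; ++i)
    if (data[i] == 0)            // becomes the gather's predicate
      data[i] = base[idxs[i]];   // base + sext(idxs[i]) * 2
}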

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 5fc8757c55feb..a582838ff663e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1390,7 +1390,9 @@ class TargetLoweringBase {
 
   // Returns true if VT is a legal index type for masked gathers/scatters
   // on this target
-  virtual bool shouldRemoveExtendFromGSIndex(EVT VT) const { return false; }
+  virtual bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const {
+    return false;
+  }
 
   /// Return how the condition code should be treated: either it is legal, needs
   /// to be expanded to some other code sequence, or the target has a custom

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index eae9109fd1ab5..958f2dbc61459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10441,14 +10441,14 @@ bool refineUniformBase(SDValue &BasePtr, SDValue &Index, bool IndexIsScaled,
 }
 
 // Fold sext/zext of index into index type.
-bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType,
+bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType, EVT DataVT,
                      SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // It's always safe to look through zero extends.
   if (Index.getOpcode() == ISD::ZERO_EXTEND) {
     SDValue Op = Index.getOperand(0);
-    if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
+    if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
       IndexType = ISD::UNSIGNED_SCALED;
       Index = Op;
       return true;
@@ -10463,7 +10463,7 @@ bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType,
   if (Index.getOpcode() == ISD::SIGN_EXTEND &&
       ISD::isIndexTypeSigned(IndexType)) {
     SDValue Op = Index.getOperand(0);
-    if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
+    if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
       Index = Op;
       return true;
     }
@@ -10494,7 +10494,7 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
                                 MSC->isTruncatingStore());
   }
 
-  if (refineIndexType(Index, IndexType, DAG)) {
+  if (refineIndexType(Index, IndexType, StoreVal.getValueType(), DAG)) {
     SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
     return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
                                 DL, Ops, MSC->getMemOperand(), IndexType,
@@ -10590,7 +10590,7 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
         Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
   }
 
-  if (refineIndexType(Index, IndexType, DAG)) {
+  if (refineIndexType(Index, IndexType, N->getValueType(0), DAG)) {
     SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
     return DAG.getMaskedGather(
         DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,

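A standalone model of the refined combine above, under the assumption that a
vector type is reduced to its element width, minimum element count, and
scalability. This is not the LLVM API, just the control flow: passing DataVT
through lets the target hook compare index width against data width.

// Minimal model of refineIndexType (not the real SelectionDAG code).
enum class Ext { None, Sign, Zero };
enum class IndexType { SignedScaled, UnsignedScaled };
struct Vec { unsigned EltBits; unsigned MinElts; bool Scalable; };

// Stand-in for TLI.shouldRemoveExtendFromGSIndex(IndexVT, DataVT).
using RemoveExtendFn = bool (*)(Vec IndexVT, Vec DataVT);

bool refineIndexType(Ext IndexExt, IndexType &IT, Vec NarrowIndexVT,
                     Vec DataVT, RemoveExtendFn ShouldRemove) {
  // Looking through a zero extend is always safe; the narrow index is
  // simply reinterpreted as an unsigned offset.
  if (IndexExt == Ext::Zero && ShouldRemove(NarrowIndexVT, DataVT)) {
    IT = IndexType::UnsignedScaled;
    return true;
  }
  // A sign extend may only be removed when the index type is already
  // signed, since the narrow index keeps its signed interpretation.
  if (IndexExt == Ext::Sign && IT == IndexType::SignedScaled &&
      ShouldRemove(NarrowIndexVT, DataVT))
    return true;
  return false;
}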
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f8b71ee642877..f3f10c60447e2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4520,13 +4520,19 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
   return false;
 }
 
-bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
-  if (VT.getVectorElementType() == MVT::i32 &&
-      VT.getVectorElementCount().getKnownMinValue() >= 4 &&
-      !VT.isFixedLengthVector())
-    return true;
+bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
+                                                          EVT DataVT) const {
+  // SVE only supports implicit extension of 32-bit indices.
+  if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
+    return false;
 
-  return false;
+  // Indices cannot be smaller than the main data type.
+  if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
+    return false;
+
+  // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
+  // element container type, which would violate the previous clause.
+  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
 }
 
 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {

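The new AArch64 predicate reads as three clauses. A standalone restatement,
reusing the reduced Vec model from the previous sketch (HasSVE stands in for
Subtarget->hasSVE()):

struct Vec { unsigned EltBits; unsigned MinElts; bool Scalable; };

bool aarch64ShouldRemoveExtendFromGSIndex(bool HasSVE, Vec IndexVT,
                                          Vec DataVT) {
  // SVE gathers/scatters only implicitly extend 32-bit indices.
  if (!HasSVE || IndexVT.EltBits != 32)
    return false;
  // Indices must be at least as wide as the data elements, so i32
  // indices serve data of up to 32 bits; this is why the f64 tests
  // below keep their 64-bit index form.
  if (IndexVT.EltBits < DataVT.EltBits)
    return false;
  // Any fixed-length vector qualifies. A scalable vector with
  // "vscale * 2" or fewer elements sits in a 64-bit element container,
  // which would break the width rule above.
  return !DataVT.Scalable || DataVT.MinElts > 2;
}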
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6d6b7d44b57e2..52ead327d0bed 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1090,7 +1090,7 @@ class AArch64TargetLowering : public TargetLowering {
   }
 
   bool shouldExtendGSIndex(EVT VT, EVT &EltTy) const override;
-  bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+  bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 458b3f3a4073e..527753a6e22cb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -11656,7 +11656,8 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
   return Result;
 }
 
-bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
+bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
+                                                        EVT DataVT) const {
   return false;
 }
 

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 11a30c834fc23..1c461fb38ceb0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -560,7 +560,7 @@ class RISCVTargetLowering : public TargetLowering {
                                            const RISCVRegisterInfo *TRI);
   MVT getContainerForFixedLengthVector(MVT VT) const;
 
-  bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+  bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const override;
 
   bool isLegalElementTypeForRVV(Type *ScalarTy) const;
 

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 0922b4bd0eabb..69e063e122491 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -959,19 +959,16 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
 ; The above tests test the types, the below tests check that the addressing
 ; modes still function
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f16:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [x2, z1.d, lsl #1]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]
 ; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_2048-NEXT:    ret
@@ -985,18 +982,14 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f32:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
 ; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2]
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_2048-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
@@ -1009,7 +1002,6 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b,
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -1030,19 +1022,16 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_zext:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [x2, z1.d, lsl #1]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1]
 ; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_2048-NEXT:    ret
@@ -1056,19 +1045,16 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_sext:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [x2, z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw]
 ; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_2048-NEXT:    ret
@@ -1083,19 +1069,16 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_zext:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_2048-NEXT:    ld1h { z0.d }, p1/z, [x2, z1.d]
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw]
 ; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_2048-NEXT:    ret

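The improved gather sequences fold the extension into the addressing mode:
in "[x2, z1.s, sxtw #1]" each 32-bit index is sign-extended and scaled by
the 2-byte element size. A scalar model of one lane, for illustration only:

#include <cstdint>
#include <cstring>

// One lane of "ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]": sign-extend
// the 32-bit index, scale by 2, load; inactive lanes are zeroed under
// /z predication.
uint16_t gather_lane_sxtw1(const uint8_t *base, int32_t idx, bool active) {
  if (!active)
    return 0;
  int64_t offset = static_cast<int64_t>(idx) << 1; // sxtw, then #1 scale
  uint16_t v;
  std::memcpy(&v, base + offset, sizeof v);
  return v;
}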
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index 79d27b4ca7584..edf937ab562e5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -871,20 +871,17 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
 ; The above tests test the types, the below tests check that the addressing
 ; modes still function
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f16:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x2, z1.d, lsl #1]
+; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
 ; VBITS_GE_2048-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -895,18 +892,14 @@ define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f32:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
 ; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x2, z1.d, lsl #2]
+; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x2, z1.s, sxtw #2]
 ; VBITS_GE_2048-NEXT:    ret
   %vals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -917,7 +910,6 @@ define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f64:
 ; VBITS_GE_2048:       // %bb.0:
@@ -936,20 +928,17 @@ define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_zext:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x2, z1.d, lsl #1]
+; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
 ; VBITS_GE_2048-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -960,20 +949,17 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_sext:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x2, z1.d]
+; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw]
 ; VBITS_GE_2048-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -985,20 +971,17 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i
   ret void
 }
 
-; NOTE: This produces an non-optimal addressing mode due to a temporary workaround
 define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_zext:
 ; VBITS_GE_2048:       // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT:    ptrue p1.s, vl32
 ; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x2, z1.d]
+; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw]
 ; VBITS_GE_2048-NEXT:    ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b

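The scatter side is symmetric; the zext variants use "uxtw", so the scaled
offset is always non-negative. A scalar model of one store lane:

#include <cstdint>
#include <cstring>

// One lane of "st1h { z0.s }, p0, [x2, z1.s, uxtw #1]": zero-extend
// the 32-bit index, scale by 2, store; predicated-off lanes store
// nothing.
void scatter_lane_uxtw1(uint8_t *base, uint32_t idx, uint16_t val,
                        bool active) {
  if (!active)
    return;
  uint64_t offset = static_cast<uint64_t>(idx) << 1; // uxtw, then #1 scale
  std::memcpy(base + offset, &val, sizeof val);
}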