[llvm] 499c632 - [SVE] Code generation for fixed length vector loads & stores.

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 23 02:41:16 PDT 2020


Author: Paul Walker
Date: 2020-06-23T09:39:03Z
New Revision: 499c63288f4e3385e8d7311b214fb4f743e33234

URL: https://github.com/llvm/llvm-project/commit/499c63288f4e3385e8d7311b214fb4f743e33234
DIFF: https://github.com/llvm/llvm-project/commit/499c63288f4e3385e8d7311b214fb4f743e33234.diff

LOG: [SVE] Code generation for fixed length vector loads & stores.

Summary:
This patch adds base support for code generation of fixed length
vector operations targeting a known SVE vector length. To achieve
this, we lower fixed length vector operations to equivalent scalable
vector operations, whereby SVE predication is used to limit the
elements processed to those present within the fixed length vector.

Specifically, this patch implements load and store operations, which
are lowered to their masked counterparts as follows:

  V = load(Addr) =>
    V = extract_fixed_vector(masked_load(make_pred(V.NumElts), Addr))

  store(V, Addr) =>
    masked_store(insert_fixed_vector(V), make_pred(V.NumElts), Addr)
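
For illustration, based on the new tests added below: a 256-bit fixed
length load compiled with -aarch64-sve-vector-bits-min=256 is expected
to lower to a predicated SVE load along these lines (register numbers
are illustrative; the store case is analogous, using st1w with the
same vl-limited predicate):

  ; LLVM IR input (assumes the "+sve" target feature):
  define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
    %load = load <8 x float>, <8 x float>* %a
    ret <8 x float> %load
  }
  attributes #0 = { "target-features"="+sve" }

  ; Expected AArch64 codegen (sketch):
  ;   ptrue p0.s, vl8
  ;   ld1w  { z0.s }, p0/z, [x0]
  ;   ret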

Reviewers: rengolin, efriedma

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80385

Added: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5ff95808c3e8..88faa9b13c9e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1054,6 +1054,8 @@ bool TargetLowering::SimplifyDemandedBits(
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     uint64_t Idx = Op.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
@@ -2532,6 +2534,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
     uint64_t Idx = Op.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8eefbe62309e..94df3de50dbd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -926,6 +926,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::SELECT, VT, Custom);
       }
     }
+
+    // NOTE: Currently this has to happen after computeRegisterProperties rather
+    // than the preferred option of combining it with the addRegisterClass call.
+    if (useSVEForFixedLengthVectors()) {
+      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+        if (useSVEForFixedLengthVectorVT(VT))
+          addTypeForFixedLengthSVE(VT);
+      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+        if (useSVEForFixedLengthVectorVT(VT))
+          addTypeForFixedLengthSVE(VT);
+    }
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -1010,6 +1021,28 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   }
 }
 
+void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+  // By default everything must be expanded.
+  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+    setOperationAction(Op, VT, Expand);
+
+  // EXTRACT_SUBVECTOR/INSERT_SUBVECTOR are used to "cast" between scalable
+  // and fixed length vector types, although with the current level of support
+  // only the former is exercised.
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+  // Lower fixed length vector operations to scalable equivalents.
+  setOperationAction(ISD::LOAD, VT, Custom);
+  setOperationAction(ISD::STORE, VT, Custom);
+
+  // NOTE: This is a temporary measure to maintain functionality required by
+  // Analysis/CostModel/AArch64/sve-fixed-length.ll
+  setOperationAction(ISD::ADD, VT, Legal);
+  setOperationAction(ISD::FADD, VT, Legal);
+}
+
 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR64RegClass);
   addTypeForNEON(VT, MVT::v2i32);
@@ -3276,6 +3309,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
   EVT MemVT = StoreNode->getMemoryVT();
 
   if (VT.isVector()) {
+    if (useSVEForFixedLengthVectorVT(VT))
+      return LowerFixedLengthVectorStoreToSVE(Op, DAG);
+
     unsigned AS = StoreNode->getAddressSpace();
     unsigned Align = StoreNode->getAlignment();
     if (Align < MemVT.getStoreSize() &&
@@ -3481,6 +3517,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::LOAD:
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
+    llvm_unreachable("Unexpected Load.");
   }
 }
 
@@ -3489,18 +3529,20 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
   return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
 }
 
-bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(MVT VT) const {
-  assert(VT.isFixedLengthVector());
+bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
   if (!useSVEForFixedLengthVectors())
     return false;
 
+  if (!VT.isFixedLengthVector())
+    return false;
+
   // Fixed length predicates should be promoted to i8.
   // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
   if (VT.getVectorElementType() == MVT::i1)
     return false;
 
   // Don't use SVE for vectors we cannot scalarize if required.
-  switch (VT.getVectorElementType().SimpleTy) {
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
   default:
     return false;
   case MVT::i8:
@@ -14653,3 +14695,156 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
 
   return false;
 }
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE container");
+  case MVT::i8:
+    return EVT(MVT::nxv16i8);
+  case MVT::i16:
+    return EVT(MVT::nxv8i16);
+  case MVT::i32:
+    return EVT(MVT::nxv4i32);
+  case MVT::i64:
+    return EVT(MVT::nxv2i64);
+  case MVT::f16:
+    return EVT(MVT::nxv8f16);
+  case MVT::f32:
+    return EVT(MVT::nxv4f32);
+  case MVT::f64:
+    return EVT(MVT::nxv2f64);
+  }
+}
+
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+                                                EVT VT) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+
+  int PgPattern;
+  switch (VT.getVectorNumElements()) {
+  default:
+    llvm_unreachable("unexpected element count for SVE predicate");
+  case 1:
+    PgPattern = AArch64SVEPredPattern::vl1;
+    break;
+  case 2:
+    PgPattern = AArch64SVEPredPattern::vl2;
+    break;
+  case 4:
+    PgPattern = AArch64SVEPredPattern::vl4;
+    break;
+  case 8:
+    PgPattern = AArch64SVEPredPattern::vl8;
+    break;
+  case 16:
+    PgPattern = AArch64SVEPredPattern::vl16;
+    break;
+  case 32:
+    PgPattern = AArch64SVEPredPattern::vl32;
+    break;
+  case 64:
+    PgPattern = AArch64SVEPredPattern::vl64;
+    break;
+  case 128:
+    PgPattern = AArch64SVEPredPattern::vl128;
+    break;
+  case 256:
+    PgPattern = AArch64SVEPredPattern::vl256;
+    break;
+  }
+
+  // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
+  // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+  // variants of instructions when available.
+
+  MVT MaskVT;
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for SVE predicate");
+  case MVT::i8:
+    MaskVT = MVT::nxv16i1;
+    break;
+  case MVT::i16:
+  case MVT::f16:
+    MaskVT = MVT::nxv8i1;
+    break;
+  case MVT::i32:
+  case MVT::f32:
+    MaskVT = MVT::nxv4i1;
+    break;
+  case MVT::i64:
+  case MVT::f64:
+    MaskVT = MVT::nxv2i1;
+    break;
+  }
+
+  return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
+                     DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+}
+
+// Grow V to consume an entire SVE register.
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isScalableVector() &&
+         "Expected to convert into a scalable vector!");
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected a fixed length vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+  assert(VT.isFixedLengthVector() &&
+         "Expected to convert into a fixed length vector!");
+  assert(V.getValueType().isScalableVector() &&
+         "Expected a scalable vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Convert all fixed length vector loads larger than NEON to masked_loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
+  auto Load = cast<LoadSDNode>(Op);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+  auto NewLoad = DAG.getMaskedLoad(
+      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+      getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
+      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
+      Load->getExtensionType());
+
+  auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+  SDValue MergedValues[2] = {Result, Load->getChain()};
+  return DAG.getMergeValues(MergedValues, DL);
+}
+
+// Convert all fixed length vector stores larger than NEON to masked_stores.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
+    SDValue Op, SelectionDAG &DAG) const {
+  auto Store = cast<StoreSDNode>(Op);
+
+  SDLoc DL(Op);
+  EVT VT = Store->getValue().getValueType();
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+  return DAG.getMaskedStore(
+      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+      getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
+      Store->getMemOperand(), Store->getAddressingMode(),
+      Store->isTruncatingStore());
+}

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3baae53312fa..b7106adfa5f7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -713,6 +713,7 @@ class AArch64TargetLowering : public TargetLowering {
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
   void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+  void addTypeForFixedLengthSVE(MVT VT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
 
@@ -847,6 +848,9 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
                              EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
 
+  SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
@@ -912,7 +916,7 @@ class AArch64TargetLowering : public TargetLowering {
                       const TargetTransformInfo *TTI) const override;
 
   bool useSVEForFixedLengthVectors() const;
-  bool useSVEForFixedLengthVectorVT(MVT VT) const;
+  bool useSVEForFixedLengthVectorVT(EVT VT) const;
 };
 
 namespace AArch64 {

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
new file mode 100644
index 000000000000..8478cfa3772e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
@@ -0,0 +1,104 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
+
+; VBYTES represents the useful byte size of a vector register from the code
+; generator's point of view. It is clamped to power-of-2 values because
+; only power-of-2 vector lengths are considered legal, regardless of the
+; user specified vector length.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
+; CHECK-LABEL: load_v2f32:
+; CHECK: ldr d0, [x0]
+; CHECK: ret
+  %load = load <2 x float>, <2 x float>* %a
+  ret <2 x float> %load
+}
+
+; Don't use SVE for 128-bit vectors.
+define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
+; CHECK-LABEL: load_v4f32:
+; CHECK: ldr q0, [x0]
+; CHECK: ret
+  %load = load <4 x float>, <4 x float>* %a
+  ret <4 x float> %load
+}
+
+define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
+; CHECK-LABEL: load_v8f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
+; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; CHECK: ret
+  %load = load <8 x float>, <8 x float>* %a
+  ret <8 x float> %load
+}
+
+define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
+; CHECK-LABEL: load_v16f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; CHECK: ret
+  %load = load <16 x float>, <16 x float>* %a
+  ret <16 x float> %load
+}
+
+define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
+; CHECK-LABEL: load_v32f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A2]]]
+; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A3]]]
+; CHECK: ret
+  %load = load <32 x float>, <32 x float>* %a
+  ret <32 x float> %load
+}
+
+define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
+; CHECK-LABEL: load_v64f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
+; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
+; VBITS_LE_1024-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A1]]]
+; VBITS_LE_512-DAG:  add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_512-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A2]]]
+; VBITS_LE_512-DAG:  add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_512-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A3]]]
+; VBITS_LE_256-DAG:  add x[[A4:[0-9]+]], x0, #[[#mul(VBYTES,4)]]
+; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A4]]]
+; VBITS_LE_256-DAG:  add x[[A5:[0-9]+]], x0, #[[#mul(VBYTES,5)]]
+; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A5]]]
+; VBITS_LE_256-DAG:  add x[[A6:[0-9]+]], x0, #[[#mul(VBYTES,6)]]
+; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A6]]]
+; VBITS_LE_256-DAG:  add x[[A7:[0-9]+]], x0, #[[#mul(VBYTES,7)]]
+; VBITS_LE_256-DAG:  ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x[[A7]]]
+; CHECK: ret
+  %load = load <64 x float>, <64 x float>* %a
+  ret <64 x float> %load
+}
+
+attributes #0 = { "target-features"="+sve" }

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
new file mode 100644
index 000000000000..3182d07cc6d9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
@@ -0,0 +1,104 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
+
+; VBYTES represents the useful byte size of a vector register from the code
+; generator's point of view. It is clamped to power-of-2 values because
+; only power-of-2 vector lengths are considered legal, regardless of the
+; user specified vector length.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define void @store_v2f32(<2 x float>* %a) #0 {
+; CHECK-LABEL: store_v2f32:
+; CHECK: str xzr, [x0]
+; CHECK: ret
+  store <2 x float> zeroinitializer, <2 x float>* %a
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @store_v4f32(<4 x float>* %a) #0 {
+; CHECK-LABEL: store_v4f32:
+; CHECK: stp xzr, xzr, [x0]
+; CHECK: ret
+  store <4 x float> zeroinitializer, <4 x float>* %a
+  ret void
+}
+
+define void @store_v8f32(<8 x float>* %a) #0 {
+; CHECK-LABEL: store_v8f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
+; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; CHECK: ret
+  store <8 x float> zeroinitializer, <8 x float>* %a
+  ret void
+}
+
+define void @store_v16f32(<16 x float>* %a) #0 {
+; CHECK-LABEL: store_v16f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_256-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; CHECK: ret
+  store <16 x float> zeroinitializer, <16 x float>* %a
+  ret void
+}
+
+define void @store_v32f32(<32 x float>* %a) #0 {
+; CHECK-LABEL: store_v32f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_512-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; VBITS_LE_256-DAG: add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A2]]]
+; VBITS_LE_256-DAG: add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A3]]]
+; CHECK: ret
+  store <32 x float> zeroinitializer, <32 x float>* %a
+  ret void
+}
+
+define void @store_v64f32(<64 x float>* %a) #0 {
+; CHECK-LABEL: store_v64f32:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
+; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
+; VBITS_LE_1024-DAG: add x[[A1:[0-9]+]], x0, #[[#VBYTES]]
+; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A1]]]
+; VBITS_LE_512-DAG:  add x[[A2:[0-9]+]], x0, #[[#mul(VBYTES,2)]]
+; VBITS_LE_512-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A2]]]
+; VBITS_LE_512-DAG:  add x[[A3:[0-9]+]], x0, #[[#mul(VBYTES,3)]]
+; VBITS_LE_512-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A3]]]
+; VBITS_LE_256-DAG:  add x[[A4:[0-9]+]], x0, #[[#mul(VBYTES,4)]]
+; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A4]]]
+; VBITS_LE_256-DAG:  add x[[A5:[0-9]+]], x0, #[[#mul(VBYTES,5)]]
+; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A5]]]
+; VBITS_LE_256-DAG:  add x[[A6:[0-9]+]], x0, #[[#mul(VBYTES,6)]]
+; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A6]]]
+; VBITS_LE_256-DAG:  add x[[A7:[0-9]+]], x0, #[[#mul(VBYTES,7)]]
+; VBITS_LE_256-DAG:  st1w { z{{[0-9]+}}.s }, [[PG]], [x[[A7]]]
+; CHECK: ret
+  store <64 x float> zeroinitializer, <64 x float>* %a
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }


        

