[llvm] [AArch64][GISel] Support SVE with 128-bit min-size for G_LOAD and G_STORE (PR #92130)

via llvm-commits llvm-commits at lists.llvm.org
Tue May 28 07:37:05 PDT 2024


https://github.com/Him188 updated https://github.com/llvm/llvm-project/pull/92130

>From a61fb1a3e989596d53c8f5c53dc1b8ca702cb7c6 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Tue, 7 May 2024 10:10:29 +0100
Subject: [PATCH 01/21] [AArch64][GISel] Support SVE with 128-bit min-size for
 G_LOAD and G_STORE

This patch adds basic support for scalable vector types in load and store instructions for AArch64 with GISel.
Only scalable vector types with a 128-bit base size are supported, e.g. <vscale x 4 x i32> and <vscale x 16 x i8>.

This patch adapts some ideas from a similar, abandoned patch: https://github.com/llvm/llvm-project/pull/72976.
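
For illustration, a minimal IR sketch of the kind of load/store this enables, mirroring the scalable_v4i32 test added below (parameter attributes trimmed):

  define void @scalable_v4i32(ptr %l0, ptr %l1) {
    ; Loads/stores of scalable vectors with a 128-bit base size now select the
    ; predicated SVE forms (ptrue/ld1w/st1w) under GlobalISel, as checked in
    ; sve-load-store.ll.
    %l3 = load <vscale x 4 x i32>, ptr %l0, align 16
    store <vscale x 4 x i32> %l3, ptr %l1, align 16
    ret void
  }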
---
 .../GlobalISel/GIMatchTableExecutorImpl.h     |  8 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  2 +-
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  2 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 14 ++-
 .../Target/AArch64/AArch64RegisterBanks.td    |  2 +-
 .../GISel/AArch64InstructionSelector.cpp      | 59 ++++++++++--
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 93 ++++++++++++++++++-
 .../GISel/AArch64PostLegalizerCombiner.cpp    |  6 +-
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 10 +-
 .../AArch64/GlobalISel/sve-load-store.ll      | 50 ++++++++++
 10 files changed, 221 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 4d147bf20c26a..29939d4619400 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -652,17 +652,17 @@ bool GIMatchTableExecutor::executeMatchTable(
       MachineMemOperand *MMO =
           *(State.MIs[InsnID]->memoperands_begin() + MMOIdx);
 
-      unsigned Size = MRI.getType(MO.getReg()).getSizeInBits();
+      const auto Size = MRI.getType(MO.getReg()).getSizeInBits();
       if (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT &&
-          MMO->getSizeInBits().getValue() != Size) {
+          MMO->getSizeInBits() != Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeLessThanLLT &&
-                 MMO->getSizeInBits().getValue() >= Size) {
+                 MMO->getSizeInBits().getValue() >= Size.getKnownMinValue()) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT &&
-                 MMO->getSizeInBits().getValue() <= Size)
+                 MMO->getSizeInBits().getValue() <= Size.getKnownMinValue())
         if (handleReject() == RejectAndGiveUp)
           return false;
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 653e7689b5774..141c7ee15fe39 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1080,7 +1080,7 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
   LLT Ty = MRI.getType(LdSt.getReg(0));
   LLT MemTy = LdSt.getMMO().getMemoryType();
   SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
-      {{MemTy, MemTy.getSizeInBits(), AtomicOrdering::NotAtomic}});
+      {{MemTy, MemTy.getSizeInBits().getKnownMinValue(), AtomicOrdering::NotAtomic}});
   unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode());
   SmallVector<LLT> OpTys;
   if (IndexedOpc == TargetOpcode::G_INDEXED_STORE)
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 6661127162e52..b14a004d5c4ac 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1413,7 +1413,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
 
 bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
   const StoreInst &SI = cast<StoreInst>(U);
-  if (DL->getTypeStoreSize(SI.getValueOperand()->getType()) == 0)
+  if (DL->getTypeStoreSize(SI.getValueOperand()->getType()).isZero())
     return true;
 
   ArrayRef<Register> Vals = getOrCreateVRegs(*SI.getValueOperand());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c1ca78af5cda8..e0be162e10a97 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26375,12 +26375,20 @@ bool AArch64TargetLowering::shouldLocalize(
   return TargetLoweringBase::shouldLocalize(MI, TTI);
 }
 
+static bool isScalableTySupported(const unsigned Op) {
+  return Op == Instruction::Load || Op == Instruction::Store;
+}
+
 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
-  if (Inst.getType()->isScalableTy())
-    return true;
+  const auto ScalableTySupported = isScalableTySupported(Inst.getOpcode());
+
+  // Fallback for scalable vectors
+  if (Inst.getType()->isScalableTy() && !ScalableTySupported) {
+      return true;
+  }
 
   for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
-    if (Inst.getOperand(i)->getType()->isScalableTy())
+    if (Inst.getOperand(i)->getType()->isScalableTy() && !ScalableTySupported)
       return true;
 
   if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
index 615ce7d51d9ba..9e2ed356299e2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -13,7 +13,7 @@
 def GPRRegBank : RegisterBank<"GPR", [XSeqPairsClass]>;
 
 /// Floating Point/Vector Registers: B, H, S, D, Q.
-def FPRRegBank : RegisterBank<"FPR", [QQQQ]>;
+def FPRRegBank : RegisterBank<"FPR", [QQQQ, ZPR]>;
 
 /// Conditional register: NZCV.
 def CCRegBank : RegisterBank<"CC", [CCR]>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 61f5bc2464ee5..bc47443c45c8e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -901,6 +901,27 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
   return GenericOpc;
 }
 
+/// Select the AArch64 opcode for the G_LOAD or G_STORE operation for scalable 
+/// vectors.
+/// \p ElementSize size of the element of the scalable vector
+static unsigned selectLoadStoreSVEOp(const unsigned GenericOpc,
+                                     const unsigned ElementSize) {
+  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
+  
+  switch (ElementSize) {
+    case 8:
+      return isStore ? AArch64::ST1B : AArch64::LD1B;
+    case 16:
+      return isStore ? AArch64::ST1H : AArch64::LD1H;
+    case 32:
+      return isStore ? AArch64::ST1W : AArch64::LD1W;
+    case 64:
+      return isStore ? AArch64::ST1D : AArch64::LD1D;
+  }
+  
+  return GenericOpc;
+}
+
 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
 /// to \p *To.
 ///
@@ -2853,8 +2874,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }
 
-    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
-    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
+    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue().getKnownMinValue();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue().getKnownMinValue();
     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
@@ -2906,9 +2927,23 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
+#ifndef NDEBUG
+    if (ValTy.isScalableVector()) {
+        assert(STI.hasSVE() 
+             && "Load/Store register operand is scalable vector "
+                "while SVE is not supported by the target");
+        // assert(RB.getID() == AArch64::SVRRegBankID 
+        //        && "Load/Store register operand is scalable vector "
+        //           "while its register bank is not SVR");
+    }
+#endif
+    
     // The code below doesn't support truncating stores, so we need to split it
     // again.
-    if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+    // Truncate only if type is not scalable vector
+    const bool NeedTrunc = !ValTy.isScalableVector() 
+                      && ValTy.getSizeInBits().getFixedValue() > MemSizeInBits;
+    if (isa<GStore>(LdSt) && NeedTrunc) {
       unsigned SubReg;
       LLT MemTy = LdSt.getMMO().getMemoryType();
       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
@@ -2921,7 +2956,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
                       .getReg(0);
       RBI.constrainGenericRegister(Copy, *RC, MRI);
       LdSt.getOperand(0).setReg(Copy);
-    } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+    } else if (isa<GLoad>(LdSt) && NeedTrunc) {
       // If this is an any-extending load from the FPR bank, split it into a regular
       // load + extend.
       if (RB.getID() == AArch64::FPRRegBankID) {
@@ -2951,10 +2986,19 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     // instruction with an updated opcode, or a new instruction.
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
       bool IsStore = isa<GStore>(I);
-      const unsigned NewOpc =
-          selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
+      unsigned NewOpc;
+      if (ValTy.isScalableVector()) {
+        NewOpc = selectLoadStoreSVEOp(I.getOpcode(), ValTy.getElementType().getSizeInBits());
+      } else {
+        NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
+      }
       if (NewOpc == I.getOpcode())
         return nullptr;
+
+      if (ValTy.isScalableVector()) {
+        // Add the predicate register operand
+        I.addOperand(MachineOperand::CreatePredicate(true));
+      }
       // Check if we can fold anything into the addressing mode.
       auto AddrModeFns =
           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
@@ -2970,6 +3014,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       Register CurValReg = I.getOperand(0).getReg();
       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
       NewInst.cloneMemRefs(I);
+      if (ValTy.isScalableVector()) {
+        NewInst.add(I.getOperand(1)); // Copy predicate register
+      }
       for (auto &Fn : *AddrModeFns)
         Fn(NewInst);
       I.eraseFromParent();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d4aac94d24f12..c4f5b75ce959f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -61,6 +61,79 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   const LLT v2s64 = LLT::fixed_vector(2, 64);
   const LLT v2p0 = LLT::fixed_vector(2, p0);
 
+  // Scalable vector sizes range from 128 to 2048
+  // Note that subtargets may not support the full range.
+  // See [ScalableVecTypes] below.
+  const LLT nxv16s8 = LLT::scalable_vector(16, s8);
+  const LLT nxv32s8 = LLT::scalable_vector(32, s8);
+  const LLT nxv64s8 = LLT::scalable_vector(64, s8);
+  const LLT nxv128s8 = LLT::scalable_vector(128, s8);
+  const LLT nxv256s8 = LLT::scalable_vector(256, s8);
+
+  const LLT nxv8s16 = LLT::scalable_vector(8, s16);
+  const LLT nxv16s16 = LLT::scalable_vector(16, s16);
+  const LLT nxv32s16 = LLT::scalable_vector(32, s16);
+  const LLT nxv64s16 = LLT::scalable_vector(64, s16);
+  const LLT nxv128s16 = LLT::scalable_vector(128, s16);
+
+  const LLT nxv4s32 = LLT::scalable_vector(4, s32); 
+  const LLT nxv8s32 = LLT::scalable_vector(8, s32); 
+  const LLT nxv16s32 = LLT::scalable_vector(16, s32); 
+  const LLT nxv32s32 = LLT::scalable_vector(32, s32);
+  const LLT nxv64s32 = LLT::scalable_vector(64, s32);
+
+  const LLT nxv2s64 = LLT::scalable_vector(2, s64);
+  const LLT nxv4s64 = LLT::scalable_vector(4, s64);
+  const LLT nxv8s64 = LLT::scalable_vector(8, s64);
+  const LLT nxv16s64 = LLT::scalable_vector(16, s64);
+  const LLT nxv32s64 = LLT::scalable_vector(32, s64);
+
+  const LLT nxv2p0 = LLT::scalable_vector(2, p0);
+  const LLT nxv4p0 = LLT::scalable_vector(4, p0);
+  const LLT nxv8p0 = LLT::scalable_vector(8, p0);
+  const LLT nxv16p0 = LLT::scalable_vector(16, p0);
+  const LLT nxv32p0 = LLT::scalable_vector(32, p0);
+
+  const auto ScalableVec128 = {
+    nxv16s8, nxv8s16, nxv4s32, nxv2s64, nxv2p0,
+  };
+  const auto ScalableVec256 = {
+    nxv32s8, nxv16s16, nxv8s32, nxv4s64, nxv4p0,
+  };
+  const auto ScalableVec512 = {
+    nxv64s8, nxv32s16, nxv16s32, nxv8s64, nxv8p0,
+  };
+  const auto ScalableVec1024 = {
+    nxv128s8, nxv64s16, nxv32s32, nxv16s64, nxv16p0,
+  };
+  const auto ScalableVec2048 = {
+    nxv256s8, nxv128s16, nxv64s32, nxv32s64, nxv32p0,
+  };
+
+  /// Scalable vector types supported by the sub target.
+  /// Empty if SVE is not supported.
+  SmallVector<LLT> ScalableVecTypes;
+  
+  if (ST.hasSVE()) {
+    // Add scalable vector types that are supported by the subtarget
+    const auto MinSize = ST.getMinSVEVectorSizeInBits();
+    auto MaxSize = ST.getMaxSVEVectorSizeInBits();
+    if (MaxSize == 0) {
+      // Unknown max size, assume the target supports all sizes.
+      MaxSize = 2048; 
+    }
+    if (MinSize <= 128 && 128 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec128);
+    if (MinSize <= 256 && 256 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec256);
+    if (MinSize <= 512 && 512 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec512);
+    if (MinSize <= 1024 && 1024 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec1024);
+    if (MinSize <= 2048 && 2048 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec2048);
+  }
+
   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                         v16s8, v8s16, v4s32,
                                                         v2s64, v2p0,
@@ -329,6 +402,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
   };
 
+  const auto IsSameScalableVecTy = [=](const LegalityQuery &Query) {
+    // Legal if loading a scalable vector type
+    // into a scalable vector register of the exactly same type
+    if (!Query.Types[0].isScalableVector() || Query.Types[1] != p0)
+      return false;
+    if (Query.MMODescrs[0].MemoryTy != Query.Types[0])
+      return false;
+    if (Query.MMODescrs[0].AlignInBits < 128)
+      return false;
+    return is_contained(ScalableVecTypes, Query.Types[0]);
+  };
+
   getActionDefinitionsBuilder(G_LOAD)
       .customIf([=](const LegalityQuery &Query) {
         return HasRCPC3 && Query.Types[0] == s128 &&
@@ -354,6 +439,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // These extends are also legal
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
+      .legalIf(IsSameScalableVecTy)
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
       .clampMaxNumElements(0, s8, 16)
       .clampMaxNumElements(0, s16, 8)
@@ -398,7 +484,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
            {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
-           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
+           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8},
+          })
+      .legalIf(IsSameScalableVecTy)
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
@@ -440,8 +528,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           {p0, v4s32, v4s32, 8},
           {p0, v2s64, v2s64, 8},
           {p0, v2p0, v2p0, 8},
-          {p0, s128, s128, 8},
-      })
+          {p0, s128, s128, 8}})
       .unsupported();
 
   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index d8ca5494ba50a..5830489e8ef90 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -309,7 +309,7 @@ bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
   if (!Store.isSimple())
     return false;
   LLT ValTy = MRI.getType(Store.getValueReg());
-  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
+  if (!ValTy.isVector() || ValTy.getSizeInBits().getKnownMinValue() != 128)
     return false;
   if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
     return false; // Don't split truncating stores.
@@ -657,8 +657,8 @@ bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
         Register PtrBaseReg;
         APInt Offset;
         LLT StoredValTy = MRI.getType(St->getValueReg());
-        unsigned ValSize = StoredValTy.getSizeInBits();
-        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
+        const auto ValSize = StoredValTy.getSizeInBits();
+        if (ValSize.getKnownMinValue() < 32 || St->getMMO().getSizeInBits() != ValSize)
           continue;
 
         Register PtrReg = St->getPointerReg();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 44ba9f0429e67..f249729b4b4ab 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -257,6 +257,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case AArch64::QQRegClassID:
   case AArch64::QQQRegClassID:
   case AArch64::QQQQRegClassID:
+  case AArch64::ZPRRegClassID:
     return getRegBank(AArch64::FPRRegBankID);
   case AArch64::GPR32commonRegClassID:
   case AArch64::GPR32RegClassID:
@@ -740,11 +741,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     LLT Ty = MRI.getType(MO.getReg());
     if (!Ty.isValid())
       continue;
-    OpSize[Idx] = Ty.getSizeInBits();
+    OpSize[Idx] = Ty.getSizeInBits().getKnownMinValue();
 
-    // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
+    // As a top-level guess, scalable vectors go in SVRs, non-scalable
+    // vectors go in FPRs, scalars and pointers in GPRs.
     // For floating-point instructions, scalars go in FPRs.
-    if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) ||
+    if (Ty.isScalableVector()) 
+      OpRegBankIdx[Idx] = PMI_FirstFPR;
+    else if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) ||
         Ty.getSizeInBits() > 64)
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
new file mode 100644
index 0000000000000..7a794387eb011
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel < %s | FileCheck %s
+
+define void @scalable_v16i8(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+; CHECK-LABEL: scalable_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
+  store <vscale x 16 x i8> %l3, ptr %l1, align 16
+  ret void
+}
+
+define void @scalable_v8i16(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+; CHECK-LABEL: scalable_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 8 x i16>, ptr %l0, align 16
+  store <vscale x 8 x i16> %l3, ptr %l1, align 16
+  ret void
+}
+
+define void @scalable_v4i32(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+; CHECK-LABEL: scalable_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 4 x i32>, ptr %l0, align 16
+  store <vscale x 4 x i32> %l3, ptr %l1, align 16
+  ret void
+}
+
+define void @scalable_v2i64(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+; CHECK-LABEL: scalable_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    ret
+  %l3 = load <vscale x 2 x i64>, ptr %l0, align 16
+  store <vscale x 2 x i64> %l3, ptr %l1, align 16
+  ret void
+}

>From 135c91fd128ad8b60d56edacfe4fb0bb748de307 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Tue, 14 May 2024 16:28:32 +0100
Subject: [PATCH 02/21] Remove unnecessary attributes in tests

---
 llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
index 7a794387eb011..4c3ffb99e5667 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel < %s | FileCheck %s
 
-define void @scalable_v16i8(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+define void @scalable_v16i8(ptr %l0, ptr %l1) {
 ; CHECK-LABEL: scalable_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
@@ -13,7 +13,7 @@ define void @scalable_v16i8(ptr noalias nocapture noundef %l0, ptr noalias nocap
   ret void
 }
 
-define void @scalable_v8i16(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+define void @scalable_v8i16(ptr %l0, ptr %l1) {
 ; CHECK-LABEL: scalable_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
@@ -25,7 +25,7 @@ define void @scalable_v8i16(ptr noalias nocapture noundef %l0, ptr noalias nocap
   ret void
 }
 
-define void @scalable_v4i32(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+define void @scalable_v4i32(ptr %l0, ptr %l1) {
 ; CHECK-LABEL: scalable_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
@@ -37,7 +37,7 @@ define void @scalable_v4i32(ptr noalias nocapture noundef %l0, ptr noalias nocap
   ret void
 }
 
-define void @scalable_v2i64(ptr noalias nocapture noundef %l0, ptr noalias nocapture noundef %l1) {
+define void @scalable_v2i64(ptr %l0, ptr %l1) {
 ; CHECK-LABEL: scalable_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d

>From e2ec9514876744627e99fecb7179accdc6969e4e Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Tue, 14 May 2024 16:30:37 +0100
Subject: [PATCH 03/21] Remove unnecessary `#ifndef` macro around assertions

---
 llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index bc47443c45c8e..1da6fce1aa283 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2927,16 +2927,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
-#ifndef NDEBUG
     if (ValTy.isScalableVector()) {
         assert(STI.hasSVE() 
              && "Load/Store register operand is scalable vector "
                 "while SVE is not supported by the target");
-        // assert(RB.getID() == AArch64::SVRRegBankID 
-        //        && "Load/Store register operand is scalable vector "
-        //           "while its register bank is not SVR");
     }
-#endif
     
     // The code below doesn't support truncating stores, so we need to split it
     // again.

>From 70b3f217a7dafa10dd9f3662c868d7a6a994f68e Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 15 May 2024 11:21:16 +0100
Subject: [PATCH 04/21] Legal only for sizes that are a multiple of 128

---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 93 +++----------------
 1 file changed, 13 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index c4f5b75ce959f..b93ed0e50e8dd 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -61,78 +61,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   const LLT v2s64 = LLT::fixed_vector(2, 64);
   const LLT v2p0 = LLT::fixed_vector(2, p0);
 
-  // Scalable vector sizes range from 128 to 2048
-  // Note that subtargets may not support the full range.
-  // See [ScalableVecTypes] below.
   const LLT nxv16s8 = LLT::scalable_vector(16, s8);
-  const LLT nxv32s8 = LLT::scalable_vector(32, s8);
-  const LLT nxv64s8 = LLT::scalable_vector(64, s8);
-  const LLT nxv128s8 = LLT::scalable_vector(128, s8);
-  const LLT nxv256s8 = LLT::scalable_vector(256, s8);
-
   const LLT nxv8s16 = LLT::scalable_vector(8, s16);
-  const LLT nxv16s16 = LLT::scalable_vector(16, s16);
-  const LLT nxv32s16 = LLT::scalable_vector(32, s16);
-  const LLT nxv64s16 = LLT::scalable_vector(64, s16);
-  const LLT nxv128s16 = LLT::scalable_vector(128, s16);
-
   const LLT nxv4s32 = LLT::scalable_vector(4, s32); 
-  const LLT nxv8s32 = LLT::scalable_vector(8, s32); 
-  const LLT nxv16s32 = LLT::scalable_vector(16, s32); 
-  const LLT nxv32s32 = LLT::scalable_vector(32, s32);
-  const LLT nxv64s32 = LLT::scalable_vector(64, s32);
-
   const LLT nxv2s64 = LLT::scalable_vector(2, s64);
-  const LLT nxv4s64 = LLT::scalable_vector(4, s64);
-  const LLT nxv8s64 = LLT::scalable_vector(8, s64);
-  const LLT nxv16s64 = LLT::scalable_vector(16, s64);
-  const LLT nxv32s64 = LLT::scalable_vector(32, s64);
-
   const LLT nxv2p0 = LLT::scalable_vector(2, p0);
-  const LLT nxv4p0 = LLT::scalable_vector(4, p0);
-  const LLT nxv8p0 = LLT::scalable_vector(8, p0);
-  const LLT nxv16p0 = LLT::scalable_vector(16, p0);
-  const LLT nxv32p0 = LLT::scalable_vector(32, p0);
-
-  const auto ScalableVec128 = {
-    nxv16s8, nxv8s16, nxv4s32, nxv2s64, nxv2p0,
-  };
-  const auto ScalableVec256 = {
-    nxv32s8, nxv16s16, nxv8s32, nxv4s64, nxv4p0,
-  };
-  const auto ScalableVec512 = {
-    nxv64s8, nxv32s16, nxv16s32, nxv8s64, nxv8p0,
-  };
-  const auto ScalableVec1024 = {
-    nxv128s8, nxv64s16, nxv32s32, nxv16s64, nxv16p0,
-  };
-  const auto ScalableVec2048 = {
-    nxv256s8, nxv128s16, nxv64s32, nxv32s64, nxv32p0,
-  };
-
-  /// Scalable vector types supported by the sub target.
-  /// Empty if SVE is not supported.
-  SmallVector<LLT> ScalableVecTypes;
-  
-  if (ST.hasSVE()) {
-    // Add scalable vector types that are supported by the subtarget
-    const auto MinSize = ST.getMinSVEVectorSizeInBits();
-    auto MaxSize = ST.getMaxSVEVectorSizeInBits();
-    if (MaxSize == 0) {
-      // Unknown max size, assume the target supports all sizes.
-      MaxSize = 2048; 
-    }
-    if (MinSize <= 128 && 128 <= MaxSize)
-      ScalableVecTypes.append(ScalableVec128);
-    if (MinSize <= 256 && 256 <= MaxSize)
-      ScalableVecTypes.append(ScalableVec256);
-    if (MinSize <= 512 && 512 <= MaxSize)
-      ScalableVecTypes.append(ScalableVec512);
-    if (MinSize <= 1024 && 1024 <= MaxSize)
-      ScalableVecTypes.append(ScalableVec1024);
-    if (MinSize <= 2048 && 2048 <= MaxSize)
-      ScalableVecTypes.append(ScalableVec2048);
-  }
 
   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                         v16s8, v8s16, v4s32,
@@ -402,17 +335,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
   };
 
-  const auto IsSameScalableVecTy = [=](const LegalityQuery &Query) {
-    // Legal if loading a scalable vector type
-    // into a scalable vector register of the exactly same type
-    if (!Query.Types[0].isScalableVector() || Query.Types[1] != p0)
-      return false;
-    if (Query.MMODescrs[0].MemoryTy != Query.Types[0])
-      return false;
-    if (Query.MMODescrs[0].AlignInBits < 128)
-      return false;
-    return is_contained(ScalableVecTypes, Query.Types[0]);
-  };
+  if (ST.hasSVE()) {
+    for (const auto OpCode : {G_LOAD, G_STORE}) {
+      getActionDefinitionsBuilder(OpCode)
+      .legalForTypesWithMemDesc({
+        // 128 bit base sizes
+        {nxv16s8, p0, nxv16s8, 128},
+        {nxv8s16, p0, nxv8s16, 128},
+        {nxv4s32, p0, nxv4s32, 128},
+        {nxv2s64, p0, nxv2s64, 128},
+        {nxv2p0, p0, nxv2p0, 128},
+      });
+    }
+  }
 
   getActionDefinitionsBuilder(G_LOAD)
       .customIf([=](const LegalityQuery &Query) {
@@ -439,7 +374,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // These extends are also legal
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
-      .legalIf(IsSameScalableVecTy)
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
       .clampMaxNumElements(0, s8, 16)
       .clampMaxNumElements(0, s16, 8)
@@ -486,7 +420,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8},
           })
-      .legalIf(IsSameScalableVecTy)
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&

>From 73a618b6fc68f9b1c61760ee771f67ffca6657a0 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 15 May 2024 16:06:45 +0100
Subject: [PATCH 05/21] Update comments on FPRRegBank in
 AArch64RegisterBanks.td

---
 llvm/lib/Target/AArch64/AArch64RegisterBanks.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
index 9e2ed356299e2..2b597b8606921 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -12,7 +12,7 @@
 /// General Purpose Registers: W, X.
 def GPRRegBank : RegisterBank<"GPR", [XSeqPairsClass]>;
 
-/// Floating Point/Vector Registers: B, H, S, D, Q.
+/// Floating Point, Vector, Scalable Vector Registers: B, H, S, D, Q, Z.
 def FPRRegBank : RegisterBank<"FPR", [QQQQ, ZPR]>;
 
 /// Conditional register: NZCV.

>From b19d1b47c8f0ead007f45888364cbe0000e94723 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Thu, 16 May 2024 11:52:41 +0100
Subject: [PATCH 06/21] Add option `aarch64-enable-sve-gisel` to allow SVE in
 GISel, disabled by default

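The flag defaults to false, so scalable vector types still fall back to
SelectionDAG unless it is passed explicitly. A usage sketch, taken from the
RUN line this patch updates in sve-load-store.ll:

  ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -aarch64-enable-sve-gisel=true < %s | FileCheck %s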
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 39 +++++++++++--------
 .../AArch64/GlobalISel/sve-load-store.ll      |  2 +-
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e0be162e10a97..40ce9152d4748 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -145,6 +145,15 @@ static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
 static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                  cl::desc("Maximum of xors"));
 
+// By turning this on, we will not fall back to DAG ISel when encountering
+// scalable vector types for any instruction, even if SVE is not yet supported
+// for some instructions.
+// See [AArch64TargetLowering::fallBackToDAGISel] for implementation details.
+static cl::opt<bool> EnableSVEGISel(
+    "aarch64-enable-sve-gisel", cl::Hidden,
+    cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
+    cl::init(false));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -26375,26 +26384,24 @@ bool AArch64TargetLowering::shouldLocalize(
   return TargetLoweringBase::shouldLocalize(MI, TTI);
 }
 
-static bool isScalableTySupported(const unsigned Op) {
-  return Op == Instruction::Load || Op == Instruction::Store;
-}
-
 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
-  const auto ScalableTySupported = isScalableTySupported(Inst.getOpcode());
-
-  // Fallback for scalable vectors
-  if (Inst.getType()->isScalableTy() && !ScalableTySupported) {
+  // Fallback for scalable vectors.
+  // Note that if EnableSVEGISel is true, we allow scalable vector types for
+  // all instructions, regardless of whether they are actually supported.
+  if (!EnableSVEGISel) {
+    if (Inst.getType()->isScalableTy()) {
       return true;
-  }
+    }
 
-  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
-    if (Inst.getOperand(i)->getType()->isScalableTy() && !ScalableTySupported)
-      return true;
+    for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+      if (Inst.getOperand(i)->getType()->isScalableTy())
+        return true;
 
-  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
-    if (AI->getAllocatedType()->isScalableTy())
-      return true;
-  }
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+      if (AI->getAllocatedType()->isScalableTy())
+        return true;
+    }
+  } 
 
   // Checks to allow the use of SME instructions
   if (auto *Base = dyn_cast<CallBase>(&Inst)) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
index 4c3ffb99e5667..5f41bd2b129df 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -aarch64-enable-sve-gisel=true < %s | FileCheck %s
 
 define void @scalable_v16i8(ptr %l0, ptr %l1) {
 ; CHECK-LABEL: scalable_v16i8:

>From fb48ea14e6e6a95b117bd86d17b147f42bc5dc44 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Mon, 20 May 2024 13:51:36 +0100
Subject: [PATCH 07/21] Explicitly assign scalable and non-scalable vectors
 into FPR

---
 llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index f249729b4b4ab..c44cc45e8b871 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -743,12 +743,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       continue;
     OpSize[Idx] = Ty.getSizeInBits().getKnownMinValue();
 
-    // As a top-level guess, scalable vectors go in SVRs, non-scalable
-    // vectors go in FPRs, scalars and pointers in GPRs.
+    // As a top-level guess, vectors including both scalable and non-scalable
+    // ones go in FPRs, scalars and pointers in GPRs.
     // For floating-point instructions, scalars go in FPRs.
-    if (Ty.isScalableVector()) 
+    if (Ty.isVector())
       OpRegBankIdx[Idx] = PMI_FirstFPR;
-    else if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) ||
+    else if (isPreISelGenericFloatingPointOpcode(Opc) ||
         Ty.getSizeInBits() > 64)
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else

>From 9dedf0e096c6c8c156d1d263c517c8228b62dc8d Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Tue, 21 May 2024 13:59:33 +0100
Subject: [PATCH 08/21] Use getActionDefinitionsBuilder only once

---
 .../Target/AArch64/GISel/AArch64LegalizerInfo.cpp    | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index b93ed0e50e8dd..b7df5a9291373 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -335,10 +335,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
   };
 
+  auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
+  auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
+
   if (ST.hasSVE()) {
-    for (const auto OpCode : {G_LOAD, G_STORE}) {
-      getActionDefinitionsBuilder(OpCode)
-      .legalForTypesWithMemDesc({
+    for (auto *Actions : {&LoadActions, &StoreActions}) {
+      Actions->legalForTypesWithMemDesc({
         // 128 bit base sizes
         {nxv16s8, p0, nxv16s8, 128},
         {nxv8s16, p0, nxv8s16, 128},
@@ -349,7 +351,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     }
   }
 
-  getActionDefinitionsBuilder(G_LOAD)
+  LoadActions
       .customIf([=](const LegalityQuery &Query) {
         return HasRCPC3 && Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
@@ -399,7 +401,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .customIf(IsPtrVecPred)
       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
 
-  getActionDefinitionsBuilder(G_STORE)
+  StoreActions
       .customIf([=](const LegalityQuery &Query) {
         return HasRCPC3 && Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;

>From ad331f96019728bf442d0a785572f975acd62d51 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 09:37:56 +0100
Subject: [PATCH 09/21] Replace TypeSize usages

---
 .../llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 29939d4619400..554bef8406989 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -652,17 +652,17 @@ bool GIMatchTableExecutor::executeMatchTable(
       MachineMemOperand *MMO =
           *(State.MIs[InsnID]->memoperands_begin() + MMOIdx);
 
-      const auto Size = MRI.getType(MO.getReg()).getSizeInBits();
+      const TypeSize Size = MRI.getType(MO.getReg()).getSizeInBits();
       if (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT &&
           MMO->getSizeInBits() != Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeLessThanLLT &&
-                 MMO->getSizeInBits().getValue() >= Size.getKnownMinValue()) {
+                 TypeSize::isKnownGE(MMO->getSizeInBits().getValue(), Size)) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT &&
-                 MMO->getSizeInBits().getValue() <= Size.getKnownMinValue())
+                 TypeSize::isKnownLE(MMO->getSizeInBits().getValue(), Size))
         if (handleReject() == RejectAndGiveUp)
           return false;
 

>From 0c3cce327b0477badd9455fc050e850fcd9d12dc Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 09:48:07 +0100
Subject: [PATCH 10/21] Simplify assertion

---
 .../Target/AArch64/GISel/AArch64InstructionSelector.cpp   | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 1da6fce1aa283..9a54306239a11 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2927,11 +2927,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
-    if (ValTy.isScalableVector()) {
-        assert(STI.hasSVE() 
-             && "Load/Store register operand is scalable vector "
-                "while SVE is not supported by the target");
-    }
+    assert((!ValTy.isScalableVector() || STI.hasSVE()) &&
+      "Load/Store register operand is scalable vector "
+      "while SVE is not supported by the target");
     
     // The code below doesn't support truncating stores, so we need to split it
     // again.

>From 45030749e348dfc62f85fee5a03eecda5a787aa9 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 10:09:18 +0100
Subject: [PATCH 11/21] Reformat code

---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  3 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  2 +-
 .../GISel/AArch64InstructionSelector.cpp      | 39 +++++-----
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 71 +++++++++----------
 .../GISel/AArch64PostLegalizerCombiner.cpp    |  3 +-
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  2 +-
 6 files changed, 62 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 141c7ee15fe39..0c886a052d059 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1080,7 +1080,8 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
   LLT Ty = MRI.getType(LdSt.getReg(0));
   LLT MemTy = LdSt.getMMO().getMemoryType();
   SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
-      {{MemTy, MemTy.getSizeInBits().getKnownMinValue(), AtomicOrdering::NotAtomic}});
+      {{MemTy, MemTy.getSizeInBits().getKnownMinValue(),
+        AtomicOrdering::NotAtomic}});
   unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode());
   SmallVector<LLT> OpTys;
   if (IndexedOpc == TargetOpcode::G_INDEXED_STORE)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 40ce9152d4748..cbd1b6a8e4792 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26401,7 +26401,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
       if (AI->getAllocatedType()->isScalableTy())
         return true;
     }
-  } 
+  }
 
   // Checks to allow the use of SME instructions
   if (auto *Base = dyn_cast<CallBase>(&Inst)) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 9a54306239a11..364cf72d198ef 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -901,24 +901,24 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
   return GenericOpc;
 }
 
-/// Select the AArch64 opcode for the G_LOAD or G_STORE operation for scalable 
+/// Select the AArch64 opcode for the G_LOAD or G_STORE operation for scalable
 /// vectors.
 /// \p ElementSize size of the element of the scalable vector
 static unsigned selectLoadStoreSVEOp(const unsigned GenericOpc,
                                      const unsigned ElementSize) {
   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
-  
+
   switch (ElementSize) {
-    case 8:
-      return isStore ? AArch64::ST1B : AArch64::LD1B;
-    case 16:
-      return isStore ? AArch64::ST1H : AArch64::LD1H;
-    case 32:
-      return isStore ? AArch64::ST1W : AArch64::LD1W;
-    case 64:
-      return isStore ? AArch64::ST1D : AArch64::LD1D;
+  case 8:
+    return isStore ? AArch64::ST1B : AArch64::LD1B;
+  case 16:
+    return isStore ? AArch64::ST1H : AArch64::LD1H;
+  case 32:
+    return isStore ? AArch64::ST1W : AArch64::LD1W;
+  case 64:
+    return isStore ? AArch64::ST1D : AArch64::LD1D;
   }
-  
+
   return GenericOpc;
 }
 
@@ -2875,7 +2875,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     }
 
     uint64_t MemSizeInBytes = LdSt.getMemSize().getValue().getKnownMinValue();
-    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue().getKnownMinValue();
+    unsigned MemSizeInBits =
+        LdSt.getMemSizeInBits().getValue().getKnownMinValue();
     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
@@ -2928,14 +2929,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
     assert((!ValTy.isScalableVector() || STI.hasSVE()) &&
-      "Load/Store register operand is scalable vector "
-      "while SVE is not supported by the target");
-    
+           "Load/Store register operand is scalable vector "
+           "while SVE is not supported by the target");
+
     // The code below doesn't support truncating stores, so we need to split it
     // again.
     // Truncate only if type is not scalable vector
-    const bool NeedTrunc = !ValTy.isScalableVector() 
-                      && ValTy.getSizeInBits().getFixedValue() > MemSizeInBits;
+    const bool NeedTrunc =
+        !ValTy.isScalableVector() &&
+        ValTy.getSizeInBits().getFixedValue() > MemSizeInBits;
     if (isa<GStore>(LdSt) && NeedTrunc) {
       unsigned SubReg;
       LLT MemTy = LdSt.getMMO().getMemoryType();
@@ -2981,7 +2983,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       bool IsStore = isa<GStore>(I);
       unsigned NewOpc;
       if (ValTy.isScalableVector()) {
-        NewOpc = selectLoadStoreSVEOp(I.getOpcode(), ValTy.getElementType().getSizeInBits());
+        NewOpc = selectLoadStoreSVEOp(I.getOpcode(),
+                                      ValTy.getElementType().getSizeInBits());
       } else {
         NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
       }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index b7df5a9291373..84da936ea3ea1 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -63,7 +63,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   const LLT nxv16s8 = LLT::scalable_vector(16, s8);
   const LLT nxv8s16 = LLT::scalable_vector(8, s16);
-  const LLT nxv4s32 = LLT::scalable_vector(4, s32); 
+  const LLT nxv4s32 = LLT::scalable_vector(4, s32);
   const LLT nxv2s64 = LLT::scalable_vector(2, s64);
   const LLT nxv2p0 = LLT::scalable_vector(2, p0);
 
@@ -341,12 +341,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   if (ST.hasSVE()) {
     for (auto *Actions : {&LoadActions, &StoreActions}) {
       Actions->legalForTypesWithMemDesc({
-        // 128 bit base sizes
-        {nxv16s8, p0, nxv16s8, 128},
-        {nxv8s16, p0, nxv8s16, 128},
-        {nxv4s32, p0, nxv4s32, 128},
-        {nxv2s64, p0, nxv2s64, 128},
-        {nxv2p0, p0, nxv2p0, 128},
+          // 128 bit base sizes
+          {nxv16s8, p0, nxv16s8, 128},
+          {nxv8s16, p0, nxv8s16, 128},
+          {nxv4s32, p0, nxv4s32, 128},
+          {nxv2s64, p0, nxv2s64, 128},
+          {nxv2p0, p0, nxv2p0, 128},
       });
     }
   }
@@ -410,18 +410,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
       })
-      .legalForTypesWithMemDesc(
-          {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
-           {s32, p0, s8, 8},                       // truncstorei8 from s32
-           {s64, p0, s8, 8},                       // truncstorei8 from s64
-           {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
-           {s64, p0, s16, 8},                      // truncstorei16 from s64
-           {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
-           {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
-           {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
-           {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
-           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8},
-          })
+      .legalForTypesWithMemDesc({
+          {s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
+          {s32, p0, s8, 8},                       // truncstorei8 from s32
+          {s64, p0, s8, 8},                       // truncstorei8 from s64
+          {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
+          {s64, p0, s16, 8},                      // truncstorei16 from s64
+          {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
+          {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
+          {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
+          {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
+          {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8},
+      })
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
@@ -447,23 +447,22 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // Idx 0 == Ptr, Idx 1 == Val
       // TODO: we can implement legalizations but as of now these are
       // generated in a very specific way.
-      .legalForTypesWithMemDesc({
-          {p0, s8, s8, 8},
-          {p0, s16, s16, 8},
-          {p0, s32, s8, 8},
-          {p0, s32, s16, 8},
-          {p0, s32, s32, 8},
-          {p0, s64, s64, 8},
-          {p0, p0, p0, 8},
-          {p0, v8s8, v8s8, 8},
-          {p0, v16s8, v16s8, 8},
-          {p0, v4s16, v4s16, 8},
-          {p0, v8s16, v8s16, 8},
-          {p0, v2s32, v2s32, 8},
-          {p0, v4s32, v4s32, 8},
-          {p0, v2s64, v2s64, 8},
-          {p0, v2p0, v2p0, 8},
-          {p0, s128, s128, 8}})
+      .legalForTypesWithMemDesc({{p0, s8, s8, 8},
+                                 {p0, s16, s16, 8},
+                                 {p0, s32, s8, 8},
+                                 {p0, s32, s16, 8},
+                                 {p0, s32, s32, 8},
+                                 {p0, s64, s64, 8},
+                                 {p0, p0, p0, 8},
+                                 {p0, v8s8, v8s8, 8},
+                                 {p0, v16s8, v16s8, 8},
+                                 {p0, v4s16, v4s16, 8},
+                                 {p0, v8s16, v8s16, 8},
+                                 {p0, v2s32, v2s32, 8},
+                                 {p0, v4s32, v4s32, 8},
+                                 {p0, v2s64, v2s64, 8},
+                                 {p0, v2p0, v2p0, 8},
+                                 {p0, s128, s128, 8}})
       .unsupported();
 
   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 5830489e8ef90..83dbf2077365c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -658,7 +658,8 @@ bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
         APInt Offset;
         LLT StoredValTy = MRI.getType(St->getValueReg());
         const auto ValSize = StoredValTy.getSizeInBits();
-        if (ValSize.getKnownMinValue() < 32 || St->getMMO().getSizeInBits() != ValSize)
+        if (ValSize.getKnownMinValue() < 32 ||
+            St->getMMO().getSizeInBits() != ValSize)
           continue;
 
         Register PtrReg = St->getPointerReg();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index c44cc45e8b871..4d2a7fd412135 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -749,7 +749,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     if (Ty.isVector())
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else if (isPreISelGenericFloatingPointOpcode(Opc) ||
-        Ty.getSizeInBits() > 64)
+             Ty.getSizeInBits() > 64)
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else
       OpRegBankIdx[Idx] = PMI_FirstGPR;

>From 22de2adb74f2b248b543673969eeb0207b60129f Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 10:13:55 +0100
Subject: [PATCH 12/21] Remove brackets from single statements

---
 .../lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 364cf72d198ef..5a990374c0ee7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2982,12 +2982,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
       bool IsStore = isa<GStore>(I);
       unsigned NewOpc;
-      if (ValTy.isScalableVector()) {
+      if (ValTy.isScalableVector())
         NewOpc = selectLoadStoreSVEOp(I.getOpcode(),
                                       ValTy.getElementType().getSizeInBits());
-      } else {
+      else
         NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
-      }
+
       if (NewOpc == I.getOpcode())
         return nullptr;
 

>From 3be060cf9f298554b6f50b676a42ca4ba4f09307 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 11:49:22 +0100
Subject: [PATCH 13/21] Revert formatting change

---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 57 ++++++++++---------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 84da936ea3ea1..867588a0f5143 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -410,18 +410,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
       })
-      .legalForTypesWithMemDesc({
-          {s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
-          {s32, p0, s8, 8},                       // truncstorei8 from s32
-          {s64, p0, s8, 8},                       // truncstorei8 from s64
-          {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
-          {s64, p0, s16, 8},                      // truncstorei16 from s64
-          {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
-          {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
-          {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
-          {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
-          {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8},
-      })
+      .legalForTypesWithMemDesc(
+          {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
+           {s32, p0, s8, 8},                       // truncstorei8 from s32
+           {s64, p0, s8, 8},                       // truncstorei8 from s64
+           {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
+           {s64, p0, s16, 8},                      // truncstorei16 from s64
+           {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
+           {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
+           {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
+           {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
+           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
@@ -447,22 +446,24 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // Idx 0 == Ptr, Idx 1 == Val
       // TODO: we can implement legalizations but as of now these are
       // generated in a very specific way.
-      .legalForTypesWithMemDesc({{p0, s8, s8, 8},
-                                 {p0, s16, s16, 8},
-                                 {p0, s32, s8, 8},
-                                 {p0, s32, s16, 8},
-                                 {p0, s32, s32, 8},
-                                 {p0, s64, s64, 8},
-                                 {p0, p0, p0, 8},
-                                 {p0, v8s8, v8s8, 8},
-                                 {p0, v16s8, v16s8, 8},
-                                 {p0, v4s16, v4s16, 8},
-                                 {p0, v8s16, v8s16, 8},
-                                 {p0, v2s32, v2s32, 8},
-                                 {p0, v4s32, v4s32, 8},
-                                 {p0, v2s64, v2s64, 8},
-                                 {p0, v2p0, v2p0, 8},
-                                 {p0, s128, s128, 8}})
+      .legalForTypesWithMemDesc({
+          {p0, s8, s8, 8},
+          {p0, s16, s16, 8},
+          {p0, s32, s8, 8},
+          {p0, s32, s16, 8},
+          {p0, s32, s32, 8},
+          {p0, s64, s64, 8},
+          {p0, p0, p0, 8},
+          {p0, v8s8, v8s8, 8},
+          {p0, v16s8, v16s8, 8},
+          {p0, v4s16, v4s16, 8},
+          {p0, v8s16, v8s16, 8},
+          {p0, v2s32, v2s32, 8},
+          {p0, v4s32, v4s32, 8},
+          {p0, v2s64, v2s64, 8},
+          {p0, v2p0, v2p0, 8},
+          {p0, s128, s128, 8},
+      })
       .unsupported();
 
   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {

>From b5c72d79d37d0c9ccd71256ca5b320e202b4860b Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 11:49:44 +0100
Subject: [PATCH 14/21] Skip SplitStoreZero128 for scalable vectors

---
 .../lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 83dbf2077365c..40f6ec36467c1 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -309,7 +309,9 @@ bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
   if (!Store.isSimple())
     return false;
   LLT ValTy = MRI.getType(Store.getValueReg());
-  if (!ValTy.isVector() || ValTy.getSizeInBits().getKnownMinValue() != 128)
+  if (ValTy.isScalableVector())
+    return false;
+  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
     return false;
   if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
     return false; // Don't split truncating stores.
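
As a rough illustration (not taken from the patch), matchSplitStoreZero128 is after 128-bit zero stores like the first one below; the scalable store has to be skipped because its size is only a known minimum of 128 bits:

define void @zero_stores(ptr %p, ptr %q) {
  ; Fixed 128-bit zero store: a candidate for being split into two 64-bit
  ; zero stores.
  store <4 x i32> zeroinitializer, ptr %p, align 16
  ; Scalable zero store: the in-memory size is vscale x 128 bits, so the
  ; combine now bails out before reaching the 128-bit size check.
  store <vscale x 4 x i32> zeroinitializer, ptr %q, align 16
  ret void
}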

>From 2518b20419982e48658b3180ffc356bea5272f2c Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 11:50:13 +0100
Subject: [PATCH 15/21] Skip optimizeConsecutiveMemOpAddressing for scalable
 vectors

---
 .../AArch64/GISel/AArch64PostLegalizerCombiner.cpp     | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 40f6ec36467c1..fe84d0e27189f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -655,13 +655,17 @@ bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
     // should only be in a single block.
     resetState();
     for (auto &MI : MBB) {
+      // Skip for scalable vectors
+      if (auto *LdSt = dyn_cast<GLoadStore>(&MI);
+          LdSt && MRI.getType(LdSt->getOperand(0).getReg()).isScalableVector())
+        continue;
+
       if (auto *St = dyn_cast<GStore>(&MI)) {
         Register PtrBaseReg;
         APInt Offset;
         LLT StoredValTy = MRI.getType(St->getValueReg());
-        const auto ValSize = StoredValTy.getSizeInBits();
-        if (ValSize.getKnownMinValue() < 32 ||
-            St->getMMO().getSizeInBits() != ValSize)
+        unsigned ValSize = StoredValTy.getSizeInBits();
+        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
           continue;
 
         Register PtrReg = St->getPointerReg();
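
Roughly speaking (illustration only, not from the patch), this combine buckets fixed-size stores at constant byte offsets from a shared base; with scalable stores the distance between slots is vscale-scaled, so they are now skipped up front:

define void @consecutive_scalable_stores(ptr %p, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
  store <vscale x 4 x i32> %a, ptr %p
  ; The offset to the next slot is vscale * 16 bytes, not a compile-time
  ; constant, so the addressing-mode bucketing does not apply here.
  %q = getelementptr <vscale x 4 x i32>, ptr %p, i64 1
  store <vscale x 4 x i32> %b, ptr %q
  ret void
}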

>From 5a288d896a431db36ff0196067fd81df4d76ceae Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 16:10:23 +0100
Subject: [PATCH 16/21] Rename option to aarch64-enable-gisel-sve

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp        | 2 +-
 llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cbd1b6a8e4792..93057ef87503c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -150,7 +150,7 @@ static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
 // with some instructions.
 // See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
 static cl::opt<bool> EnableSVEGISel(
-    "aarch64-enable-sve-gisel", cl::Hidden,
+    "aarch64-enable-gisel-sve", cl::Hidden,
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
index 5f41bd2b129df..95a5bfa4b038f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/sve-load-store.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -aarch64-enable-sve-gisel=true < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -aarch64-enable-gisel-sve=true < %s | FileCheck %s
 
 define void @scalable_v16i8(ptr %l0, ptr %l1) {
 ; CHECK-LABEL: scalable_v16i8:
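
For anyone trying this locally, a minimal standalone test along the same lines as the one above (illustrative, not part of the patch); the option still defaults to false (cl::init(false)), so the renamed flag has to be passed explicitly for SVE types to reach GlobalISel:

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -aarch64-enable-gisel-sve=true < %s -o -
define void @copy_nxv4i32(ptr %src, ptr %dst) {
  %v = load <vscale x 4 x i32>, ptr %src
  store <vscale x 4 x i32> %v, ptr %dst
  ret void
}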

>From f1a4d7bd34489ce9136826637d6fc4775c3990ed Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 22 May 2024 16:45:11 +0100
Subject: [PATCH 17/21] Unfold `hasSVE` loop for legalizer

---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 867588a0f5143..16e144dd83e6a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -339,16 +339,23 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
 
   if (ST.hasSVE()) {
-    for (auto *Actions : {&LoadActions, &StoreActions}) {
-      Actions->legalForTypesWithMemDesc({
-          // 128 bit base sizes
-          {nxv16s8, p0, nxv16s8, 128},
-          {nxv8s16, p0, nxv8s16, 128},
-          {nxv4s32, p0, nxv4s32, 128},
-          {nxv2s64, p0, nxv2s64, 128},
-          {nxv2p0, p0, nxv2p0, 128},
-      });
-    }
+    LoadActions.legalForTypesWithMemDesc({
+        // 128 bit base sizes
+        {nxv16s8, p0, nxv16s8, 128},
+        {nxv8s16, p0, nxv8s16, 128},
+        {nxv4s32, p0, nxv4s32, 128},
+        {nxv2s64, p0, nxv2s64, 128},
+        {nxv2p0, p0, nxv2p0, 128},
+    });
+
+    StoreActions.legalForTypesWithMemDesc({
+        // 128 bit base sizes
+        {nxv16s8, p0, nxv16s8, 128},
+        {nxv8s16, p0, nxv8s16, 128},
+        {nxv4s32, p0, nxv4s32, 128},
+        {nxv2s64, p0, nxv2s64, 128},
+        {nxv2p0, p0, nxv2p0, 128},
+    });
   }
 
   LoadActions

>From 09570db32ee4d1c0d2dde13a38c9212e0a91745f Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Fri, 24 May 2024 12:07:20 +0100
Subject: [PATCH 18/21] Revert instruction selector changes as they are already
 covered by tablegen

---
 .../GISel/AArch64InstructionSelector.cpp      | 55 ++-----------------
 1 file changed, 6 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 5a990374c0ee7..61f5bc2464ee5 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -901,27 +901,6 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
   return GenericOpc;
 }
 
-/// Select the AArch64 opcode for the G_LOAD or G_STORE operation for scalable
-/// vectors.
-/// \p ElementSize size of the element of the scalable vector
-static unsigned selectLoadStoreSVEOp(const unsigned GenericOpc,
-                                     const unsigned ElementSize) {
-  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
-
-  switch (ElementSize) {
-  case 8:
-    return isStore ? AArch64::ST1B : AArch64::LD1B;
-  case 16:
-    return isStore ? AArch64::ST1H : AArch64::LD1H;
-  case 32:
-    return isStore ? AArch64::ST1W : AArch64::LD1W;
-  case 64:
-    return isStore ? AArch64::ST1D : AArch64::LD1D;
-  }
-
-  return GenericOpc;
-}
-
 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
 /// to \p *To.
 ///
@@ -2874,9 +2853,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }
 
-    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue().getKnownMinValue();
-    unsigned MemSizeInBits =
-        LdSt.getMemSizeInBits().getValue().getKnownMinValue();
+    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
@@ -2928,17 +2906,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
-    assert((!ValTy.isScalableVector() || STI.hasSVE()) &&
-           "Load/Store register operand is scalable vector "
-           "while SVE is not supported by the target");
-
     // The code below doesn't support truncating stores, so we need to split it
     // again.
-    // Truncate only if type is not scalable vector
-    const bool NeedTrunc =
-        !ValTy.isScalableVector() &&
-        ValTy.getSizeInBits().getFixedValue() > MemSizeInBits;
-    if (isa<GStore>(LdSt) && NeedTrunc) {
+    if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
       unsigned SubReg;
       LLT MemTy = LdSt.getMMO().getMemoryType();
       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
@@ -2951,7 +2921,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
                       .getReg(0);
       RBI.constrainGenericRegister(Copy, *RC, MRI);
       LdSt.getOperand(0).setReg(Copy);
-    } else if (isa<GLoad>(LdSt) && NeedTrunc) {
+    } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
       // If this is an any-extending load from the FPR bank, split it into a regular
       // load + extend.
       if (RB.getID() == AArch64::FPRRegBankID) {
@@ -2981,20 +2951,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     // instruction with an updated opcode, or a new instruction.
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
       bool IsStore = isa<GStore>(I);
-      unsigned NewOpc;
-      if (ValTy.isScalableVector())
-        NewOpc = selectLoadStoreSVEOp(I.getOpcode(),
-                                      ValTy.getElementType().getSizeInBits());
-      else
-        NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
-
+      const unsigned NewOpc =
+          selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
       if (NewOpc == I.getOpcode())
         return nullptr;
-
-      if (ValTy.isScalableVector()) {
-        // Add the predicate register operand
-        I.addOperand(MachineOperand::CreatePredicate(true));
-      }
       // Check if we can fold anything into the addressing mode.
       auto AddrModeFns =
           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
@@ -3010,9 +2970,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       Register CurValReg = I.getOperand(0).getReg();
       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
       NewInst.cloneMemRefs(I);
-      if (ValTy.isScalableVector()) {
-        NewInst.add(I.getOperand(1)); // Copy predicate register
-      }
       for (auto &Fn : *AddrModeFns)
         Fn(NewInst);
       I.eraseFromParent();

>From 551fec4a77021c7d95ca7bde26496348f6bb5973 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Fri, 24 May 2024 14:22:04 +0100
Subject: [PATCH 19/21] Allow alignment of 8

---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 16e144dd83e6a..9ce9ca411dfea 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -341,20 +341,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   if (ST.hasSVE()) {
     LoadActions.legalForTypesWithMemDesc({
         // 128 bit base sizes
-        {nxv16s8, p0, nxv16s8, 128},
-        {nxv8s16, p0, nxv8s16, 128},
-        {nxv4s32, p0, nxv4s32, 128},
-        {nxv2s64, p0, nxv2s64, 128},
-        {nxv2p0, p0, nxv2p0, 128},
+        {nxv16s8, p0, nxv16s8, 8},
+        {nxv8s16, p0, nxv8s16, 8},
+        {nxv4s32, p0, nxv4s32, 8},
+        {nxv2s64, p0, nxv2s64, 8},
+        {nxv2p0, p0, nxv2p0, 8},
     });
 
     StoreActions.legalForTypesWithMemDesc({
         // 128 bit base sizes
-        {nxv16s8, p0, nxv16s8, 128},
-        {nxv8s16, p0, nxv8s16, 128},
-        {nxv4s32, p0, nxv4s32, 128},
-        {nxv2s64, p0, nxv2s64, 128},
-        {nxv2p0, p0, nxv2p0, 128},
+        {nxv16s8, p0, nxv16s8, 8},
+        {nxv8s16, p0, nxv8s16, 8},
+        {nxv4s32, p0, nxv4s32, 8},
+        {nxv2s64, p0, nxv2s64, 8},
+        {nxv2p0, p0, nxv2p0, 8},
     });
   }
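
The fourth field of these entries is the minimum alignment in bits, as in the fixed-vector entries earlier in this file, so relaxing it from 128 to 8 means byte-aligned scalable accesses are accepted as legal as well. A small illustration (not from the patch):

define <vscale x 4 x i32> @load_nxv4i32_align1(ptr %p) {
  ; Only byte alignment is known here; with the relaxed MemDesc entries this
  ; is still treated as a legal scalable load rather than being rejected.
  %v = load <vscale x 4 x i32>, ptr %p, align 1
  ret <vscale x 4 x i32> %v
}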
 

>From 5995c15754cdbd65054caef87c4a3cdd90e45266 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Tue, 28 May 2024 10:08:36 +0100
Subject: [PATCH 20/21] Remove legal rules for nxv2p0

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 9ce9ca411dfea..1df78e13cd716 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -65,7 +65,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   const LLT nxv8s16 = LLT::scalable_vector(8, s16);
   const LLT nxv4s32 = LLT::scalable_vector(4, s32);
   const LLT nxv2s64 = LLT::scalable_vector(2, s64);
-  const LLT nxv2p0 = LLT::scalable_vector(2, p0);
 
   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                         v16s8, v8s16, v4s32,
@@ -345,7 +344,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         {nxv8s16, p0, nxv8s16, 8},
         {nxv4s32, p0, nxv4s32, 8},
         {nxv2s64, p0, nxv2s64, 8},
-        {nxv2p0, p0, nxv2p0, 8},
     });
 
     StoreActions.legalForTypesWithMemDesc({
@@ -354,7 +352,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         {nxv8s16, p0, nxv8s16, 8},
         {nxv4s32, p0, nxv4s32, 8},
         {nxv2s64, p0, nxv2s64, 8},
-        {nxv2p0, p0, nxv2p0, 8},
     });
   }
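
That leaves scalable pointer vectors unhandled for now; the next patch adds a TODO (possibly via bitcastIf) for cases such as this sketch:

define <vscale x 2 x ptr> @load_nxv2p0(ptr %p) {
  ; No legal rule covers nxv2p0 any more, so GlobalISel does not handle this
  ; load yet and it goes down the fallback path.
  %v = load <vscale x 2 x ptr>, ptr %p
  ret <vscale x 2 x ptr> %v
}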
 

>From e53b252fd13d41255b8ab09094c3ad8bc607927c Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Tue, 28 May 2024 14:20:44 +0100
Subject: [PATCH 21/21] Add TODO for nxv2p0

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 1df78e13cd716..c472fd06ba373 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -346,6 +346,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         {nxv2s64, p0, nxv2s64, 8},
     });
 
+    // TODO: Add nxv2p0. Consider bitcastIf.
+    //       See #92130
+    //       https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
     StoreActions.legalForTypesWithMemDesc({
         // 128 bit base sizes
         {nxv16s8, p0, nxv16s8, 8},


