[llvm] 3f36561 - [SVE][CodeGen] Fix scalable vector issues in DAGTypeLegalizer::GenWidenVectorLoads

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 18 23:54:58 PDT 2020


Author: David Sherwood
Date: 2020-08-19T07:54:32+01:00
New Revision: 3f36561f69fd62c3f96e3575835e86187889859d

URL: https://github.com/llvm/llvm-project/commit/3f36561f69fd62c3f96e3575835e86187889859d
DIFF: https://github.com/llvm/llvm-project/commit/3f36561f69fd62c3f96e3575835e86187889859d.diff

LOG: [SVE][CodeGen] Fix scalable vector issues in DAGTypeLegalizer::GenWidenVectorLoads

In DAGTypeLegalizer::GenWidenVectorLoads the algorithm assumes it only
ever deals with fixed-width types, so the offsets for each individual
load never take 'vscale' into account. I've changed the code in that
function to use TypeSize instead of unsigned for tracking the remaining
load amount. In addition, I've changed the load loop to use the new
IncrementPointer helper function for updating the addresses in each
iteration, since this handles scalable vector types.
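
For reference, the rewritten load loop has roughly the shape below; this
is an abbreviated copy of the LegalizeVectorTypes.cpp hunk further down,
with comments added, rather than anything new:

  do {
    LdWidth -= NewVTWidth;
    // IncrementPointer advances BasePtr by one NewVT-sized step and keeps
    // the MachinePointerInfo in sync; for scalable NewVT the step is
    // expressed in vscale-scaled units rather than a fixed byte offset.
    IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
                     &ScaledOffset);

    if (LdWidth < NewVTWidth) {
      // The remaining width no longer fills NewVT; pick a narrower type.
      NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
                          LdAlign, WidthDiff.getKnownMinSize());
      NewVTWidth = NewVT.getSizeInBits();
    }

    // When the offset is scaled by vscale, fall back to the alignment
    // common to the original alignment and the known-minimum byte offset.
    Align NewAlign = ScaledOffset == 0
                         ? LD->getOriginalAlign()
                         : commonAlignment(LD->getAlign(), ScaledOffset);
    SDValue L =
        DAG.getLoad(NewVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo);
    LdChain.push_back(L.getValue(1));
    LdOps.push_back(L);
    LdOp = L;
  } while (LdWidth > NewVTWidth);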

Also, I've added report_fatal_error calls in GenWidenVectorExtLoads,
TargetLowering::scalarizeVectorLoad and TargetLowering::scalarizeVectorStore,
since these functions currently use a sequence of element-by-element
scalar loads/stores. In a similar vein, I've added a fatal error in
FindMemType for the case where we decide to return the element type for
a scalable vector type.
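
As a rough illustration, each of the new guards follows the same pattern
as the scalarizeVectorLoad hunk below:

  // Element-by-element scalarisation needs a compile-time element count,
  // which a scalable vector does not have, so fail loudly rather than
  // silently miscompile.
  if (SrcVT.isScalableVector())
    report_fatal_error("Cannot scalarize scalable vector loads");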

I've added new tests in

  CodeGen/AArch64/sve-split-load.ll
  CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll

for the changes in GenWidenVectorLoads.

Differential Revision: https://reviews.llvm.org/D85909

Added: 
    

Modified: 
    llvm/include/llvm/Support/TypeSize.h
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
    llvm/test/CodeGen/AArch64/sve-split-load.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index b8a3fa3b20c9..d5a3614155b8 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -145,6 +145,24 @@ class TypeSize {
     return *this;
   }
 
+  friend TypeSize operator-(const TypeSize &LHS, const TypeSize &RHS) {
+    assert(LHS.IsScalable == RHS.IsScalable &&
+           "Arithmetic using mixed scalable and fixed types");
+    return {LHS.MinSize - RHS.MinSize, LHS.IsScalable};
+  }
+
+  friend TypeSize operator/(const TypeSize &LHS, const TypeSize &RHS) {
+    assert(LHS.IsScalable == RHS.IsScalable &&
+           "Arithmetic using mixed scalable and fixed types");
+    return {LHS.MinSize / RHS.MinSize, LHS.IsScalable};
+  }
+
+  friend TypeSize operator%(const TypeSize &LHS, const TypeSize &RHS) {
+    assert(LHS.IsScalable == RHS.IsScalable &&
+           "Arithmetic using mixed scalable and fixed types");
+    return {LHS.MinSize % RHS.MinSize, LHS.IsScalable};
+  }
+
   // Return the minimum size with the assumption that the size is exact.
   // Use in places where a scalable size doesn't make sense (e.g. non-vector
   // types, or vectors in backends which don't support scalable vectors).

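As a quick illustration of the new operators (made-up values using the
existing TypeSize::Fixed/Scalable helpers, not anything from this patch):

  TypeSize WidenWidth = TypeSize::Scalable(256); // vscale x 256 bits
  TypeSize LdWidth    = TypeSize::Scalable(192); // vscale x 192 bits
  TypeSize Diff  = WidenWidth - LdWidth; // vscale x 64 bits
  TypeSize Ratio = WidenWidth / LdWidth; // MinSize 1, still flagged scalable
  TypeSize Rem   = WidenWidth % LdWidth; // vscale x 64 bits
  // Mixing scalable and fixed operands, e.g. TypeSize::Fixed(128) - LdWidth,
  // would trip the asserts above.
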
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 92bf89cc1054..353c2d7893f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4895,11 +4895,14 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
         (MemVTWidth <= Width ||
          (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
-      if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+      if (RetVT.getSizeInBits().getFixedSize() < MemVTWidth || MemVT == WidenVT)
         return MemVT;
     }
   }
 
+  if (Scalable)
+    report_fatal_error("Using element-wise loads and stores for widening "
+                       "operations is not supported for scalable vectors");
   return RetVT;
 }
 
@@ -4942,10 +4945,10 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
   // element type or scalar loads and then recombines it to the widen vector
   // type.
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
-  unsigned WidenWidth = WidenVT.getSizeInBits();
   EVT LdVT    = LD->getMemoryVT();
   SDLoc dl(LD);
   assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
   assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
 
   // Load information
@@ -4954,15 +4957,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
   MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
   AAMDNodes AAInfo = LD->getAAInfo();
 
-  int LdWidth = LdVT.getSizeInBits();
-  int WidthDiff = WidenWidth - LdWidth;
+  TypeSize LdWidth = LdVT.getSizeInBits();
+  TypeSize WidenWidth = WidenVT.getSizeInBits();
+  TypeSize WidthDiff = WidenWidth - LdWidth;
   // Allow wider loads if they are sufficiently aligned to avoid memory faults
   // and if the original load is simple.
   unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
 
   // Find the vector type that can load from.
-  EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
-  int NewVTWidth = NewVT.getSizeInBits();
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
+                          WidthDiff.getKnownMinSize());
+  TypeSize NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                              LD->getOriginalAlign(), MMOFlags, AAInfo);
   LdChain.push_back(LdOp.getValue(1));
@@ -4970,7 +4975,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
   // Check if we can load the element with one instruction.
   if (LdWidth <= NewVTWidth) {
     if (!NewVT.isVector()) {
-      unsigned NumElts = WidenWidth / NewVTWidth;
+      unsigned NumElts = WidenWidth.getFixedSize() / NewVTWidth.getFixedSize();
       EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
       SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
@@ -4978,8 +4983,9 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     if (NewVT == WidenVT)
       return LdOp;
 
-    assert(WidenWidth % NewVTWidth == 0);
-    unsigned NumConcat = WidenWidth / NewVTWidth;
+    // TODO: We don't currently have any tests that exercise this code path.
+    assert(WidenWidth.getFixedSize() % NewVTWidth.getFixedSize() == 0);
+    unsigned NumConcat = WidenWidth.getFixedSize() / NewVTWidth.getFixedSize();
     SmallVector<SDValue, 16> ConcatOps(NumConcat);
     SDValue UndefVal = DAG.getUNDEF(NewVT);
     ConcatOps[0] = LdOp;
@@ -4992,35 +4998,30 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
   SmallVector<SDValue, 16> LdOps;
   LdOps.push_back(LdOp);
 
-  LdWidth -= NewVTWidth;
-  unsigned Offset = 0;
-
-  while (LdWidth > 0) {
-    unsigned Increment = NewVTWidth / 8;
-    Offset += Increment;
-    BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, TypeSize::Fixed(Increment));
+  uint64_t ScaledOffset = 0;
+  MachinePointerInfo MPI = LD->getPointerInfo();
+  do {
+    LdWidth -= NewVTWidth;
+    IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
+                     &ScaledOffset);
 
-    SDValue L;
     if (LdWidth < NewVTWidth) {
       // The current type we are using is too large. Find a better size.
-      NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+      NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
+                          WidthDiff.getKnownMinSize());
       NewVTWidth = NewVT.getSizeInBits();
-      L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
-                      LD->getPointerInfo().getWithOffset(Offset),
-                      LD->getOriginalAlign(), MMOFlags, AAInfo);
-      LdChain.push_back(L.getValue(1));
-    } else {
-      L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
-                      LD->getPointerInfo().getWithOffset(Offset),
-                      LD->getOriginalAlign(), MMOFlags, AAInfo);
-      LdChain.push_back(L.getValue(1));
     }
 
+    Align NewAlign = ScaledOffset == 0
+                         ? LD->getOriginalAlign()
+                         : commonAlignment(LD->getAlign(), ScaledOffset);
+    SDValue L =
+        DAG.getLoad(NewVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo);
+    LdChain.push_back(L.getValue(1));
+
     LdOps.push_back(L);
     LdOp = L;
-
-    LdWidth -= NewVTWidth;
-  }
+  } while (LdWidth > NewVTWidth);
 
   // Build the vector from the load operations.
   unsigned End = LdOps.size();
@@ -5044,13 +5045,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     }
     ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
   }
+
   ConcatOps[--Idx] = LdOps[i];
   for (--i; i >= 0; --i) {
     EVT NewLdTy = LdOps[i].getValueType();
     if (NewLdTy != LdTy) {
       // Create a larger vector.
-      unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
-      assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+      unsigned NumOps =
+          (NewLdTy.getSizeInBits() / LdTy.getSizeInBits()).getKnownMinSize();
+      assert(
+          (NewLdTy.getSizeInBits() % LdTy.getSizeInBits()).getKnownMinSize() ==
+          0);
       SmallVector<SDValue, 16> WidenOps(NumOps);
       unsigned j = 0;
       for (; j != End-Idx; ++j)
@@ -5071,7 +5076,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                        makeArrayRef(&ConcatOps[Idx], End - Idx));
 
   // We need to fill the rest with undefs to build the vector.
-  unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+  unsigned NumOps = (WidenWidth / LdTy.getSizeInBits()).getKnownMinSize();
   SmallVector<SDValue, 16> WidenOps(NumOps);
   SDValue UndefVal = DAG.getUNDEF(LdTy);
   {
@@ -5094,6 +5099,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
   EVT LdVT    = LD->getMemoryVT();
   SDLoc dl(LD);
   assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
 
   // Load information
   SDValue Chain = LD->getChain();
@@ -5101,6 +5107,10 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
   MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
   AAMDNodes AAInfo = LD->getAAInfo();
 
+  if (LdVT.isScalableVector())
+    report_fatal_error("Generating widen scalable extending vector loads is "
+                       "not yet supported");
+
   EVT EltVT = WidenVT.getVectorElementType();
   EVT LdEltVT = LdVT.getVectorElementType();
   unsigned NumElts = LdVT.getVectorNumElements();

diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a76505861dee..96a4354f6ba0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6724,6 +6724,9 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
   EVT DstVT = LD->getValueType(0);
   ISD::LoadExtType ExtType = LD->getExtensionType();
 
+  if (SrcVT.isScalableVector())
+    report_fatal_error("Cannot scalarize scalable vector loads");
+
   unsigned NumElem = SrcVT.getVectorNumElements();
 
   EVT SrcEltVT = SrcVT.getScalarType();
@@ -6811,6 +6814,9 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
   SDValue Value = ST->getValue();
   EVT StVT = ST->getMemoryVT();
 
+  if (StVT.isScalableVector())
+    report_fatal_error("Cannot scalarize scalable vector stores");
+
   // The type of the data we want to save
   EVT RegVT = Value.getValueType();
   EVT RegSclVT = RegVT.getScalarType();

diff  --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
index abb7513b2a58..f83865ac0d07 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
@@ -104,3 +104,40 @@ define <vscale x 2 x i64> @ld1d_inbound(<vscale x 2 x i64>* %a) {
   %load = load <vscale x 2 x i64>, <vscale x 2 x i64>* %base
   ret <vscale x 2 x i64> %load
 }
+
+define void @load_nxv6f16(<vscale x 6 x half>* %a) {
+; CHECK-LABEL: load_nxv6f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load volatile <vscale x 6 x half>, <vscale x 6 x half>* %a
+  ret void
+}
+
+define void @load_nxv6f32(<vscale x 6 x float>* %a) {
+; CHECK-LABEL: load_nxv6f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load volatile <vscale x 6 x float>, <vscale x 6 x float>* %a
+  ret void
+}
+
+define void @load_nxv12f16(<vscale x 12 x half>* %a) {
+; CHECK-LABEL: load_nxv12f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load volatile <vscale x 12 x half>, <vscale x 12 x half>* %a
+  ret void
+}
+

diff  --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
index 5e64ff981286..a291412cec5f 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -24,6 +24,18 @@ define <vscale x 16 x i16> @load_split_i16(<vscale x 16 x i16>* %a) {
   ret <vscale x 16 x i16> %load
 }
 
+define <vscale x 24 x i16> @load_split_24i16(<vscale x 24 x i16>* %a) {
+; CHECK-LABEL: load_split_24i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 24 x i16>, <vscale x 24 x i16>* %a
+  ret <vscale x 24 x i16> %load
+}
+
 define <vscale x 32 x i16> @load_split_32i16(<vscale x 32 x i16>* %a) {
 ; CHECK-LABEL: load_split_32i16:
 ; CHECK:       // %bb.0:

