[llvm] 4ed47d5 - [SVE][CodeGen] Fix DAGCombiner::ForwardStoreValueToDirectLoad for scalable vectors

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 6 00:22:15 PDT 2020


Author: David Sherwood
Date: 2020-10-06T08:04:03+01:00
New Revision: 4ed47d50ea819fc3cf7341e86e947fef920743df

URL: https://github.com/llvm/llvm-project/commit/4ed47d50ea819fc3cf7341e86e947fef920743df
DIFF: https://github.com/llvm/llvm-project/commit/4ed47d50ea819fc3cf7341e86e947fef920743df.diff

LOG: [SVE][CodeGen] Fix DAGCombiner::ForwardStoreValueToDirectLoad for scalable vectors

In DAGCombiner::ForwardStoreValueToDirectLoad I have fixed up some
implicit casts from TypeSize -> uint64_t and replaced calls to
getVectorNumElements() with getVectorElementCount(). There are some
simple cases of forwarding that we can definitely support for
scalable vectors, i.e. when the store and load are both scalable
vectors and have the same size. I have added tests for the new
code paths here:

  CodeGen/AArch64/sve-forward-st-to-ld.ll

Differential Revision: https://reviews.llvm.org/D87098
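
As a rough illustration of the check described in the log above and in the
diff below (standalone C++ rather than the actual DAGCombiner/EVT code; MemVT
and canForwardStoreToLoad are made-up names for this sketch), forwarding is
only attempted when the load and store agree on scalability, and the scalable
case additionally requires zero offset, matching sizes and a little-endian
target:

  #include <cassert>
  #include <cstdint>

  // Illustrative stand-in for a memory value type: a known-minimum size in
  // bits plus a flag saying whether that size is scaled by vscale at run time.
  struct MemVT {
    uint64_t MinSizeInBits;
    bool IsScalable;
  };

  // Mirrors the shape of the new legality check: mixed fixed/scalable pairs
  // bail out, scalable-to-scalable forwarding needs an exact size match at
  // offset zero on a little-endian target, and the fixed-width path keeps the
  // original byte-range containment test.
  static bool canForwardStoreToLoad(MemVT StoreVT, MemVT LoadVT,
                                    int64_t OffsetInBytes, bool IsBigEndian) {
    if (StoreVT.IsScalable != LoadVT.IsScalable)
      return false;

    if (StoreVT.IsScalable) {
      if (IsBigEndian) // Offsets would depend on the run-time vector length.
        return false;
      return OffsetInBytes == 0 &&
             StoreVT.MinSizeInBits == LoadVT.MinSizeInBits;
    }

    // Fixed width: the loaded bits must lie inside the stored bits.
    return OffsetInBytes >= 0 &&
           OffsetInBytes * 8 + LoadVT.MinSizeInBits <= StoreVT.MinSizeInBits;
  }

  int main() {
    MemVT NXV2I64 = {128, /*IsScalable=*/true}; // <vscale x 2 x i64>
    MemVT V2I64 = {128, /*IsScalable=*/false};  // <2 x i64>
    assert(canForwardStoreToLoad(NXV2I64, NXV2I64, 0, /*IsBigEndian=*/false));
    assert(!canForwardStoreToLoad(NXV2I64, V2I64, 0, /*IsBigEndian=*/false));
    assert(!canForwardStoreToLoad(NXV2I64, NXV2I64, 0, /*IsBigEndian=*/true));
    return 0;
  }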

Added: 
    llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f67a06f81c55..c36a085599bf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14792,8 +14792,8 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
-static inline int numVectorEltsOrZero(EVT T) {
-  return T.isVector() ? T.getVectorNumElements() : 0;
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
 }
 
 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
@@ -14861,6 +14861,24 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   EVT STMemType = ST->getMemoryVT();
   EVT STType = ST->getValue().getValueType();
 
+  // There are two cases to consider here:
+  //  1. The store is fixed width and the load is scalable. In this case we
+  //     don't know at compile time if the store completely envelops the load
+  //     so we abandon the optimisation.
+  //  2. The store is scalable and the load is fixed width. We could
+  //     potentially support a limited number of cases here, but there has been
+  //     no cost-benefit analysis to prove it's worth it.
+  bool LdStScalable = LDMemType.isScalableVector();
+  if (LdStScalable != STMemType.isScalableVector())
+    return SDValue();
+
+  // If we are dealing with scalable vectors on a big endian platform the
+  // calculation of offsets below becomes trickier, since we do not know at
+  // compile time the absolute size of the vector. Until we've done more
+  // analysis on big-endian platforms it seems better to bail out for now.
+  if (LdStScalable && DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
@@ -14872,13 +14890,21 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // the stored value). With Offset=n (for n > 0) the loaded value starts at the
   // n:th least significant byte of the stored value.
   if (DAG.getDataLayout().isBigEndian())
-    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
-              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
+    Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
+              (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
+                 8 -
+             Offset;
 
   // Check that the stored value covers all bits that are loaded.
-  bool STCoversLD =
-      (Offset >= 0) &&
-      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  bool STCoversLD;
+
+  TypeSize LdMemSize = LDMemType.getSizeInBits();
+  TypeSize StMemSize = STMemType.getSizeInBits();
+  if (LdStScalable)
+    STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+  else
+    STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
+                                   StMemSize.getFixedSize());
 
   auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
     if (LD->isIndexed()) {
@@ -14899,15 +14925,15 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
-    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+    if (LDType.getSizeInBits() == LdMemSize)
       return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
       // Mask to size of LDMemType
       auto Mask =
-          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
-                                               STMemType.getSizeInBits()),
+          DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+                                               StMemSize.getFixedSize()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
       return ReplaceLd(LD, Val, Chain);

diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
new file mode 100644
index 000000000000..b38f7da73eac
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define <vscale x 2 x i64> @sti64ldi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  %0 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @stf64ldf64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldf64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  %0 = load <vscale x 2 x double>, <vscale x 2 x double>* %arrayidx1
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x i64> @sti32ldi32ext(<vscale x 2 x i32>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti32ldi32ext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z1.d, p0/m, z0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
+  store <vscale x 2 x i32> %0, <vscale x 2 x i32>* %P
+  %1 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %P
+  %2 = sext <vscale x 2 x i32> %1 to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %2
+}
+
+define <2 x i64> @sti64ldfixedi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldfixedi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = bitcast <vscale x 2 x i64>* %arrayidx0 to <2 x i64>*
+  %0 = load <2 x i64>, <2 x i64>* %arrayidx1
+  ret <2 x i64> %0
+}
+
+define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %0, i64 1
+  %1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %arrayidx1
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 2 x i64> @stf64ldi64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x double>* %P to <vscale x 2 x i64>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 1
+  %1 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %1
+}

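The big-endian offset adjustment in the DAGCombiner hunk above now only runs
for fixed-width types (the scalable big-endian case bails out earlier), so its
getFixedSize() calls only ever see fixed sizes. A small standalone sketch of
that arithmetic, with illustrative names rather than LLVM code:

  #include <cassert>
  #include <cstdint>

  // The combine wants the offset measured from the least significant byte of
  // the stored value. BaseIndexOffset yields a byte offset in memory; on a
  // big-endian target memory offset 0 is the most significant byte, so the
  // offset has to be flipped.
  static int64_t offsetFromLSB(uint64_t StoreSizeInBits,
                               uint64_t LoadSizeInBits,
                               int64_t MemoryOffsetBytes) {
    return ((int64_t)StoreSizeInBits - (int64_t)LoadSizeInBits) / 8 -
           MemoryOffsetBytes;
  }

  int main() {
    // A 16-byte store followed by an 8-byte load 8 bytes in: on a big-endian
    // target those are the least significant 8 bytes of the stored value
    // (offset 0); a load at memory offset 0 reads the most significant half.
    assert(offsetFromLSB(128, 64, /*MemoryOffsetBytes=*/8) == 0);
    assert(offsetFromLSB(128, 64, /*MemoryOffsetBytes=*/0) == 8);
    return 0;
  }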
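
The scalar truncate/extension path at the end of the first hunk (behaviour
unchanged here, only its size queries moved to fixed-size accessors) models
forwarding with an AND mask. A tiny standalone example of the equivalence it
relies on, using illustrative values:

  #include <cassert>
  #include <cstdint>

  int main() {
    // A truncating store of an i32 as i16 followed by a zero- or any-extending
    // i16 load of the same location can be folded to masking the stored value
    // down to the low 16 bits, with no memory access at all.
    uint32_t Stored = 0xDEADBEEF;
    uint16_t InMemory = (uint16_t)Stored; // truncating store
    uint32_t Loaded = (uint32_t)InMemory; // zero-extending load
    assert(Loaded == (Stored & 0xFFFF));  // equivalent AND-mask forwarding
    return 0;
  }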