[llvm] r248622 - merge vector stores into wider vector stores and fix AArch64 misaligned access TLI hook (PR21711)

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 25 14:49:48 PDT 2015


Author: spatel
Date: Fri Sep 25 16:49:48 2015
New Revision: 248622

URL: http://llvm.org/viewvc/llvm-project?rev=248622&view=rev
Log:
merge vector stores into wider vector stores and fix AArch64 misaligned access TLI hook (PR21711)

This is a redo of D7208 ( r227242 - http://llvm.org/viewvc/llvm-project?view=revision&revision=227242 ).

The patch was reverted because an AArch64 target could enter an infinite loop after the DAGCombiner
change to merge vector stores. That happened because AArch64's allowsMisalignedMemoryAccesses() wasn't
telling the truth: it reported all unaligned memory accesses as fast, but the target then split some
128-bit unaligned accesses back up in performSTORECombine() because they are actually slow.
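
To make the failure concrete: the new AArch64 test added below (merge-store.ll) captures the pattern.
Under the old hook, the DAGCombiner and the AArch64 store combine would undo each other on IR like the
following (copied from that test, with explanatory comments added here):

define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
  %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
  %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  ; With the hook claiming all unaligned accesses are fast, the DAGCombiner merged these two
  ; 8-byte stores into one 16-byte store with 8-byte alignment; performSTORECombine() then
  ; split that store back into two 8-byte stores on Cyclone, and the combines looped forever.
  store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
  store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
  ret void
}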

This patch attempts to fix the problem in AArch64's allowsMisalignedMemoryAccesses() while preserving
the existing (perhaps questionable) lowering behavior.
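
As an illustration of the intended behavior of the fixed hook on Cyclone (this is not a test from this
commit; the function below is hypothetical and only mirrors the conditions added in the
AArch64ISelLowering.cpp change further down):

define void @cyclone_unaligned_16b(<4 x float> %v, <4 x float>* %p, <2 x i64> %w, <2 x i64>* %q) {
  ; 16-byte store with 8-byte alignment: now reported as slow on Cyclone, so the
  ; DAGCombiner will not create it and performSTORECombine() may still split it.
  store <4 x float> %v, <4 x float>* %p, align 8
  ; Alignment underspecified as 1 or 2 (the clang vector-extension idiom): reported as fast.
  store <4 x float> %v, <4 x float>* %p, align 2
  ; v2i64 stores are reported as fast regardless, so memcpy lowering is unaffected.
  store <2 x i64> %w, <2 x i64>* %q, align 8
  ret void
}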

The x86 test shows that store merging is working as intended for a target with fast 32-byte unaligned
stores.
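
The x86 test body itself is not part of this diff. For context, a sketch of the kind of input involved,
reconstructed from the CHECK lines (the exact signature and operands are assumptions, not a copy of the
test), looks like:

define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
  %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; On a target with fast 32-byte unaligned stores, the four 16-byte stores below are now
  ; merged into two unaligned 32-byte (ymm) stores, matching the updated CHECK lines.
  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
  ret void
}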

Differential Revision: http://reviews.llvm.org/D12635
 

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/trunk/test/CodeGen/AArch64/merge-store.ll
    llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=248622&r1=248621&r2=248622&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Fri Sep 25 16:49:48 2015
@@ -11072,8 +11072,7 @@ bool DAGCombiner::MergeConsecutiveStores
   if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
     return false;
 
-  // Don't merge vectors into wider inputs.
-  if (MemVT.isVector() || !MemVT.isSimple())
+  if (!MemVT.isSimple())
     return false;
 
   // Perform an early exit check. Do not bother looking at stored values that
@@ -11082,9 +11081,16 @@ bool DAGCombiner::MergeConsecutiveStores
   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
   bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
                        isa<ConstantFPSDNode>(StoredVal);
-  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+  bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+                          StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
 
-  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
+  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
+    return false;
+
+  // Don't merge vectors into wider vectors if the source data comes from loads.
+  // TODO: This restriction can be lifted by using logic similar to the
+  // ExtractVecSrc case.
+  if (MemVT.isVector() && IsLoadSrc)
     return false;
 
   // Only look at ends of store sequences.
@@ -11217,29 +11223,36 @@ bool DAGCombiner::MergeConsecutiveStores
 
   // When extracting multiple vector elements, try to store them
   // in one vector store rather than a sequence of scalar stores.
-  if (IsExtractVecEltSrc) {
-    unsigned NumElem = 0;
+  if (IsExtractVecSrc) {
+    unsigned NumStoresToMerge = 0;
+    bool IsVec = MemVT.isVector();
     for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
       StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      SDValue StoredVal = St->getValue();
+      unsigned StoreValOpcode = St->getValue().getOpcode();
       // This restriction could be loosened.
       // Bail out if any stored values are not elements extracted from a vector.
       // It should be possible to handle mixed sources, but load sources need
       // more careful handling (see the block of code below that handles
       // consecutive loads).
-      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
+          StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
         return false;
 
       // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
+      unsigned Elts = i + 1;
+      if (IsVec) {
+        // When merging vector stores, get the total number of elements.
+        Elts *= MemVT.getVectorNumElements();
+      }
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
       bool IsFast;
       if (TLI.isTypeLegal(Ty) &&
           TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
                                  FirstStoreAlign, &IsFast) && IsFast)
-        NumElem = i + 1;
+        NumStoresToMerge = i + 1;
     }
 
-    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge,
                                            false, true);
   }
 

Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=248622&r1=248621&r2=248622&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Fri Sep 25 16:49:48 2015
@@ -796,9 +796,25 @@ bool AArch64TargetLowering::allowsMisali
                                                            bool *Fast) const {
   if (Subtarget->requiresStrictAlign())
     return false;
-  // FIXME: True for Cyclone, but not necessary others.
-  if (Fast)
-    *Fast = true;
+
+  // FIXME: This is mostly true for Cyclone, but not necessarily others.
+  if (Fast) {
+    // FIXME: Define an attribute for slow unaligned accesses instead of
+    // relying on the CPU type as a proxy.
+    // On Cyclone, unaligned 128-bit stores are slow.
+    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+            // See comments in performSTORECombine() for more details about
+            // these conditions.
+
+            // Code that uses clang vector extensions can mark that it
+            // wants unaligned accesses to be treated as fast by
+            // underspecifying alignment to be 1 or 2.
+            Align <= 2 ||
+
+            // Disregard v2i64. Memcpy lowering produces those and splitting
+            // them regresses performance on micro-benchmarks and olden/bh.
+            VT == MVT::v2i64;
+  }
   return true;
 }
 
@@ -8435,6 +8451,10 @@ static SDValue performSTORECombine(SDNod
   if (S->isVolatile())
     return SDValue();
 
+  // FIXME: The logic for deciding if an unaligned store should be split should
+  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+  // a call to that function here.
+
   // Cyclone has bad performance on unaligned 16B stores when crossing line and
   // page boundaries. We want to split such stores.
   if (!Subtarget->isCyclone())

Modified: llvm/trunk/test/CodeGen/AArch64/merge-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/merge-store.ll?rev=248622&r1=248621&r2=248622&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/merge-store.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/merge-store.ll Fri Sep 25 16:49:48 2015
@@ -1,4 +1,5 @@
 ; RUN: llc -march aarch64 %s -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown  -mcpu=cyclone  | FileCheck %s --check-prefix=CYCLONE
 
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
@@ -18,3 +19,32 @@ define void @blam() {
   store float %tmp9, float* %tmp7
   ret void;
 }
+
+
+; PR21711 - Merge vector stores into wider vector stores.
+
+; On Cyclone, the stores should not get merged into a 16-byte store because
+; unaligned 16-byte stores are slow. This test would infinite loop when
+; the fastness of unaligned accesses was not specified correctly.
+
+define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
+  %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
+  %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
+
+  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+
+  store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
+  store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
+  ret void
+
+; CHECK-LABEL:    merge_vec_extract_stores
+; CHECK:          stur   q0, [x0, #24]
+; CHECK-NEXT:     ret
+
+; CYCLONE-LABEL:    merge_vec_extract_stores
+; CYCLONE:          ext   v1.16b, v0.16b, v0.16b, #8
+; CYCLONE-NEXT:     str   d0, [x0, #24]
+; CYCLONE-NEXT:     str   d1, [x0, #32]
+; CYCLONE-NEXT:     ret
+}

Modified: llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll?rev=248622&r1=248621&r2=248622&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll (original)
+++ llvm/trunk/test/CodeGen/X86/MergeConsecutiveStores.ll Fri Sep 25 16:49:48 2015
@@ -478,10 +478,8 @@ define void @merge_vec_extract_stores(<8
   ret void
 
 ; CHECK-LABEL: merge_vec_extract_stores
-; CHECK:      vmovaps %xmm0, 48(%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm0, 64(%rdi)
-; CHECK-NEXT: vmovaps %xmm1, 80(%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm1, 96(%rdi)
+; CHECK:      vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 }



