[llvm] r191525 - [SystemZ] Rein back the use of block operations

Richard Sandiford rsandifo at linux.vnet.ibm.com
Fri Sep 27 08:29:20 PDT 2013


Author: rsandifo
Date: Fri Sep 27 10:29:20 2013
New Revision: 191525

URL: http://llvm.org/viewvc/llvm-project?rev=191525&view=rev
Log:
[SystemZ] Rein back the use of block operations

The backend tries to use block operations like MVC, NC, OC and XC for
simple scalar operations.  For correctness reasons, it rejects any case
in which the regions might partially overlap.  However, for performance
reasons, it should also reject cases where the regions might be equal,
since the instruction might then not use the fast path.

This fixes a performance regression seen in bzip2.  We may want to limit
the optimisation even more in future, or even remove it entirely, but I'll
try this for now.
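
As a rough illustration of the correctness half of this rule (a sketch,
not the backend code), the C++ below models an MVC-style operation as a
strict left-to-right bytewise copy.  With partially overlapping regions
the copy re-reads bytes it has already written, so the result is not a
plain 8-byte move; equal regions would be copied correctly, but as noted
above they may miss the hardware fast path, which is why the patch now
rejects them as well.  All names below are invented for the example.

#include <cstddef>
#include <cstdint>
#include <iostream>

// Hypothetical model of an MVC-style copy: logically strict byte-by-byte,
// left to right, which is what makes partial overlap unsafe to fold.
static void bytewiseCopy(uint8_t *Dst, const uint8_t *Src, size_t Len) {
  for (size_t I = 0; I < Len; ++I)
    Dst[I] = Src[I];
}

int main() {
  uint8_t Buf[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};

  // Partial overlap: copy Buf[0..7] onto Buf[1..8].  Each iteration reads
  // a byte that an earlier iteration may already have overwritten.
  bytewiseCopy(Buf + 1, Buf, 8);

  for (uint8_t B : Buf)
    std::cout << unsigned(B) << ' ';  // prints 1 1 1 1 1 1 1 1 1, not the
  std::cout << '\n';                  // 1 1 2 3 4 5 6 7 8 that a real
  return 0;                           // 8-byte load/store pair would give
}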

Modified:
    llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
    llvm/trunk/lib/Target/SystemZ/SystemZInstrInfo.cpp
    llvm/trunk/test/CodeGen/SystemZ/and-08.ll
    llvm/trunk/test/CodeGen/SystemZ/fp-move-02.ll
    llvm/trunk/test/CodeGen/SystemZ/memcpy-02.ll

Modified: llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp?rev=191525&r1=191524&r2=191525&view=diff
==============================================================================
--- llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp Fri Sep 27 10:29:20 2013
@@ -290,6 +290,17 @@ class SystemZDAGToDAGISel : public Selec
   SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
                               uint64_t UpperVal, uint64_t LowerVal);
 
+  // Return true if Load and Store are loads and stores of the same size
+  // and are guaranteed not to overlap.  Such operations can be implemented
+  // using block (SS-format) instructions.
+  //
+  // Partial overlap would lead to incorrect code, since the block operations
+  // are logically bytewise, even though they have a fast path for the
+  // non-overlapping case.  We also need to avoid full overlap (i.e. two
+  // addresses that might be equal at run time) because although that case
+  // would be handled correctly, it might be implemented by millicode.
+  bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
   // N is a (store (load Y), X) pattern.  Return true if it can use an MVC
   // from Y to X.
   bool storeLoadCanUseMVC(SDNode *N) const;
@@ -938,13 +949,8 @@ SDNode *SystemZDAGToDAGISel::splitLargeI
   return Or.getNode();
 }
 
-// Return true if Load and Store:
-// - are loads and stores of the same size;
-// - do not partially overlap; and
-// - can be decomposed into what are logically individual character accesses
-//   without changing the semantics.
-static bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load,
-                                 AliasAnalysis *AA) {
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+                                               LoadSDNode *Load) const {
   // Check that the two memory operands have the same size.
   if (Load->getMemoryVT() != Store->getMemoryVT())
     return false;
@@ -957,19 +963,19 @@ static bool canUseBlockOperation(StoreSD
   if (Load->isInvariant())
     return true;
 
-  // If both operands are aligned, they must be equal or not overlap.
-  uint64_t Size = Load->getMemoryVT().getStoreSize();
-  if (Load->getAlignment() >= Size && Store->getAlignment() >= Size)
-    return true;
-
   // Otherwise we need to check whether there's an alias.
   const Value *V1 = Load->getSrcValue();
   const Value *V2 = Store->getSrcValue();
   if (!V1 || !V2)
     return false;
 
+  // Reject equality.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
   int64_t End1 = Load->getSrcValueOffset() + Size;
   int64_t End2 = Store->getSrcValueOffset() + Size;
+  if (V1 == V2 && End1 == End2)
+    return false;
+
   return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getTBAAInfo()),
                     AliasAnalysis::Location(V2, End2, Store->getTBAAInfo()));
 }
@@ -990,7 +996,7 @@ bool SystemZDAGToDAGISel::storeLoadCanUs
       return false;
   }
 
-  return canUseBlockOperation(Store, Load, AA);
+  return canUseBlockOperation(Store, Load);
 }
 
 bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
@@ -998,11 +1004,7 @@ bool SystemZDAGToDAGISel::storeLoadCanUs
   StoreSDNode *StoreA = cast<StoreSDNode>(N);
   LoadSDNode *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
   LoadSDNode *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
-  if (LoadA->isVolatile() ||
-      LoadA->getMemoryVT() != StoreA->getMemoryVT() ||
-      LoadA->getBasePtr() != StoreA->getBasePtr())
-    return false;
-  return canUseBlockOperation(StoreA, LoadB, AA);
+  return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
 }
 
 SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
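
To see the shape of the new check outside of the SelectionDAG types, here
is a self-contained sketch (hypothetical names and a simplified alias
query, not the LLVM API) of the policy canUseBlockOperation now follows:
the two accesses must be the same size and non-volatile, exact equality of
the addresses is rejected for performance, and any other possible overlap
is rejected for correctness.

#include <cstdint>
#include <iostream>

// Hypothetical description of one memory access, for this sketch only.
struct Access {
  const char *Base;  // stand-in for the underlying IR value
  int64_t Offset;    // byte offset from Base
  uint64_t Size;     // access size in bytes
  bool Volatile;     // volatile accesses are never turned into block ops
};

// Conservative stand-in for an alias-analysis query: accesses on the same
// base with disjoint [Offset, Offset + Size) ranges do not alias (the
// "offset disambiguation" case in the tests); anything else might.
static bool mayAlias(const Access &A, const Access &B) {
  if (A.Base != B.Base)
    return true;
  return A.Offset < B.Offset + int64_t(B.Size) &&
         B.Offset < A.Offset + int64_t(A.Size);
}

// Mirrors the intent of the patched check: reject equal addresses (correct
// but possibly slow, e.g. millicode) and any other possible overlap
// (incorrect for a logically bytewise operation).
static bool canUseBlockOp(const Access &Load, const Access &Store) {
  if (Load.Volatile || Store.Volatile || Load.Size != Store.Size)
    return false;
  if (Load.Base == Store.Base && Load.Offset == Store.Offset)
    return false;
  return !mayAlias(Load, Store);
}

int main() {
  char Buf[16] = {};
  Access A{Buf, 0, 8, false}, B{Buf, 8, 8, false}, C{Buf, 0, 8, false};
  std::cout << canUseBlockOp(A, B) << '\n';  // 1: provably disjoint
  std::cout << canUseBlockOp(A, C) << '\n';  // 0: addresses are equal
  return 0;
}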

Modified: llvm/trunk/lib/Target/SystemZ/SystemZInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/SystemZ/SystemZInstrInfo.cpp?rev=191525&r1=191524&r2=191525&view=diff
==============================================================================
--- llvm/trunk/lib/Target/SystemZ/SystemZInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/SystemZ/SystemZInstrInfo.cpp Fri Sep 27 10:29:20 2013
@@ -674,10 +674,14 @@ SystemZInstrInfo::foldMemoryOperandImpl(
   //
   // Although MVC is in practice a fast choice in these cases, it is still
   // logically a bytewise copy.  This means that we cannot use it if the
-  // load or store is volatile.  It also means that the transformation is
-  // not valid in cases where the two memories partially overlap; however,
-  // that is not a problem here, because we know that one of the memories
-  // is a full frame index.
+  // load or store is volatile.  We also wouldn't be able to use MVC if
+  // the two memories partially overlap, but that case cannot occur here,
+  // because we know that one of the memories is a full frame index.
+  //
+  // For performance reasons, we also want to avoid using MVC if the addresses
+  // might be equal.  We don't worry about that case here, because spill slot
+  // coloring happens later, and because we have special code to remove
+  // MVCs that turn out to be redundant.
   if (OpNum == 0 && MI->hasOneMemOperand()) {
     MachineMemOperand *MMO = *MI->memoperands_begin();
     if (MMO->getSize() == Size && !MMO->isVolatile()) {

Modified: llvm/trunk/test/CodeGen/SystemZ/and-08.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/SystemZ/and-08.ll?rev=191525&r1=191524&r2=191525&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/SystemZ/and-08.ll (original)
+++ llvm/trunk/test/CodeGen/SystemZ/and-08.ll Fri Sep 27 10:29:20 2013
@@ -2,8 +2,10 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-@g1 = global i8 1
-@g2 = global i16 2
+@g1src = global i8 1
+@g1dst = global i8 1
+@g2src = global i16 2
+@g2dst = global i16 2
 
 ; Test the simple i8 case.
 define void @f1(i8 *%ptr1) {
@@ -239,11 +241,12 @@ define void @f16(i64 *%ptr1) {
   ret void
 }
 
-; Test that NC is used for aligned loads and stores, even if there is
-; no way of telling whether they alias.
+; Test that NC is not used for aligned loads and stores if there is
+; no way of telling whether they alias.  We don't want to use NC in
+; cases where the addresses could be equal.
 define void @f17(i64 *%ptr1, i64 *%ptr2) {
 ; CHECK-LABEL: f17:
-; CHECK: nc 0(8,%r3), 0(%r2)
+; CHECK-NOT: nc
 ; CHECK: br %r14
   %val = load i64 *%ptr1
   %old = load i64 *%ptr2
@@ -306,58 +309,34 @@ define void @f21(i64 %base) {
 ; Test that we can use NC for global addresses for i8.
 define void @f22(i8 *%ptr) {
 ; CHECK-LABEL: f22:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: nc 0(1,%r2), 0([[REG]])
-; CHECK: br %r14
-  %val = load i8 *@g1
-  %old = load i8 *%ptr
-  %and = and i8 %val, %old
-  store i8 %and, i8 *%ptr
-  ret void
-}
-
-; ...and again with the global on the store.
-define void @f23(i8 *%ptr) {
-; CHECK-LABEL: f23:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: nc 0(1,[[REG]]), 0(%r2)
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g1src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g1dst
+; CHECK: nc 0(1,[[DST]]), 0([[SRC]])
 ; CHECK: br %r14
-  %val = load i8 *%ptr
-  %old = load i8 *@g1
+  %val = load i8 *@g1src
+  %old = load i8 *@g1dst
   %and = and i8 %val, %old
-  store i8 %and, i8 *@g1
+  store i8 %and, i8 *@g1dst
   ret void
 }
 
 ; Test that we use NC even where LHRL and STHRL are available.
-define void @f24(i16 *%ptr) {
-; CHECK-LABEL: f24:
-; CHECK: larl [[REG:%r[0-5]]], g2
-; CHECK: nc 0(2,%r2), 0([[REG]])
-; CHECK: br %r14
-  %val = load i16 *@g2
-  %old = load i16 *%ptr
-  %and = and i16 %val, %old
-  store i16 %and, i16 *%ptr
-  ret void
-}
-
-; ...likewise on the other side.
-define void @f25(i16 *%ptr) {
-; CHECK-LABEL: f25:
-; CHECK: larl [[REG:%r[0-5]]], g2
-; CHECK: nc 0(2,[[REG]]), 0(%r2)
+define void @f23(i16 *%ptr) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g2src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g2dst
+; CHECK: nc 0(2,[[DST]]), 0([[SRC]])
 ; CHECK: br %r14
-  %val = load i16 *%ptr
-  %old = load i16 *@g2
+  %val = load i16 *@g2src
+  %old = load i16 *@g2dst
   %and = and i16 %val, %old
-  store i16 %and, i16 *@g2
+  store i16 %and, i16 *@g2dst
   ret void
 }
 
 ; Test a case where offset disambiguation is enough.
-define void @f26(i64 *%ptr1) {
-; CHECK-LABEL: f26:
+define void @f24(i64 *%ptr1) {
+; CHECK-LABEL: f24:
 ; CHECK: nc 8(8,%r2), 0(%r2)
 ; CHECK: br %r14
   %ptr2 = getelementptr i64 *%ptr1, i64 1
@@ -369,8 +348,8 @@ define void @f26(i64 *%ptr1) {
 }
 
 ; Test a case where TBAA tells us there is no alias.
-define void @f27(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f27:
+define void @f25(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f25:
 ; CHECK: nc 0(8,%r3), 0(%r2)
 ; CHECK: br %r14
   %val = load i64 *%ptr1, align 2, !tbaa !1
@@ -381,8 +360,8 @@ define void @f27(i64 *%ptr1, i64 *%ptr2)
 }
 
 ; Test a case where TBAA information is present but doesn't help.
-define void @f28(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f28:
+define void @f26(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f26:
 ; CHECK-NOT: nc
 ; CHECK: br %r14
   %val = load i64 *%ptr1, align 2, !tbaa !1

Modified: llvm/trunk/test/CodeGen/SystemZ/fp-move-02.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/SystemZ/fp-move-02.ll?rev=191525&r1=191524&r2=191525&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/SystemZ/fp-move-02.ll (original)
+++ llvm/trunk/test/CodeGen/SystemZ/fp-move-02.ll Fri Sep 27 10:29:20 2013
@@ -63,11 +63,11 @@ define double @f5(i64 %a) {
 
 ; Test 128-bit moves from GPRs to FPRs.  i128 isn't a legitimate type,
 ; so this goes through memory.
-; FIXME: it would be better to use one MVC here.
 define void @f6(fp128 *%a, i128 *%b) {
 ; CHECK-LABEL: f6:
 ; CHECK: lg
-; CHECK: mvc
+; CHECK: lg
+; CHECK: stg
 ; CHECK: stg
 ; CHECK: br %r14
   %val = load i128 *%b

Modified: llvm/trunk/test/CodeGen/SystemZ/memcpy-02.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/SystemZ/memcpy-02.ll?rev=191525&r1=191524&r2=191525&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/SystemZ/memcpy-02.ll (original)
+++ llvm/trunk/test/CodeGen/SystemZ/memcpy-02.ll Fri Sep 27 10:29:20 2013
@@ -2,11 +2,14 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-@g1 = global i8 1
-@g2 = global i16 2
+@g1src = global i8 1
+@g1dst = global i8 1
+@g2src = global i16 2
+@g2dst = global i16 2
 @g3 = global i32 3
 @g4 = global i64 4
-@g5 = external global fp128, align 16
+@g5src = external global fp128, align 16
+@g5dst = external global fp128, align 16
 
 ; Test the simple i8 case.
 define void @f1(i8 *%ptr1) {
@@ -237,18 +240,19 @@ define void @f19(i64 *%ptr1) {
   ret void
 }
 
-; Test that MVC is used for aligned loads and stores, even if there is
-; no way of telling whether they alias.
+; Test that MVC is not used for aligned loads and stores if there is
+; no way of telling whether they alias.  We don't want to use MVC in
+; cases where the addresses could be equal.
 define void @f20(i64 *%ptr1, i64 *%ptr2) {
 ; CHECK-LABEL: f20:
-; CHECK: mvc 0(8,%r3), 0(%r2)
+; CHECK-NOT: mvc
 ; CHECK: br %r14
   %val = load i64 *%ptr1
   store i64 %val, i64 *%ptr2
   ret void
 }
 
-; ...but if the loads aren't aligned, we can't be sure.
+; ...and again for unaligned loads and stores.
 define void @f21(i64 *%ptr1, i64 *%ptr2) {
 ; CHECK-LABEL: f21:
 ; CHECK-NOT: mvc
@@ -274,50 +278,29 @@ define void @f22(i64 %base) {
 ; Test that we can use MVC for global addresses for i8.
 define void @f23(i8 *%ptr) {
 ; CHECK-LABEL: f23:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: mvc 0(1,%r2), 0([[REG]])
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g1src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g1dst
+; CHECK: mvc 0(1,[[DST]]), 0([[SRC]])
 ; CHECK: br %r14
-  %val = load i8 *@g1
-  store i8 %val, i8 *%ptr
+  %val = load i8 *@g1src
+  store i8 %val, i8 *@g1dst
   ret void
 }
 
-; ...and again with the global on the store.
-define void @f24(i8 *%ptr) {
+; Test that we use LHRL and STHRL for i16.
+define void @f24(i16 *%ptr) {
 ; CHECK-LABEL: f24:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: mvc 0(1,[[REG]]), 0(%r2)
-; CHECK: br %r14
-  %val = load i8 *%ptr
-  store i8 %val, i8 *@g1
-  ret void
-}
-
-; Test that we use LHRL for i16.
-define void @f25(i16 *%ptr) {
-; CHECK-LABEL: f25:
-; CHECK: lhrl [[REG:%r[0-5]]], g2
-; CHECK: sth [[REG]], 0(%r2)
+; CHECK: lhrl [[REG:%r[0-5]]], g2src
+; CHECK: sthrl [[REG]], g2dst
 ; CHECK: br %r14
-  %val = load i16 *@g2
-  store i16 %val, i16 *%ptr
-  ret void
-}
-
-; ...likewise STHRL.
-define void @f26(i16 *%ptr) {
-; CHECK-LABEL: f26:
-; CHECK: lh [[REG:%r[0-5]]], 0(%r2)
-; CHECK: sthrl [[REG]], g2
-; CHECK: br %r14
-  %val = load i16 *%ptr
-  store i16 %val, i16 *@g2
+  %val = load i16 *@g2src
+  store i16 %val, i16 *@g2dst
   ret void
 }
 
 ; Test that we use LRL for i32.
-define void @f27(i32 *%ptr) {
-; CHECK-LABEL: f27:
+define void @f25(i32 *%ptr) {
+; CHECK-LABEL: f25:
 ; CHECK: lrl [[REG:%r[0-5]]], g3
 ; CHECK: st [[REG]], 0(%r2)
 ; CHECK: br %r14
@@ -327,8 +310,8 @@ define void @f27(i32 *%ptr) {
 }
 
 ; ...likewise STRL.
-define void @f28(i32 *%ptr) {
-; CHECK-LABEL: f28:
+define void @f26(i32 *%ptr) {
+; CHECK-LABEL: f26:
 ; CHECK: l [[REG:%r[0-5]]], 0(%r2)
 ; CHECK: strl [[REG]], g3
 ; CHECK: br %r14
@@ -338,8 +321,8 @@ define void @f28(i32 *%ptr) {
 }
 
 ; Test that we use LGRL for i64.
-define void @f29(i64 *%ptr) {
-; CHECK-LABEL: f29:
+define void @f27(i64 *%ptr) {
+; CHECK-LABEL: f27:
 ; CHECK: lgrl [[REG:%r[0-5]]], g4
 ; CHECK: stg [[REG]], 0(%r2)
 ; CHECK: br %r14
@@ -349,8 +332,8 @@ define void @f29(i64 *%ptr) {
 }
 
 ; ...likewise STGRL.
-define void @f30(i64 *%ptr) {
-; CHECK-LABEL: f30:
+define void @f28(i64 *%ptr) {
+; CHECK-LABEL: f28:
 ; CHECK: lg [[REG:%r[0-5]]], 0(%r2)
 ; CHECK: stgrl [[REG]], g4
 ; CHECK: br %r14
@@ -360,30 +343,20 @@ define void @f30(i64 *%ptr) {
 }
 
 ; Test that we can use MVC for global addresses for fp128.
-define void @f31(fp128 *%ptr) {
-; CHECK-LABEL: f31:
-; CHECK: larl [[REG:%r[0-5]]], g5
-; CHECK: mvc 0(16,%r2), 0([[REG]])
-; CHECK: br %r14
-  %val = load fp128 *@g5, align 16
-  store fp128 %val, fp128 *%ptr, align 16
-  ret void
-}
-
-; ...and again with the global on the store.
-define void @f32(fp128 *%ptr) {
-; CHECK-LABEL: f32:
-; CHECK: larl [[REG:%r[0-5]]], g5
-; CHECK: mvc 0(16,[[REG]]), 0(%r2)
+define void @f29(fp128 *%ptr) {
+; CHECK-LABEL: f29:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g5src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g5dst
+; CHECK: mvc 0(16,[[DST]]), 0([[SRC]])
 ; CHECK: br %r14
-  %val = load fp128 *%ptr, align 16
-  store fp128 %val, fp128 *@g5, align 16
+  %val = load fp128 *@g5src, align 16
+  store fp128 %val, fp128 *@g5dst, align 16
   ret void
 }
 
 ; Test a case where offset disambiguation is enough.
-define void @f33(i64 *%ptr1) {
-; CHECK-LABEL: f33:
+define void @f30(i64 *%ptr1) {
+; CHECK-LABEL: f30:
 ; CHECK: mvc 8(8,%r2), 0(%r2)
 ; CHECK: br %r14
   %ptr2 = getelementptr i64 *%ptr1, i64 1
@@ -393,8 +366,8 @@ define void @f33(i64 *%ptr1) {
 }
 
 ; Test f21 in cases where TBAA tells us there is no alias.
-define void @f34(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f34:
+define void @f31(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f31:
 ; CHECK: mvc 0(8,%r3), 0(%r2)
 ; CHECK: br %r14
   %val = load i64 *%ptr1, align 2, !tbaa !1
@@ -403,8 +376,8 @@ define void @f34(i64 *%ptr1, i64 *%ptr2)
 }
 
 ; Test f21 in cases where TBAA is present but doesn't help.
-define void @f35(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f35:
+define void @f32(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f32:
 ; CHECK-NOT: mvc
 ; CHECK: br %r14
   %val = load i64 *%ptr1, align 2, !tbaa !1




