[llvm] r347747 - [MachineScheduler] Add support for clustering mem ops with FI base operands
Francis Visoiu Mistrih via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 28 04:00:28 PST 2018
Author: thegameg
Date: Wed Nov 28 04:00:28 2018
New Revision: 347747
URL: http://llvm.org/viewvc/llvm-project?rev=347747&view=rev
Log:
[MachineScheduler] Add support for clustering mem ops with FI base operands
Before this patch, the following stores in `merge_fail` would fail to be
merged, while they would get merged in `merge_ok`:
```
void use(unsigned long long *);

void merge_fail(unsigned key, unsigned index)
{
  unsigned long long args[8];

  args[0] = key;
  args[1] = index;
  use(args);
}

void merge_ok(unsigned long long *dst, unsigned a, unsigned b)
{
  dst[0] = a;
  dst[1] = b;
}
```
The reason is that `getMemOpBaseImmOfs` would return false for frame-index (FI)
base operands, so such memory operations were never considered for clustering.
This patch adds support for FI bases, both in the scheduler's clustering
mutation and in the AArch64 hooks it relies on.
Differential Revision: https://reviews.llvm.org/D54847
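For reference, a minimal sketch (illustrative names, not the committed code) of
what the clustering comparator below now does: memory operations are grouped by
base-operand kind first, then ordered by (base, offset, node number), so
frame-index bases sort and cluster the same way register bases do:

```cpp
// Hedged sketch of the ordering used by BaseMemOpClusterMutation after this
// patch; the struct and function names here are illustrative only.
#include <cstdint>
#include <tuple>

struct MemOpKey {
  bool IsFI;          // true if the base operand is a frame index
  int BaseRegOrIndex; // register number, or frame index when IsFI is true
  int64_t Offset;     // immediate offset from the base
  unsigned NodeNum;   // scheduling DAG node number, used as a tie-breaker
};

static bool memOpLess(const MemOpKey &A, const MemOpKey &B) {
  // Different base kinds (register vs. frame index) are never compared by
  // value; they are simply kept in separate groups.
  if (A.IsFI != B.IsFI)
    return A.IsFI < B.IsFI;
  return std::make_tuple(A.BaseRegOrIndex, A.Offset, A.NodeNum) <
         std::make_tuple(B.BaseRegOrIndex, B.Offset, B.NodeNum);
}
```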
Added:
llvm/trunk/test/CodeGen/AArch64/cluster-frame-index.mir
Modified:
llvm/trunk/lib/CodeGen/MachineScheduler.cpp
llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll
Modified: llvm/trunk/lib/CodeGen/MachineScheduler.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineScheduler.cpp?rev=347747&r1=347746&r2=347747&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineScheduler.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp Wed Nov 28 04:00:28 2018
@@ -1490,8 +1490,20 @@ class BaseMemOpClusterMutation : public
: SU(su), BaseOp(Op), Offset(ofs) {}
bool operator<(const MemOpInfo &RHS) const {
- return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) <
- std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, RHS.SU->NodeNum);
+ if (BaseOp->getType() != RHS.BaseOp->getType())
+ return BaseOp->getType() < RHS.BaseOp->getType();
+
+ if (BaseOp->isReg())
+ return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) <
+ std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset,
+ RHS.SU->NodeNum);
+ if (BaseOp->isFI())
+ return std::make_tuple(BaseOp->getIndex(), Offset, SU->NodeNum) <
+ std::make_tuple(RHS.BaseOp->getIndex(), RHS.Offset,
+ RHS.SU->NodeNum);
+
+ llvm_unreachable("MemOpClusterMutation only supports register or frame "
+ "index bases.");
}
};
Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp?rev=347747&r1=347746&r2=347747&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp Wed Nov 28 04:00:28 2018
@@ -1146,9 +1146,9 @@ bool AArch64InstrInfo::areMemAccessesTri
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
- // Retrieve the base register, offset from the base and width. Width
+ // Retrieve the base, offset from the base and width. Width
// is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
- // base registers are identical, and the offset of a lower memory access +
+ // base are identical, and the offset of a lower memory access +
// the width doesn't overlap the offset of a higher memory access,
// then the memory accesses are different.
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
@@ -2035,8 +2035,9 @@ bool AArch64InstrInfo::isCandidateToMerg
if (MI.hasOrderedMemoryRef())
return false;
- // Make sure this is a reg+imm (as opposed to an address reloc).
- assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+ // Make sure this is a reg/fi+imm (as opposed to an address reloc).
+ assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
+ "Expected a reg or frame index operand.");
if (!MI.getOperand(2).isImm())
return false;
@@ -2086,11 +2087,13 @@ bool AArch64InstrInfo::getMemOperandWith
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() == 3) {
// Non-paired instruction (e.g., ldr x1, [x0, #8]).
- if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
+ if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
+ !LdSt.getOperand(2).isImm())
return false;
} else if (LdSt.getNumExplicitOperands() == 4) {
// Paired instruction (e.g., ldp x1, x2, [x0, #8]).
- if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
+ if (!LdSt.getOperand(1).isReg() ||
+ (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
!LdSt.getOperand(3).isImm())
return false;
} else
@@ -2117,9 +2120,9 @@ bool AArch64InstrInfo::getMemOperandWith
Offset = LdSt.getOperand(3).getImm() * Scale;
}
- assert(
- BaseOp->isReg() &&
- "getMemOperandWithOffset only supports base operands of type register.");
+ assert((BaseOp->isReg() || BaseOp->isFI()) &&
+ "getMemOperandWithOffset only supports base "
+ "operands of type register or frame index.");
return true;
}
@@ -2275,31 +2278,33 @@ bool AArch64InstrInfo::getMemOpInfo(unsi
return true;
}
-// Scale the unscaled offsets. Returns false if the unscaled offset can't be
-// scaled.
-static bool scaleOffset(unsigned Opc, int64_t &Offset) {
- unsigned OffsetStride = 1;
+static unsigned getOffsetStride(unsigned Opc) {
switch (Opc) {
default:
- return false;
+ return 0;
case AArch64::LDURQi:
case AArch64::STURQi:
- OffsetStride = 16;
- break;
+ return 16;
case AArch64::LDURXi:
case AArch64::LDURDi:
case AArch64::STURXi:
case AArch64::STURDi:
- OffsetStride = 8;
- break;
+ return 8;
case AArch64::LDURWi:
case AArch64::LDURSi:
case AArch64::LDURSWi:
case AArch64::STURWi:
case AArch64::STURSi:
- OffsetStride = 4;
- break;
+ return 4;
}
+}
+
+// Scale the unscaled offsets. Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+ unsigned OffsetStride = getOffsetStride(Opc);
+ if (OffsetStride == 0)
+ return false;
// If the byte-offset isn't a multiple of the stride, we can't scale this
// offset.
if (Offset % OffsetStride != 0)
@@ -2311,6 +2316,19 @@ static bool scaleOffset(unsigned Opc, in
return true;
}
+// Unscale the scaled offsets. Returns false if the scaled offset can't be
+// unscaled.
+static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
+ unsigned OffsetStride = getOffsetStride(Opc);
+ if (OffsetStride == 0)
+ return false;
+
+ // Convert the "element" offset used by scaled pair load/store instructions
+ // into the byte-offset used by unscaled.
+ Offset *= OffsetStride;
+ return true;
+}
+
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
if (FirstOpc == SecondOpc)
return true;
@@ -2329,6 +2347,30 @@ static bool canPairLdStOpc(unsigned Firs
return false;
}
+static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
+ int64_t Offset1, unsigned Opcode1, int FI2,
+ int64_t Offset2, unsigned Opcode2) {
+ // Accesses through fixed stack object frame indices may access a different
+ // fixed stack slot. Check that the object offsets + offsets match.
+ if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
+ int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
+ int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
+ assert(ObjectOffset1 >= ObjectOffset2 && "Object offsets are not ordered.");
+ // Get the byte-offset from the object offset.
+ if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
+ return false;
+ ObjectOffset1 += Offset1;
+ ObjectOffset2 += Offset2;
+ // Get the "element" index in the object.
+ if (!scaleOffset(Opcode1, ObjectOffset1) ||
+ !scaleOffset(Opcode2, ObjectOffset2))
+ return false;
+ return ObjectOffset2 + 1 == ObjectOffset1;
+ }
+
+ return FI1 == FI2;
+}
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
@@ -2340,9 +2382,11 @@ bool AArch64InstrInfo::shouldClusterMemO
if (BaseOp1.getType() != BaseOp2.getType())
return false;
- assert(BaseOp1.isReg() && "Only base registers are supported.");
+ assert(BaseOp1.isReg() ||
+ BaseOp1.isFI() &&
+ "Only base registers and frame indices are supported.");
- // Check for base regs.
+ // Check for both base regs and base FI.
if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
return false;
@@ -2379,7 +2423,17 @@ bool AArch64InstrInfo::shouldClusterMemO
return false;
// The caller should already have ordered First/SecondLdSt by offset.
- assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+ // Note: except for non-equal frame index bases
+ assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
+ "Caller should have ordered offsets.");
+
+ if (BaseOp1.isFI()) {
+ const MachineFrameInfo &MFI =
+ FirstLdSt.getParent()->getParent()->getFrameInfo();
+ return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
+ BaseOp2.getIndex(), Offset2, SecondOpc);
+ }
+
return Offset1 + 1 == Offset2;
}
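To make the new fixed-object path above concrete, here is a rough worked
example (the helper and the numbers are hypothetical: a simplified restatement
of the shouldClusterFI logic, not the committed code). Two unscaled 8-byte
accesses (e.g. STURXi, stride 8 per getOffsetStride) through fixed stack
objects at byte offsets -8 and -16, each with an instruction offset of 0, land
in adjacent 8-byte slots and are therefore clusterable:

```cpp
// Simplified restatement of the fixed-object adjacency check; it omits the
// divisibility checks that scaleOffset/unscaleOffset perform in the real code.
#include <cassert>
#include <cstdint>

static bool adjacentFixedObjects(int64_t ObjectOffset1, int64_t ObjectOffset2,
                                 int64_t Offset1, int64_t Offset2,
                                 unsigned Stride) {
  assert(ObjectOffset1 >= ObjectOffset2 && "Object offsets are not ordered.");
  // Fold the per-instruction element offsets (converted to bytes) into the
  // object offsets, then scale the sums back down to element indices.
  int64_t Elem1 = (ObjectOffset1 + Offset1 * int64_t(Stride)) / Stride;
  int64_t Elem2 = (ObjectOffset2 + Offset2 * int64_t(Stride)) / Stride;
  return Elem2 + 1 == Elem1;
}

int main() {
  // Fixed objects at -8 and -16 bytes, both accessed at offset 0: element
  // indices -1 and -2 are adjacent, so the two accesses may be clustered.
  return adjacentFixedObjects(-8, -16, 0, 0, 8) ? 0 : 1;
}
```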
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll?rev=347747&r1=347746&r2=347747&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll Wed Nov 28 04:00:28 2018
@@ -259,9 +259,9 @@ define void @memset_12_stack() {
define void @memset_16_stack() {
; CHECK-LABEL: memset_16_stack:
; CHECK: mov x8, #-6148914691236517206
+; CHECK-NEXT: str x8, [sp, #-32]!
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp x8, x30, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
; CHECK-NEXT: bl something
%buf = alloca [16 x i8], align 1
%cast = bitcast [16 x i8]* %buf to i8*
Added: llvm/trunk/test/CodeGen/AArch64/cluster-frame-index.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/cluster-frame-index.mir?rev=347747&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/cluster-frame-index.mir (added)
+++ llvm/trunk/test/CodeGen/AArch64/cluster-frame-index.mir Wed Nov 28 04:00:28 2018
@@ -0,0 +1,27 @@
+#RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s
+...
+---
+name: merge_stack
+# CHECK-LABEL: name: merge_stack
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 64, alignment: 8 }
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ %0:gpr32 = COPY $w0
+ %1:gpr32 = COPY $w1
+ undef %3.sub_32:gpr64 = ORRWrs $wzr, %0, 0
+ STRXui %3, %stack.0, 0 :: (store 8)
+ undef %5.sub_32:gpr64 = ORRWrs $wzr, %1, 0
+ STRXui %5, %stack.0, 1 :: (store 8)
+ RET_ReallyLR
+
+ ; CHECK: COPY
+ ; CHECK-NEXT: COPY
+ ; CHECK-NEXT: ORRWrs
+ ; CHECK-NEXT: ORRWrs
+ ; CHECK-NEXT: STRXui
+ ; CHECK-NEXT: STRXui
+ ; CHECK-NEXT: RET