[llvm] r365014 - [ARM] Thumb2: favor R4-R7 over R12/LR in allocation order when opt for minsize

Wed Jul 3 02:58:52 PDT 2019

Author: ostannard
Date: Wed Jul  3 02:58:52 2019
New Revision: 365014

URL: http://llvm.org/viewvc/llvm-project?rev=365014&view=rev
Log:
[ARM] Thumb2: favor R4-R7 over R12/LR in allocation order when opt for minsize

For Thumb2, we prefer low regs (costPerUse = 0) to allow narrow
encoding. However, current allocation order is like:
  R0-R3, R12, LR, R4-R11

As a result, a lot of instructions that use R12/LR will be wide instructions.

This patch changes the allocation order to:
  R0-R7, R12, LR, R8-R11
for Thumb2 when optimizing for minimum size (minsize, i.e. -Oz).

In most cases, there are no extra push/pop instructions, as they will be
folded into existing ones. There might be a slight performance impact due to
more stack usage, so we only enable it when optimizing for minimum size.

https://reviews.llvm.org/D30324

Added:
    llvm/trunk/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
Modified:
    llvm/trunk/include/llvm/CodeGen/TargetSubtargetInfo.h
    llvm/trunk/lib/CodeGen/RegisterClassInfo.cpp
    llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td
    llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp
    llvm/trunk/lib/Target/ARM/ARMSubtarget.h
    llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll

Modified: llvm/trunk/include/llvm/CodeGen/TargetSubtargetInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetSubtargetInfo.h?rev=365014&r1=365013&r2=365014&view=diff
==============================================================================

--- llvm/trunk/include/llvm/CodeGen/TargetSubtargetInfo.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetSubtargetInfo.h Wed Jul  3 02:58:52 2019
@@ -291,6 +291,14 @@ public:
 
   /// This is called after a .mir file was loaded.
   virtual void mirFileLoaded(MachineFunction &MF) const;
+
+  /// True if the register allocator should use the allocation orders exactly as
+  /// written in the tablegen descriptions, false if it should allocate
+  /// the specified physical register later if it is callee-saved.
+  virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                           unsigned PhysReg) const {
+    return false;
+  }
 };
 
 } // end namespace llvm

Modified: llvm/trunk/lib/CodeGen/RegisterClassInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/RegisterClassInfo.cpp?rev=365014&r1=365013&r2=365014&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/RegisterClassInfo.cpp (original)
+++ llvm/trunk/lib/CodeGen/RegisterClassInfo.cpp Wed Jul  3 02:58:52 2019
@@ -90,6 +90,7 @@ void RegisterClassInfo::runOnMachineFunc
 void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
   assert(RC && "no register class given");
   RCInfo &RCI = RegClass[RC->getID()];
+  auto &STI = MF->getSubtarget();
 
   // Raw register count, including all reserved regs.
   unsigned NumRegs = RC->getNumRegs();
@@ -114,7 +115,8 @@ void RegisterClassInfo::compute(const Ta
     unsigned Cost = TRI->getCostPerUse(PhysReg);
     MinCost = std::min(MinCost, Cost);
 
-    if (CalleeSavedAliases[PhysReg])
+    if (CalleeSavedAliases[PhysReg] &&
+        !STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
       // PhysReg aliases a CSR, save it for later.
       CSRAlias.push_back(PhysReg);
     else {

Modified: llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td?rev=365014&r1=365013&r2=365014&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMRegisterInfo.td Wed Jul  3 02:58:52 2019
@@ -227,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32
   // know how to spill them. If we make our prologue/epilogue code smarter at
   // some point, we can go back to using the above allocation orders for the
   // Thumb1 instructions that know how to use hi regs.
-  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8),
+                   (add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
   let AltOrderSelect = [{
-      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r15]";
 }
@@ -238,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32
 // certain operand slots, particularly as the destination.  Primarily
 // useful for disassembly.
 def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
-  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
+                   (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
   let AltOrderSelect = [{
-      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r14]";
 }
@@ -295,9 +297,10 @@ def GPRlr : RegisterClass<"ARM", [i32],
 // or SP (R13 or R15) are used. The ARM ISA refers to these operands
 // via the BadReg() pseudo-code description.
 def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
-  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
+                   (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
   let AltOrderSelect = [{
-      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticType = "rGPR";
 }

Modified: llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp?rev=365014&r1=365013&r2=365014&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp Wed Jul  3 02:58:52 2019
@@ -413,3 +413,45 @@ bool ARMSubtarget::useFastISel() const {
          ((isTargetMachO() && !isThumb1Only()) ||
           (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
 }
+
+unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
+  // The GPR register class has multiple possible allocation orders, with
+  // tradeoffs preferred by different sub-architectures and optimisation goals.
+  // The allocation orders are:
+  // 0: (the default tablegen order, not used)
+  // 1: r14, r0-r13
+  // 2: r0-r7
+  // 3: r0-r7, r12, lr, r8-r11
+  // Note that the register allocator will change this order so that
+  // callee-saved registers are used later, as they require extra work in the
+  // prologue/epilogue (though we sometimes override that).
+
+  // For Thumb1-only targets, only the low registers are allocatable.
+  if (isThumb1Only())
+    return 2;
+
+  // Allocate low registers first, so we can select more 16-bit instructions.
+  // We also (in ignoreCSRForAllocationOrder) override the default behaviour
+  // with regards to callee-saved registers, because pushing extra registers is
+  // much cheaper (in terms of code size) than using high registers. After
+  // that, we allocate r12 (doesn't need to be saved), lr (saving it means we
+  // can return with the pop, don't need an extra "bx lr") and then the rest of
+  // the high registers.
+  if (isThumb2() && MF.getFunction().hasMinSize())
+    return 3;
+
+  // Otherwise, allocate in the default order, using LR first because saving it
+  // allows a shorter epilogue sequence.
+  return 1;
+}
+
+bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                               unsigned PhysReg) const {
+  // To minimize code size in Thumb2, we prefer the usage of low regs (lower
+  // cost per use) so we can use narrow encoding. By default, caller-saved
+  // registers (e.g. lr, r12) are always allocated first, regardless of
+  // their cost per use. With minsize, we prefer the low regs even if they
+  // are CSR because usually push/pop can be folded into existing ones.
+  return isThumb2() && MF.getFunction().hasMinSize() &&
+         ARM::GPRRegClass.contains(PhysReg);
+}

Modified: llvm/trunk/lib/Target/ARM/ARMSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSubtarget.h?rev=365014&r1=365013&r2=365014&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h Wed Jul  3 02:58:52 2019
@@ -856,6 +856,10 @@ public:
   unsigned getPrefLoopAlignment() const {
     return PrefLoopAlignment;
   }
+
+  bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                   unsigned PhysReg) const override;
+  unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm

Modified: llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll?rev=365014&r1=365013&r2=365014&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll Wed Jul  3 02:58:52 2019
@@ -60,7 +60,7 @@ entry:
 
 while.body:
 ; CHECK: while.body
-; CHECK: mul r{{[0-9]+}}
+; CHECK: muls r{{[0-9]+}}
 ; CHECK: muls
   %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
   %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]

Added: llvm/trunk/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/favor-low-reg-for-Osize.ll?rev=365014&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/favor-low-reg-for-Osize.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/favor-low-reg-for-Osize.ll Wed Jul  3 02:58:52 2019
@@ -0,0 +1,29 @@
+; REQUIRES: asserts
+; RUN: llc -debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK
+; RUN: FileCheck %s < %t --check-prefix=DEBUG
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
+target triple = "thumbv7m--linux-gnueabi"
+
+
+; DEBUG:         AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ]
+
+define i32 @test_minsize(i32 %x) optsize minsize {
+; CHECK-LABEL: test_minsize:
+entry:
+; CHECK: mov     r4, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov     r0, r4
+  ret i32 %x
+}
+
+; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ]
+
+define i32 @test_optsize(i32 %x) optsize {
+; CHECK-LABEL: test_optsize:
+entry:
+; CHECK: mov     r12, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov     r0, r12
+  ret i32 %x
+}