[llvm] r215729 - ARM: Fix and re-enable load/store optimizer for Thumb1.

Fri Aug 15 10:00:30 PDT 2014

Author: mroth
Date: Fri Aug 15 12:00:30 2014
New Revision: 215729

URL: http://llvm.org/viewvc/llvm-project?rev=215729&view=rev
Log:
ARM: Fix and re-enable load/store optimizer for Thumb1.

In a previous iteration of the pass, we would try to compensate for
writeback by updating later instructions and/or inserting a SUBS to
reset the base register if necessary.
Since such a SUBS sets the condition flags it's not generally safe to do
this. For now, only merge LDR/STRs if there is no writeback to the base
register (LDM that loads into the base register) or the base register is
killed by one of the merged instructions. These cases are clear wins
both in terms of instruction count and performance.

Also add three new test cases, and update the existing ones accordingly.

Added:
    llvm/trunk/test/CodeGen/Thumb/ldm-merge-call.ll
    llvm/trunk/test/CodeGen/Thumb/ldm-merge-struct.ll
    llvm/trunk/test/CodeGen/Thumb/stm-merge.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
    llvm/trunk/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
    llvm/trunk/test/CodeGen/Thumb/dyn-stackalloc.ll
    llvm/trunk/test/CodeGen/Thumb/thumb-ldm.ll
    llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll

Modified: llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp?rev=215729&r1=215728&r2=215729&view=diff
==============================================================================

--- llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp Fri Aug 15 12:00:30 2014
@@ -97,10 +97,6 @@ namespace {
     void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
                           const MemOpQueue &MemOps, unsigned DefReg,
                           unsigned RangeBegin, unsigned RangeEnd);
-    void UpdateBaseRegUses(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI,
-                           DebugLoc dl, unsigned Base, unsigned WordOffset,
-                           ARMCC::CondCodes Pred, unsigned PredReg);
     bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   int Offset, unsigned Base, bool BaseKill, int Opcode,
                   ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
@@ -311,101 +307,6 @@ static bool isi32Store(unsigned Opc) {
   return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
 }
 
-static unsigned getImmScale(unsigned Opc) {
-  switch (Opc) {
-  default: llvm_unreachable("Unhandled opcode!");
-  case ARM::tLDRi:
-  case ARM::tSTRi:
-    return 1;
-  case ARM::tLDRHi:
-  case ARM::tSTRHi:
-    return 2;
-  case ARM::tLDRBi:
-  case ARM::tSTRBi:
-    return 4;
-  }
-}
-
-/// Update future uses of the base register with the offset introduced
-/// due to writeback. This function only works on Thumb1.
-void
-ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   DebugLoc dl, unsigned Base,
-                                   unsigned WordOffset,
-                                   ARMCC::CondCodes Pred, unsigned PredReg) {
-  assert(isThumb1 && "Can only update base register uses for Thumb1!");
-
-  // Start updating any instructions with immediate offsets. Insert a sub before
-  // the first non-updateable instruction (if any).
-  for (; MBBI != MBB.end(); ++MBBI) {
-    if (MBBI->readsRegister(Base)) {
-      unsigned Opc = MBBI->getOpcode();
-      int Offset;
-      bool InsertSub = false;
-
-      if (Opc == ARM::tLDRi  || Opc == ARM::tSTRi  ||
-          Opc == ARM::tLDRHi || Opc == ARM::tSTRHi ||
-          Opc == ARM::tLDRBi || Opc == ARM::tSTRBi) {
-        // Loads and stores with immediate offsets can be updated, but only if
-        // the new offset isn't negative.
-        // The MachineOperand containing the offset immediate is the last one
-        // before predicates.
-        MachineOperand &MO =
-          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
-        // The offsets are scaled by 1, 2 or 4 depending on the Opcode
-        Offset = MO.getImm() - WordOffset * getImmScale(Opc);
-        if (Offset >= 0)
-          MO.setImm(Offset);
-        else
-          InsertSub = true;
-
-      } else if (Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) {
-        // SUB/ADD using this register. Merge it with the update.
-        // If the merged offset is too large, insert a new sub instead.
-        MachineOperand &MO =
-          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
-        Offset = (Opc == ARM::tSUBi8) ?
-          MO.getImm() + WordOffset * 4 :
-          MO.getImm() - WordOffset * 4 ;
-        if (TL->isLegalAddImmediate(Offset)) {
-          MO.setImm(Offset);
-          // The base register has now been reset, so exit early.
-          return;
-        } else {
-          InsertSub = true;
-        }
-
-      } else {
-        // Can't update the instruction.
-        InsertSub = true;
-      }
-
-      if (InsertSub) {
-        // An instruction above couldn't be updated, so insert a sub.
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base))
-          .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
-          .addImm(Pred).addReg(PredReg);
-        return;
-      }
-    }
-
-    if (MBBI->killsRegister(Base))
-      // Register got killed. Stop updating.
-      return;
-  }
-
-  // The end of the block was reached. This means register liveness escapes the
-  // block, and it's necessary to insert a sub before the last instruction.
-  if (MBB.succ_size() > 0)
-    // But only insert the SUB if there is actually a successor block.
-    // FIXME: Check more carefully if register is live at this point, e.g. by
-    // also examining the successor block's register liveness information.
-    AddDefaultT1CC(BuildMI(MBB, --MBBI, dl, TII->get(ARM::tSUBi8), Base))
-      .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
-      .addImm(Pred).addReg(PredReg);
-}
-
 /// MergeOps - Create and insert a LDM or STM with Base as base register and
 /// registers in Regs as the register operands that would be loaded / stored.
 /// It returns true if the transformation is done.
@@ -512,6 +413,14 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
         break;
       }
 
+  // If the merged instruction has writeback and the base register is not killed
+  // it's not safe to do the merge on Thumb1. This is because resetting the base
+  // register writeback by inserting a SUBS sets the condition flags.
+  // FIXME: Try something clever here to see if resetting the base register can
+  // be avoided, e.g. by updating a later ADD/SUB of the base register with the
+  // writeback.
+  if (isThumb1 && Writeback && !BaseKill) return false;
+
   MachineInstrBuilder MIB;
 
   if (Writeback) {
@@ -524,12 +433,6 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBl
     // Thumb1: we might need to set base writeback when building the MI.
     MIB.addReg(Base, getDefRegState(true))
        .addReg(Base, getKillRegState(BaseKill));
-
-    // The base isn't dead after a merged instruction with writeback. Update
-    // future uses of the base with the added offset (if possible), or reset
-    // the base register as necessary.
-    if (!BaseKill)
-      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
   } else {
     // No writeback, simply build the MachineInstr.
     MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
@@ -1740,12 +1643,6 @@ bool ARMLoadStoreOpt::runOnMachineFuncti
   isThumb2 = AFI->isThumb2Function();
   isThumb1 = AFI->isThumbFunction() && !isThumb2;
 
-  // FIXME: Temporarily disabling for Thumb-1 due to miscompiles
-  if (isThumb1) {
-    delete RS;
-    return false;
-  }
-
   bool Modified = false;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
        ++MFI) {

Modified: llvm/trunk/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll?rev=215729&r1=215728&r2=215729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll Fri Aug 15 12:00:30 2014
@@ -1,12 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
 
 define void @foo(i32* %A) #0 {
 entry:
 ; CHECK-LABEL: foo:
 ; CHECK: push {r7, lr}
-; CHECK: ldm [[REG0:r[0-9]]]!,
-; CHECK-NEXT: subs [[REG0]]
+; CHECK: ldr
+; CHECK-NEXT: ldr
 ; CHECK-NEXT: bl
   %0 = load i32* %A, align 4
   %arrayidx1 = getelementptr inbounds i32* %A, i32 1

Modified: llvm/trunk/test/CodeGen/Thumb/dyn-stackalloc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/dyn-stackalloc.ll?rev=215729&r1=215728&r2=215729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/dyn-stackalloc.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb/dyn-stackalloc.ll Fri Aug 15 12:00:30 2014
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
 
 	%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
 	%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }

Added: llvm/trunk/test/CodeGen/Thumb/ldm-merge-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/ldm-merge-call.ll?rev=215729&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/ldm-merge-call.ll (added)
+++ llvm/trunk/test/CodeGen/Thumb/ldm-merge-call.ll Fri Aug 15 12:00:30 2014
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+; Function Attrs: nounwind optsize
+define void @foo(i32* nocapture readonly %A) #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: ldm r[[BASE:[0-9]]]!,
+; CHECK-NEXT: mov r[[BASE]],
+  %0 = load i32* %A, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %call = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  %call2 = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  ret void
+}
+
+; Function Attrs: optsize
+declare i32 @bar(i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind optsize }

Added: llvm/trunk/test/CodeGen/Thumb/ldm-merge-struct.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/ldm-merge-struct.ll?rev=215729&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/ldm-merge-struct.ll (added)
+++ llvm/trunk/test/CodeGen/Thumb/ldm-merge-struct.ll Fri Aug 15 12:00:30 2014
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+%struct.S = type { i32, i32 }
+
+ at s = common global %struct.S zeroinitializer, align 4
+
+define i32 @f() {
+entry:
+; CHECK-LABEL: f:
+; CHECK: ldm r[[BASE:[0-9]]],
+; CHECK-NEXT-NOT: subs r[[BASE]]
+  %0 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 0), align 4
+  %1 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 1), align 4
+  %cmp = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  ret i32 %cond
+}

Added: llvm/trunk/test/CodeGen/Thumb/stm-merge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/stm-merge.ll?rev=215729&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/stm-merge.ll (added)
+++ llvm/trunk/test/CodeGen/Thumb/stm-merge.ll Fri Aug 15 12:00:30 2014
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+ at d = internal unnamed_addr global i32 0, align 4
+ at c = internal global i32* null, align 4
+ at e = internal unnamed_addr global i32* null, align 4
+
+; Function Attrs: nounwind optsize
+define void @fn1() #0 {
+entry:
+; CHECK-LABEL: fn1:
+; CHECK: stm r[[BASE:[0-9]]]!, {{.*}}
+; CHECK-NOT: {{.*}} r[[BASE]]
+; CHECK: ldr r[[BASE]], {{.*}}
+  %g = alloca i32, align 4
+  %h = alloca i32, align 4
+  store i32 1, i32* %g, align 4
+  store i32 0, i32* %h, align 4
+  %.pr = load i32* @d, align 4
+  %cmp11 = icmp slt i32 %.pr, 1
+  br i1 %cmp11, label %for.inc.lr.ph, label %for.body5
+
+for.inc.lr.ph:                                    ; preds = %entry
+  store i32 1, i32* @d, align 4
+  br label %for.body5
+
+for.body5:                                        ; preds = %entry, %for.inc.lr.ph, %for.body5
+  %f.010 = phi i32 [ %inc7, %for.body5 ], [ 0, %for.inc.lr.ph ], [ 0, %entry ]
+  store volatile i32* %g, i32** @c, align 4
+  %inc7 = add nsw i32 %f.010, 1
+  %exitcond = icmp eq i32 %inc7, 2
+  br i1 %exitcond, label %for.end8, label %for.body5
+
+for.end8:                                         ; preds = %for.body5
+  store i32* %h, i32** @e, align 4
+  ret void
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

Modified: llvm/trunk/test/CodeGen/Thumb/thumb-ldm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/thumb-ldm.ll?rev=215729&r1=215728&r2=215729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/thumb-ldm.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb/thumb-ldm.ll Fri Aug 15 12:00:30 2014
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
 
 @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
 

Modified: llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll?rev=215729&r1=215728&r2=215729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll Fri Aug 15 12:00:30 2014
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s
-; XFAIL: *
 
 @d = external global [64 x i32]
 @s = external global [64 x i32]
@@ -7,29 +6,19 @@
 ; Function Attrs: nounwind
 define void @t1() #0 {
 entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: subs [[REG0]], #32
-; CHECK-NEXT: ldrb
-; CHECK: subs [[REG1]], #32
-; CHECK-NEXT: strb
-    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 33, i32 4, i1 false)
+; CHECK-LABEL: t1
+; CHECK-NOT: ldm
+; CHECK-NOT: stm
+    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
     ret void
 }
 
 ; Function Attrs: nounwind
 define void @t2() #0 {
 entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: ldrh
-; CHECK: ldrb
-; CHECK: strb
-; CHECK: strh
+; CHECK-LABEL: t2:
+; CHECK-NOT: ldm
+; CHECK-NOT: stm
     tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
     ret void
 }