[llvm] 5aa8014 - [AVR] Remove faulty stack pushing behavior

Tue Jun 16 04:54:18 PDT 2020

Author: Ayke van Laethem
Date: 2020-06-16T13:53:32+02:00
New Revision: 5aa8014ca811c8132ff99921c1eb39940f98c7e9

URL: https://github.com/llvm/llvm-project/commit/5aa8014ca811c8132ff99921c1eb39940f98c7e9
DIFF: https://github.com/llvm/llvm-project/commit/5aa8014ca811c8132ff99921c1eb39940f98c7e9.diff

LOG: [AVR] Remove faulty stack pushing behavior

An instruction like this will need to allocate some stack space for the
last parameter:

  %x = call addrspace(1) i16 @bar(i64 undef, i64 undef, i16 undef, i16 0)

This worked fine when passing an actual value (in this case 0). However,
when passing undef, no value was pushed to the stack and therefore no
push instructions were created. This caused an unbalanced stack leading
to interesting results.

This commit fixes that by replacing the push logic with a regular stack
adjustment and stack-relative load/stores. This is less efficient but at
least it correctly compiles the code.

I can think of a few improvements in the future:

  * The stack should be adjusted in the function prologue when there
    are no allocas in the function.
  * Many (if not most) stack adjustments can be replaced by
    pushing/popping the values directly, exactly as the previous code
    attempted to do but did not do correctly.
  * Small stack adjustments can be done more efficiently with a few
    push/pop instructions (pushing/popping bogus values), both for code
    size and for speed.

All in all, as long as there are no allocas in the function I think that
it is almost always more efficient to emit regular push/pop
instructions. This is however left for future optimizations.

Differential Revision: https://reviews.llvm.org/D78581

Added: 
    

Modified: 
    llvm/lib/Target/AVR/AVRFrameLowering.cpp
    llvm/test/CodeGen/AVR/call.ll
    llvm/test/CodeGen/AVR/dynalloca.ll
    llvm/test/CodeGen/AVR/varargs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index a1cb29d08360..a2bc1d050fbb 100644

--- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -281,15 +281,10 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters(
 }
 
 /// Replace pseudo store instructions that pass arguments through the stack with
-/// real instructions. If insertPushes is true then all instructions are
-/// replaced with push instructions, otherwise regular std instructions are
-/// inserted.
+/// real instructions.
 static void fixStackStores(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
-                           const TargetInstrInfo &TII, bool insertPushes) {
-  const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
-  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-
+                           const TargetInstrInfo &TII, Register FP) {
   // Iterate through the BB until we hit a call instruction or we reach the end.
   for (auto I = MI, E = MBB.end(); I != E && !I->isCall();) {
     MachineBasicBlock::iterator NextMI = std::next(I);
@@ -304,29 +299,6 @@ static void fixStackStores(MachineBasicBlock &MBB,
 
     assert(MI.getOperand(0).getReg() == AVR::SP &&
            "Invalid register, should be SP!");
-    if (insertPushes) {
-      // Replace this instruction with a push.
-      Register SrcReg = MI.getOperand(2).getReg();
-      bool SrcIsKill = MI.getOperand(2).isKill();
-
-      // We can't use PUSHWRr here because when expanded the order of the new
-      // instructions are reversed from what we need. Perform the expansion now.
-      if (Opcode == AVR::STDWSPQRr) {
-        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
-            .addReg(TRI.getSubReg(SrcReg, AVR::sub_hi),
-                    getKillRegState(SrcIsKill));
-        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
-            .addReg(TRI.getSubReg(SrcReg, AVR::sub_lo),
-                    getKillRegState(SrcIsKill));
-      } else {
-        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
-            .addReg(SrcReg, getKillRegState(SrcIsKill));
-      }
-
-      MI.eraseFromParent();
-      I = NextMI;
-      continue;
-    }
 
     // Replace this instruction with a regular store. Use Y as the base
     // pointer since it is guaranteed to contain a copy of SP.
@@ -334,7 +306,7 @@ static void fixStackStores(MachineBasicBlock &MBB,
         (Opcode == AVR::STDWSPQRr) ? AVR::STDWPtrQRr : AVR::STDPtrQRr;
 
     MI.setDesc(TII.get(STOpc));
-    MI.getOperand(0).setReg(AVR::R29R28);
+    MI.getOperand(0).setReg(FP);
 
     I = NextMI;
   }
@@ -350,7 +322,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
   // function entry. Delete the call frame pseudo and replace all pseudo stores
   // with real store instructions.
   if (hasReservedCallFrame(MF)) {
-    fixStackStores(MBB, MI, TII, false);
+    fixStackStores(MBB, MI, TII, AVR::R29R28);
     return MBB.erase(MI);
   }
 
@@ -358,18 +330,37 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
   unsigned int Opcode = MI->getOpcode();
   int Amount = TII.getFrameSize(*MI);
 
-  // Adjcallstackup does not need to allocate stack space for the call, instead
-  // we insert push instructions that will allocate the necessary stack.
-  // For adjcallstackdown we convert it into an 'adiw reg, <amt>' handling
-  // the read and write of SP in I/O space.
+  // ADJCALLSTACKUP and ADJCALLSTACKDOWN are converted to adiw/subi
+  // instructions to read and write the stack pointer in I/O space.
   if (Amount != 0) {
     assert(getStackAlign() == Align(1) && "Unsupported stack alignment");
 
     if (Opcode == TII.getCallFrameSetupOpcode()) {
-      fixStackStores(MBB, MI, TII, true);
+      // Update the stack pointer.
+      // In many cases this can be done far more efficiently by pushing the
+      // relevant values directly to the stack. However, doing that correctly
+      // (in the right order, possibly skipping some empty space for undef
+      // values, etc) is tricky and thus left to be optimized in the future.
+      BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP);
+
+      MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30)
+                              .addReg(AVR::R31R30, RegState::Kill)
+                              .addImm(Amount);
+      New->getOperand(3).setIsDead();
+
+      BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP)
+          .addReg(AVR::R31R30, RegState::Kill);
+
+      // Make sure the remaining stack stores are converted to real store
+      // instructions.
+      fixStackStores(MBB, MI, TII, AVR::R31R30);
     } else {
       assert(Opcode == TII.getCallFrameDestroyOpcode());
 
+      // Note that small stack changes could be implemented more efficiently
+      // with a few pop instructions instead of the 8-9 instructions now
+      // required.
+
       // Select the best opcode to adjust SP based on the offset size.
       unsigned addOpcode;
       if (isUInt<6>(Amount)) {

diff  --git a/llvm/test/CodeGen/AVR/call.ll b/llvm/test/CodeGen/AVR/call.ll
index 6ce0399b27cb..0501ed53fede 100644
--- a/llvm/test/CodeGen/AVR/call.ll
+++ b/llvm/test/CodeGen/AVR/call.ll
@@ -32,8 +32,8 @@ define i8 @calli8_stack() {
 ; CHECK-LABEL: calli8_stack:
 ; CHECK: ldi [[REG1:r[0-9]+]], 10
 ; CHECK: ldi [[REG2:r[0-9]+]], 11
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: call foo8_3
     %result1 = call i8 @foo8_3(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11)
     ret i8 %result1
@@ -54,12 +54,12 @@ define i16 @calli16_stack() {
 ; CHECK-LABEL: calli16_stack:
 ; CHECK: ldi [[REG1:r[0-9]+]], 9
 ; CHECK: ldi [[REG2:r[0-9]+]], 2 
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 10
 ; CHECK: ldi [[REG2:r[0-9]+]], 2
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: call foo16_2
     %result1 = call i16 @foo16_2(i16 512, i16 513, i16 514, i16 515, i16 516, i16 517, i16 518, i16 519, i16 520, i16 521, i16 522)
     ret i16 %result1
@@ -84,12 +84,12 @@ define i32 @calli32_stack() {
 ; CHECK-LABEL: calli32_stack:
 ; CHECK: ldi [[REG1:r[0-9]+]], 64
 ; CHECK: ldi [[REG2:r[0-9]+]], 66
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 15
 ; CHECK: ldi [[REG2:r[0-9]+]], 2
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: call foo32_2
     %result1 = call i32 @foo32_2(i32 1, i32 2, i32 3, i32 4, i32 34554432)
     ret i32 %result1
@@ -115,20 +115,20 @@ define i64 @calli64_stack() {
 
 ; CHECK: ldi [[REG1:r[0-9]+]], 76
 ; CHECK: ldi [[REG2:r[0-9]+]], 73
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 31
 ; CHECK: ldi [[REG2:r[0-9]+]], 242
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+8, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 155
 ; CHECK: ldi [[REG2:r[0-9]+]], 88
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 255
 ; CHECK: ldi [[REG2:r[0-9]+]], 255
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: call foo64_2
     %result1 = call i64 @foo64_2(i64 1, i64 2, i64 17446744073709551615)
     ret i64 %result1

diff  --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
index 6aa776e2de6f..f314fb06f336 100644
--- a/llvm/test/CodeGen/AVR/dynalloca.ll
+++ b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -53,9 +53,27 @@ define void @dynalloca2(i16 %x) {
 ; CHECK-LABEL: dynalloca2:
 ; CHECK: in [[SPCOPY1:r[0-9]+]], 61
 ; CHECK: in [[SPCOPY2:r[0-9]+]], 62
-; CHECK: push
-; CHECK-NOT: st
-; CHECK-NOT: std
+; Allocate stack space for call
+; CHECK: in {{.*}}, 61
+; CHECK: in {{.*}}, 62
+; CHECK: subi
+; CHECK: sbci
+; CHECK: in r0, 63
+; CHECK-NEXT: cli
+; CHECK-NEXT: out 62, {{.*}}
+; CHECK-NEXT: out 63, r0
+; CHECK-NEXT: out 61, {{.*}}
+; Store values on the stack
+; CHECK: ldi r16, 0
+; CHECK: ldi r17, 0
+; CHECK: std Z+5, r16
+; CHECK: std Z+6, r17
+; CHECK: std Z+7, r16
+; CHECK: std Z+8, r17
+; CHECK: std Z+3, r16
+; CHECK: std Z+4, r17
+; CHECK: std Z+1, r16
+; CHECK: std Z+2, r17
 ; CHECK: call
 ; Call frame restore
 ; CHECK-NEXT: in r30, 61

diff  --git a/llvm/test/CodeGen/AVR/varargs.ll b/llvm/test/CodeGen/AVR/varargs.ll
index c2046d84b48b..a743374db742 100644
--- a/llvm/test/CodeGen/AVR/varargs.ll
+++ b/llvm/test/CodeGen/AVR/varargs.ll
@@ -42,16 +42,16 @@ define void @varargcall() {
 ; CHECK-LABEL: varargcall:
 ; CHECK: ldi [[REG1:r[0-9]+]], 189
 ; CHECK: ldi [[REG2:r[0-9]+]], 205
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 191
 ; CHECK: ldi [[REG2:r[0-9]+]], 223
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 205
 ; CHECK: ldi [[REG2:r[0-9]+]], 171
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: call
 ; CHECK: adiw r30, 6
   tail call void (i16, ...) @var1223(i16 -21555, i16 -12867, i16 -8257)