[llvm] 112d769 - [ARM] generate correct code for armv6-m XO big stack operations

Tue Jul 4 02:40:12 PDT 2023

Author: Ties Stuij
Date: 2023-07-04T10:40:06+01:00
New Revision: 112d769e5ee8b7e961122ead3943f06119c05c5b

URL: https://github.com/llvm/llvm-project/commit/112d769e5ee8b7e961122ead3943f06119c05c5b
DIFF: https://github.com/llvm/llvm-project/commit/112d769e5ee8b7e961122ead3943f06119c05c5b.diff

LOG: [ARM] generate correct code for armv6-m XO big stack operations

The ARM backend codebase is dotted with places where armv6-m will generate
constant pools. Now that we can generate execute-only code for armv6-m, we need
to make sure those places use the movs/lsls/adds/lsls/adds/lsls/adds pattern
instead of a constant pool load.

Big stacks are one of the obvious places. This patch takes care of two
sites:
1. big stacks in the prologue/epilogue
2. store/tSTRspi nodes, which implicitly fixes
   emitThumbRegPlusImmInReg, which is used in several frame lowering functions

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D154233

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMAsmPrinter.cpp
    llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
    llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
    llvm/test/CodeGen/ARM/large-stack.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index b1074d1265ff9b..69df1d12aa8e28 100644

--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1140,10 +1140,24 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
   case ARM::tLDRpci:
   case ARM::t2MOVi16:
   case ARM::t2MOVTi16:
+  case ARM::tMOVi8:
+  case ARM::tADDi8:
+  case ARM::tLSLri:
     // special cases:
     // 1) for Thumb1 code we sometimes materialize the constant via constpool
     //    load.
-    // 2) for Thumb2 execute only code we materialize the constant via
+    // 2) for Thumb1 execute only code we materialize the constant via the
+    //    following pattern:
+    //        movs    r3, #:upper8_15:<const>
+    //        lsls    r3, #8
+    //        adds    r3, #:upper0_7:<const>
+    //        lsls    r3, #8
+    //        adds    r3, #:lower8_15:<const>
+    //        lsls    r3, #8
+    //        adds    r3, #:lower0_7:<const>
+    //    So we need to special-case MOVS, ADDS and LSLS, and keep track of
+    //    where we are in the sequence with the simplest of state machines.
+    // 3) for Thumb2 execute only code we materialize the constant via
     //    immediate constants in 2 separate instructions (MOVW/MOVT).
     SrcReg = ~0U;
     DstReg = MI->getOperand(0).getReg();
@@ -1334,6 +1348,23 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
         Offset = MI->getOperand(2).getImm();
         AFI->EHPrologueOffsetInRegs[DstReg] |= (Offset << 16);
         break;
+      case ARM::tMOVi8:
+        Offset = MI->getOperand(2).getImm();
+        AFI->EHPrologueOffsetInRegs[DstReg] = Offset;
+        break;
+      case ARM::tLSLri:
+        assert(MI->getOperand(3).getImm() == 8 &&
+               "The shift amount is not equal to 8");
+        assert(MI->getOperand(2).getReg() == MI->getOperand(0).getReg() &&
+               "The source register is not equal to the destination register");
+        AFI->EHPrologueOffsetInRegs[DstReg] <<= 8;
+        break;
+      case ARM::tADDi8:
+        assert(MI->getOperand(2).getReg() == MI->getOperand(0).getReg() &&
+               "The source register is not equal to the destination register");
+        Offset = MI->getOperand(3).getImm();
+        AFI->EHPrologueOffsetInRegs[DstReg] += Offset;
+        break;
       case ARM::t2PAC:
       case ARM::t2PACBTI:
         AFI->EHPrologueRemappedRegs[ARM::R12] = ARM::RA_AUTH_CODE;

diff  --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 9855e48b623e47..c2962c4857c3c0 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -81,8 +81,9 @@ emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
     MachineFunction &MF = *MBB.getParent();
     const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
     if (ST.genExecuteOnly()) {
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ScratchReg)
-        .addImm(NumBytes).setMIFlags(MIFlags);
+      unsigned XOInstr = ST.useMovt() ? ARM::t2MOVi32imm : ARM::tMOVi32imm;
+      BuildMI(MBB, MBBI, dl, TII.get(XOInstr), ScratchReg)
+          .addImm(NumBytes).setMIFlags(MIFlags);
     } else {
       MRI.emitLoadConstPool(MBB, MBBI, dl, ScratchReg, 0, NumBytes, ARMCC::AL,
                             0, MIFlags);

diff  --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index a29095e6b81af3..0c010ed1eb3423 100644
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -116,9 +116,10 @@ void ThumbRegisterInfo::emitLoadConstPool(
                                  PredReg, MIFlags);
 }
 
-/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
-/// in a register using mov / mvn sequences or load the immediate from a
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize a
+/// destreg = basereg + immediate in Thumb code. Materialize the immediate in a
+/// register using mov / mvn (armv6-M >) sequences, movs / lsls / adds / lsls /
+/// adds / lsls / adds sequences (armv6-M) or load the immediate from a
 /// constpool entry.
 static void emitThumbRegPlusImmInReg(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
@@ -159,7 +160,8 @@ static void emitThumbRegPlusImmInReg(
         .addReg(LdReg, RegState::Kill)
         .setMIFlags(MIFlags);
   } else if (ST.genExecuteOnly()) {
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), LdReg)
+    unsigned XOInstr = ST.useMovt() ? ARM::t2MOVi32imm : ARM::tMOVi32imm;
+    BuildMI(MBB, MBBI, dl, TII.get(XOInstr), LdReg)
       .addImm(NumBytes).setMIFlags(MIFlags);
   } else
     MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, ARMCC::AL, 0,

diff  --git a/llvm/test/CodeGen/ARM/large-stack.ll b/llvm/test/CodeGen/ARM/large-stack.ll
index aedd8aa911fe27..e7bed0b5e36ccc 100644
--- a/llvm/test/CodeGen/ARM/large-stack.ll
+++ b/llvm/test/CodeGen/ARM/large-stack.ll
@@ -1,21 +1,54 @@
 ; RUN: llc -mtriple=arm-eabi %s -o /dev/null
-; RUN: llc -mtriple=thumbv6m-eabi -mattr=+execute-only %s -o -
+; RUN: llc -mtriple=thumbv6m-eabi -mattr=+execute-only %s -o - -filetype=obj | \
+; RUN:   llvm-objdump -d --no-leading-addr --no-show-raw-insn - | FileCheck %s
 
 define void @test1() {
-    %tmp = alloca [ 64 x i32 ] , align 4
+; CHECK-LABEL: <test1>:
+;; are we using correct prologue immediate materialization pattern for
+;; execute only
+; CHECK: sub     sp, #0x100
+%tmp = alloca [ 64 x i32 ] , align 4
     ret void
 }
 
 define void @test2() {
+; CHECK-LABEL: <test2>:
+;; are we using correct prologue immediate materialization pattern for
+;; execute-only
+; CHECK:      movs    [[REG:r[0-9]+]], #0xff
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0xff
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0xef
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0xb8
     %tmp = alloca [ 4168 x i8 ] , align 4
     ret void
 }
 
 define i32 @test3() {
+;; are we using correct prologue immediate materialization pattern for
+;; execute-only
+; CHECK-LABEL: <test3>:
+; CHECK: movs [[REG:r[0-9]+]], #0xcf
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0xff
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0xff
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0xf0
 	%retval = alloca i32, align 4
 	%tmp = alloca i32, align 4
-	%a = alloca [805306369 x i8], align 16
+	%a = alloca [u0x30000001 x i8], align 16
 	store i32 0, ptr %tmp
+;; are we choosing correct store/tSTRspi pattern for execute-only
+; CHECK:      movs    [[REG:r[0-9]+]], #0x30
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0x0
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0x0
+; CHECK-NEXT: lsls    [[REG]], [[REG]], #0x8
+; CHECK-NEXT: adds    [[REG]], #0x8
 	%tmp1 = load i32, ptr %tmp
         ret i32 %tmp1
 }