[llvm] 73d92fa - [CodeGen] Emit alignment "Max Skip" operand

Nicholas Guy via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 5 04:54:35 PST 2022


Author: Nicholas Guy
Date: 2022-01-05T12:54:30Z
New Revision: 73d92faa2fc00bff240a22a51974771cd03ed86a

URL: https://github.com/llvm/llvm-project/commit/73d92faa2fc00bff240a22a51974771cd03ed86a
DIFF: https://github.com/llvm/llvm-project/commit/73d92faa2fc00bff240a22a51974771cd03ed86a.diff

LOG: [CodeGen] Emit alignment "Max Skip" operand

The AsmPrinter already supports emitting the "Max Skip" operand (the third
operand of .p2align), but there is currently no way for it to be specified.
Adding MaxBytesForAlignment to MachineBasicBlock provides this capability on a
per-block basis. Leaving the value at its default (0) causes no observable
difference in behaviour.

Differential Revision: https://reviews.llvm.org/D114590
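
As an illustration of the new per-block interface (this sketch is not part of
the patch; the helper name is hypothetical), a pass or target could request a
32-byte block alignment while capping the padding at 8 bytes like this:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/Alignment.h"

// Hypothetical helper: request 2^5 = 32-byte alignment for the block, but
// allow at most 8 bytes of padding. If reaching the alignment would take
// more than 8 bytes, no padding is emitted for this block at all.
static void requestCappedAlignment(llvm::MachineBasicBlock &MBB) {
  MBB.setAlignment(llvm::Align(32), /*MaxBytes=*/8);

  // Equivalent two-step form using the individual setters:
  //   MBB.setAlignment(llvm::Align(32));
  //   MBB.setMaxBytesForAlignment(8);
}

With a non-zero cap the AsmPrinter emits the three-operand form of the
directive, e.g. ".p2align 5, 0x0, 8" as checked in the new AArch64 test; with
the default cap of 0 it keeps emitting the plain ".p2align 5". For testing, the
cap can also be forced globally via the new -max-bytes-for-alignment option
handled in MachineBlockPlacement (see the RUN lines in the added test).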

Added: 
    llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll

Modified: 
    llvm/include/llvm/CodeGen/AsmPrinter.h
    llvm/include/llvm/CodeGen/MachineBasicBlock.h
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
    llvm/lib/CodeGen/MachineBlockPlacement.cpp
    llvm/lib/CodeGen/TargetLoweringBase.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 281ecb8de251..8608c908c108 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -431,7 +431,8 @@ class AsmPrinter : public MachineFunctionPass {
   /// global value is specified, and if that global has an explicit alignment
   /// requested, it will override the alignment request if required for
   /// correctness.
-  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr) const;
+  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr,
+                     unsigned MaxBytesToEmit = 0) const;
 
   /// Lower the specified LLVM Constant to an MCExpr.
   virtual const MCExpr *lowerConstant(const Constant *CV);

diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index efe22ea8f332..638b6732a543 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -136,6 +136,10 @@ class MachineBasicBlock
   /// Alignment of the basic block. One if the basic block does not need to be
   /// aligned.
   Align Alignment;
+  /// Maximum amount of bytes that can be added to align the basic block. If the
+  /// alignment cannot be reached in this many bytes, no bytes are emitted.
+  /// Zero to represent no maximum.
+  unsigned MaxBytesForAlignment = 0;
 
   /// Indicate that this basic block is entered via an exception handler.
   bool IsEHPad = false;
@@ -521,6 +525,19 @@ class MachineBasicBlock
   /// Set alignment of the basic block.
   void setAlignment(Align A) { Alignment = A; }
 
+  void setAlignment(Align A, unsigned MaxBytes) {
+    setAlignment(A);
+    setMaxBytesForAlignment(MaxBytes);
+  }
+
+  /// Return the maximum amount of padding allowed for aligning the basic block.
+  unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; }
+
+  /// Set the maximum amount of padding allowed for aligning the basic block
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }
+
   /// Returns true if the block is a landing pad. That is this basic block is
   /// entered via an exception handler.
   bool isEHPad() const { return IsEHPad; }

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index aef7973ff20f..56042c726058 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1801,11 +1801,14 @@ class TargetLoweringBase {
   /// Return the preferred loop alignment.
   virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const;
 
+  /// Return the maximum amount of bytes allowed to be emitted when padding for
+  /// alignment
+  virtual unsigned
+  getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const;
+
   /// Should loops be aligned even when the function is marked OptSize (but not
   /// MinSize).
-  virtual bool alignLoopsWithOptSize() const {
-    return false;
-  }
+  virtual bool alignLoopsWithOptSize() const { return false; }
 
   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.
@@ -2340,6 +2343,9 @@ class TargetLoweringBase {
   /// means the target does not care about loop alignment. The target may also
   /// override getPrefLoopAlignment to provide per-loop values.
   void setPrefLoopAlignment(Align Alignment) { PrefLoopAlignment = Alignment; }
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }
 
   /// Set the minimum stack alignment of an argument.
   void setMinStackArgumentAlignment(Align Alignment) {
@@ -3029,6 +3035,8 @@ class TargetLoweringBase {
 
   /// The preferred loop alignment (in log2 bot in bytes).
   Align PrefLoopAlignment;
+  /// The maximum amount of bytes permitted to be emitted for alignment.
+  unsigned MaxBytesForAlignment;
 
   /// Size in bits of the maximum atomics size the backend supports.
   /// Accesses larger than this will be expanded by AtomicExpandPass.

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 533f20535655..61c2fbc59b5a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2477,7 +2477,8 @@ void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
 // two boundary.  If a global value is specified, and if that global has
 // an explicit alignment requested, it will override the alignment request
 // if required for correctness.
-void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV,
+                               unsigned MaxBytesToEmit) const {
   if (GV)
     Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
 
@@ -2490,9 +2491,9 @@ void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
       STI = &getSubtargetInfo();
     else
       STI = TM.getMCSubtargetInfo();
-    OutStreamer->emitCodeAlignment(Alignment.value(), STI);
+    OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit);
   } else
-    OutStreamer->emitValueToAlignment(Alignment.value());
+    OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit);
 }
 
 //===----------------------------------------------------------------------===//
@@ -3286,7 +3287,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
   // Emit an alignment directive for this block, if needed.
   const Align Alignment = MBB.getAlignment();
   if (Alignment != Align(1))
-    emitAlignment(Alignment);
+    emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment());
 
   // Switch to a new section if this basic block must begin a section. The
   // entry block is always placed in the function section and is handled

diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 692587cd58fa..c93ffaabf74c 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -96,6 +96,12 @@ static cl::opt<unsigned> AlignAllNonFallThruBlocks(
              "format (e.g 4 means align on 16B boundaries)."),
     cl::init(0), cl::Hidden);
 
+static cl::opt<unsigned> MaxBytesForAlignmentOverride(
+    "max-bytes-for-alignment",
+    cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
+             "alignment"),
+    cl::init(0), cl::Hidden);
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned> ExitBlockBias(
     "block-placement-exit-block-bias",
@@ -2929,10 +2935,21 @@ void MachineBlockPlacement::alignBlocks() {
     MachineBasicBlock *LayoutPred =
         &*std::prev(MachineFunction::iterator(ChainBB));
 
+    auto DetermineMaxAlignmentPadding = [&]() {
+      // Set the maximum bytes allowed to be emitted for alignment.
+      unsigned MaxBytes;
+      if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
+        MaxBytes = MaxBytesForAlignmentOverride;
+      else
+        MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
+      ChainBB->setMaxBytesForAlignment(MaxBytes);
+    };
+
     // Force alignment if all the predecessors are jumps. We already checked
     // that the block isn't cold above.
     if (!LayoutPred->isSuccessor(ChainBB)) {
       ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
       continue;
     }
 
@@ -2943,8 +2960,10 @@ void MachineBlockPlacement::alignBlocks() {
     BranchProbability LayoutProb =
         MBPI->getEdgeProbability(LayoutPred, ChainBB);
     BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
-    if (LayoutEdgeFreq <= (Freq * ColdProb))
+    if (LayoutEdgeFreq <= (Freq * ColdProb)) {
       ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
+    }
   }
 }
 
@@ -3418,17 +3437,30 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   ComputedEdges.clear();
   ChainAllocator.DestroyAll();
 
+  bool HasMaxBytesOverride =
+      MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
+
   if (AlignAllBlock)
     // Align all of the blocks in the function to a specific alignment.
-    for (MachineBasicBlock &MBB : MF)
-      MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    for (MachineBasicBlock &MBB : MF) {
+      if (HasMaxBytesOverride)
+        MBB.setAlignment(Align(1ULL << AlignAllBlock),
+                         MaxBytesForAlignmentOverride);
+      else
+        MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    }
   else if (AlignAllNonFallThruBlocks) {
     // Align all of the blocks that have no fall-through predecessors to a
     // specific alignment.
     for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
       auto LayoutPred = std::prev(MBI);
-      if (!LayoutPred->isSuccessor(&*MBI))
-        MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      if (!LayoutPred->isSuccessor(&*MBI)) {
+        if (HasMaxBytesOverride)
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
+                            MaxBytesForAlignmentOverride);
+        else
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      }
     }
   }
   if (ViewBlockLayoutWithBFI != GVDT_None &&

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6fc6881f8736..f4cfe4f341cb 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2040,6 +2040,11 @@ Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
   return PrefLoopAlignment;
 }
 
+unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment(
+    MachineBasicBlock *MBB) const {
+  return MaxBytesForAlignment;
+}
+
 //===----------------------------------------------------------------------===//
 //  Reciprocal Estimates
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
new file mode 100644
index 000000000000..58c638e70e9e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-EXPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-IMPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64  -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-IMPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64  -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-EXPLICIT
+
+; This test is checking that the correct operands to the .p2align are emitted correctly, and that the resulting obj
+; is padded as expected. The key interest in the CHECK-OBJ-* sections is the size of the padding region (the nops),
+; and not the exact instructions either side of them (But the last instruction of the EXPLICIT and IMPLICIT checks
+; should be the same, at different locations)
+define i32 @a(i32 %x, i32* nocapture readonly %y, i32* nocapture readonly %z) {
+; CHECK-LABEL: a:
+; CHECK-EXPLICIT:    .p2align 5, 0x0, 8
+; CHECK-IMPLICIT:    .p2align 5
+; CHECK-NEXT:  .LBB0_5: // %vector.body
+; CHECK-EXPLICIT:    .p2align 5, 0x0, 8
+; CHECK-IMPLICIT:    .p2align 5
+; CHECK-NEXT:  .LBB0_8: // %for.body
+; CHECK-OBJ: Disassembly of section .text:
+; CHECK-OBJ:               88: 2a 00 0a 8b   add
+; CHECK-OBJ-IMPLICIT-NEXT: 8c: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 90: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 94: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 98: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 9c: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: a0: 4b 45 40 b8   ldr
+; CHECK-OBJ-EXPLICIT-NEXT: 8c: 4b 45 40 b8   ldr
+entry:
+  %cmp10 = icmp sgt i32 %x, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %x to i64
+  %min.iters.check = icmp ult i32 %x, 8
+  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %10, %vector.body ]
+  %vec.phi13 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %y, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = getelementptr inbounds i32, i32* %0, i64 4
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load14 = load <4 x i32>, <4 x i32>* %3, align 4
+  %4 = getelementptr inbounds i32, i32* %z, i64 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  %wide.load15 = load <4 x i32>, <4 x i32>* %5, align 4
+  %6 = getelementptr inbounds i32, i32* %4, i64 4
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load16 = load <4 x i32>, <4 x i32>* %7, align 4
+  %8 = add <4 x i32> %wide.load, %vec.phi
+  %9 = add <4 x i32> %wide.load14, %vec.phi13
+  %10 = add <4 x i32> %8, %wide.load15
+  %11 = add <4 x i32> %9, %wide.load16
+  %index.next = add nuw i64 %index, 8
+  %12 = icmp eq i64 %index.next, %n.vec
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <4 x i32> %11, %10
+  %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
+
+for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  %b.011.ph = phi i32 [ 0, %for.body.preheader ], [ %13, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  %b.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ], [ %add3, %for.body ]
+  ret i32 %b.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader17, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
+  %b.011 = phi i32 [ %add3, %for.body ], [ %b.011.ph, %for.body.preheader17 ]
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %14 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %z, i64 %indvars.iv
+  %15 = load i32, i32* %arrayidx2, align 4
+  %add = add i32 %14, %b.011
+  %add3 = add i32 %add, %15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)


More information about the llvm-commits mailing list