[llvm] [AArch64] New pass for code layout optimizations. (PR #184434)

Ahmad Yasin via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 11 02:50:56 PDT 2026


https://github.com/ayasin-a updated https://github.com/llvm/llvm-project/pull/184434

>From c06142a4b8027a315975b134a119a17e9b18a6ef Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 3 Mar 2026 22:32:54 +0200
Subject: [PATCH 1/4] New AArch64CodeLayoutOpt pass for code layout
 optimizations.

For simplicity, this initial version induces function alignment when a pair is
detected, for two cases:
I.  FCMP-FCSEL
II. CMP/CMN-CSEL, 32-bit only

Beyond improving performance, this optimization helps reduce noise due to code layout
and thus stabilizes performance. For example, knock-on effects on a "sensitive function"
won't be triggered by codegen changes outside it.

Each case can be enabled/disabled individually through the -aarch64-code-layout-opt bitmask.
---
 llvm/lib/Target/AArch64/AArch64.h             |   2 +
 .../Target/AArch64/AArch64CodeLayoutOpt.cpp   | 225 +++++++++++++++++
 llvm/lib/Target/AArch64/AArch64Features.td    |   4 +
 llvm/lib/Target/AArch64/AArch64Processors.td  |   4 +
 .../Target/AArch64/AArch64TargetMachine.cpp   |   3 +
 llvm/lib/Target/AArch64/CMakeLists.txt        |   1 +
 llvm/test/CodeGen/AArch64/code-layout-opt.ll  | 233 ++++++++++++++++++
 7 files changed, 472 insertions(+)
 create mode 100644 llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
 create mode 100644 llvm/test/CodeGen/AArch64/code-layout-opt.ll

diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 40983714ddf1d..4683104ab7633 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -55,6 +55,7 @@ FunctionPass *createFalkorHWPFFixPass();
 FunctionPass *createFalkorMarkStridedAccessesPass();
 FunctionPass *createAArch64PointerAuthPass();
 FunctionPass *createAArch64BranchTargetsPass();
+FunctionPass *createAArch64CodeLayoutOptPass();
 FunctionPass *createAArch64MIPeepholeOptPass();
 FunctionPass *createAArch64PostCoalescerPass();
 
@@ -96,6 +97,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
 void initializeAArch64ExpandPseudoPass(PassRegistry &);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64CodeLayoutOptPass(PassRegistry &);
 void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
 void initializeAArch64PostCoalescerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
new file mode 100644
index 0000000000000..00810ed2198ad
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
@@ -0,0 +1,225 @@
+//===-- AArch64CodeLayoutOpt.cpp - Code Layout Optimizations --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass runs after instruction scheduling and employs code layout
+// optimizations for certain patterns.
+//
+// Option -aarch64-code-layout-opt is a bitmask enable for instruction pairs of:
+//   Bit 0 (0x1): Enable FCMP-FCSEL code layout optimization
+//   Bit 1 (0x2): Enable CMP/CMN-CSEL code layout optimization
+//
+// The initial implementation induces function alignment to help optimize
+// code layout for the detected patterns.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-code-layout-opt"
+#define AARCH64_CODE_LAYOUT_OPT_NAME "AArch64 Code Layout Optimization"
+
+// Bitmask option for code alignment optimization:
+//   Bit 0 (0x1): Enable FCMP-FCSEL code layout optimization (requires
+//                hasFuseFCmpFCSel)
+//   Bit 1 (0x2): Enable CMP-CSEL code layout optimization,
+//                32-bit only (requires hasFuseCmpCSel)
+static cl::opt<unsigned> EnableCodeAlignment(
+    "aarch64-code-layout-opt", cl::Hidden,
+    cl::desc("Enable code alignment optimization for instruction pairs "
+             "(bitmask: bit 0 = FCMP-FCSEL, bit 1 = CMP-CSEL)"),
+    cl::init(0));
+
+static cl::opt<unsigned> FunctionAlignBytes(
+    "aarch64-code-layout-opt-align-functions", cl::Hidden,
+    cl::desc("Function alignment in bytes for code layout optimization "
+             "(must be a power of 2)"),
+    cl::init(64), cl::callback([](const unsigned &Val) {
+      if (!isPowerOf2_32(Val))
+        report_fatal_error(
+            "aarch64-code-layout-opt-align must be a power of 2");
+    }));
+
+STATISTIC(NumFunctionsAligned,
+          "Number of functions with aligned (to 64-bytes by default)");
+STATISTIC(NumFcmpFcselPairsDetected,
+          "Number of FCMP-FCSEL pairs detected for alignment");
+STATISTIC(NumCmpCselPairsDetected,
+          "Number of CMP/CMN-CSEL pairs detected for alignment");
+
+namespace {
+
+class AArch64CodeLayoutOpt : public MachineFunctionPass {
+public:
+  static char ID;
+  AArch64CodeLayoutOpt() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return AARCH64_CODE_LAYOUT_OPT_NAME;
+  }
+
+private:
+  const AArch64InstrInfo *TII = nullptr;
+
+  // Returns true if MBB contains at least one layout-sensitive pattern.
+  bool detectLayoutSensitivePattern(MachineBasicBlock *MBB);
+
+  bool optimizeForCodeAlignment(MachineFunction &MF);
+};
+
+} // end anonymous namespace
+
+char AArch64CodeLayoutOpt::ID = 0;
+
+INITIALIZE_PASS(AArch64CodeLayoutOpt, "aarch64-code-layout-opt",
+                AARCH64_CODE_LAYOUT_OPT_NAME, false, false)
+
+void AArch64CodeLayoutOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createAArch64CodeLayoutOptPass() {
+  return new AArch64CodeLayoutOpt();
+}
+
+bool AArch64CodeLayoutOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (!EnableCodeAlignment)
+    return false;
+
+  const auto *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+  TII = Subtarget->getInstrInfo();
+
+  const unsigned Mask = EnableCodeAlignment;
+  if (!((Mask & 0x1) && Subtarget->hasFuseFCmpFCSel()) &&
+      !((Mask & 0x2) && Subtarget->hasFuseCmpCSel()))
+    return false;
+
+  return optimizeForCodeAlignment(MF);
+}
+
+// Returns true if MBB contains at least one layout-sensitive pair.
+// A pair is: a qualifying lead instruction immediately followed by its
+// consumer (FCMP→FCSEL or CMP/CMN→CSEL), with no intervening instructions.
+bool AArch64CodeLayoutOpt::detectLayoutSensitivePattern(
+    MachineBasicBlock *MBB) {
+  MachineInstr *PendingFCMPInstr = nullptr;
+  MachineInstr *PendingCMPInstr = nullptr;
+
+  for (auto &MI : instructionsWithoutDebug(MBB->begin(), MBB->end())) {
+    if (MI.isMetaInstruction())
+      continue;
+
+    unsigned Opc = MI.getOpcode();
+
+    // --- FCMP-FCSEL detection (bit 0) ---
+    if (EnableCodeAlignment & 0x1) {
+      switch (Opc) {
+      case AArch64::FCMPSrr:
+      case AArch64::FCMPDrr:
+      case AArch64::FCMPESrr:
+      case AArch64::FCMPEDrr:
+      case AArch64::FCMPHrr:
+      case AArch64::FCMPEHrr:
+        PendingFCMPInstr = &MI;
+        break;
+      case AArch64::FCSELSrrr:
+      case AArch64::FCSELDrrr:
+      case AArch64::FCSELHrrr:
+        if (PendingFCMPInstr) {
+          ++NumFcmpFcselPairsDetected;
+          return true;
+        }
+        PendingFCMPInstr = nullptr;
+        break;
+      default:
+        PendingFCMPInstr = nullptr;
+        break;
+      }
+    }
+
+    // --- CMP/CMN-CSEL detection (bit 1) ---
+    // CMP is encoded as SUBS with WZR destination (32-bit only).
+    // CMN is encoded as ADDS with WZR destination (32-bit only).
+    // Only simple variants (no shifted/extended reg) qualify.
+    if (EnableCodeAlignment & 0x2) {
+      bool IsCMP = false;
+      switch (Opc) {
+      case AArch64::SUBSWrr:
+      case AArch64::ADDSWrr:
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr);
+        break;
+      case AArch64::SUBSWri:
+      case AArch64::ADDSWri:
+        // Only CMP/CMN #imm (no LSL #12 shift) with small immediates (<=15)
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) &&
+                MI.getOperand(3).getImm() == 0 &&
+                MI.getOperand(2).getImm() <= 15;
+        break;
+      case AArch64::SUBSWrs:
+      case AArch64::ADDSWrs:
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) &&
+                !AArch64InstrInfo::hasShiftedReg(MI);
+        break;
+      case AArch64::SUBSWrx:
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) &&
+                !AArch64InstrInfo::hasExtendedReg(MI);
+        break;
+      case AArch64::CSELWr:
+        if (PendingCMPInstr) {
+          ++NumCmpCselPairsDetected;
+          return true;
+        }
+        PendingCMPInstr = nullptr;
+        break;
+      default:
+        break;
+      }
+
+      if (IsCMP)
+        PendingCMPInstr = &MI;
+      else if (Opc != AArch64::CSELWr)
+        PendingCMPInstr = nullptr;
+    }
+  }
+
+  return false;
+}
+
+bool AArch64CodeLayoutOpt::optimizeForCodeAlignment(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE ": optimizeForCodeAlignment: " << MF.getName()
+                    << "\n");
+
+  for (auto &MBB : MF) {
+    if (!detectLayoutSensitivePattern(&MBB))
+      continue;
+
+    if (MF.getAlignment() >= Align(FunctionAlignBytes)) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE ": Function " << MF.getName()
+                        << " already has sufficient alignment\n");
+      return false;
+    }
+
+    MF.setAlignment(Align(FunctionAlignBytes));
+    ++NumFunctionsAligned;
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE ": Set " << FunctionAlignBytes
+                      << "-byte alignment for function " << MF.getName()
+                      << "\n");
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index faee640a910d0..7e3c8097ef830 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -792,6 +792,10 @@ def FeatureFuseCmpCSet : SubtargetFeature<
     "fuse-cset", "HasFuseCmpCSet", "true",
     "CPU can fuse CMP and CSET operations">;
 
+def FeatureFuseFCmpFCSel : SubtargetFeature<
+    "fuse-fcsel", "HasFuseFCmpFCSel", "true",
+    "CPU can fuse FCMP and FCSEL operations">;
+
 def FeatureFuseCryptoEOR : SubtargetFeature<
     "fuse-crypto-eor", "HasFuseCryptoEOR", "true",
     "CPU fuses AES/PMULL and EOR operations">;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 7791e556f3b13..94617ca1ad244 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -527,6 +527,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
                                     FeatureFuseCmpCSel,
+                                    FeatureFuseFCmpFCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
@@ -550,6 +551,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
                                     FeatureFuseCmpCSel,
+                                    FeatureFuseFCmpFCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
@@ -573,6 +575,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      FeatureFuseAES,
                                      FeatureFuseArithmeticLogic,
                                      FeatureFuseCmpCSel,
+                                     FeatureFuseFCmpFCSel,
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseLiterals,
                                      FeatureZCRegMoveGPR64,
@@ -595,6 +598,7 @@ def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
                                     FeatureFuseCmpCSel,
+                                    FeatureFuseFCmpFCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureZCRegMoveGPR64,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 4255ebd4cc557..4e23e72672cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -245,6 +245,7 @@ LLVMInitializeAArch64Target() {
   initializeGlobalISel(PR);
   initializeAArch64A53Fix835769Pass(PR);
   initializeAArch64A57FPLoadBalancingPass(PR);
+  initializeAArch64CodeLayoutOptPass(PR);
   initializeAArch64AdvSIMDScalarPass(PR);
   initializeAArch64AsmPrinterPass(PR);
   initializeAArch64BranchTargetsPass(PR);
@@ -869,6 +870,8 @@ void AArch64PassConfig::addPostRegAlloc() {
 }
 
 void AArch64PassConfig::addPreSched2() {
+  // Apply code layout optimizations for instruction pairs.
+  addPass(createAArch64CodeLayoutOptPass());
   // Lower homogeneous frame instructions
   if (EnableHomogeneousPrologEpilog)
     addPass(createAArch64LowerHomogeneousPrologEpilogPass());
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 2fe554217c1ba..1abff4f5add29 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -42,6 +42,7 @@ add_llvm_target(AArch64CodeGen
   GISel/AArch64PostSelectOptimize.cpp
   GISel/AArch64RegisterBankInfo.cpp
   AArch64A57FPLoadBalancing.cpp
+  AArch64CodeLayoutOpt.cpp
   AArch64AdvSIMDScalarPass.cpp
   AArch64Arm64ECCallLowering.cpp
   AArch64AsmPrinter.cpp
diff --git a/llvm/test/CodeGen/AArch64/code-layout-opt.ll b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
new file mode 100644
index 0000000000000..604522b5b9623
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
@@ -0,0 +1,233 @@
+; NOTE: Test cases for FCMP-FCSEL and CMP/CMN-CSEL code layout optimization
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-apple-darwin -mcpu=apple-m4 -aarch64-code-layout-opt=3 | FileCheck %s
+
+; Test coverage for optimizeForCodeAlignment function:
+; 1. Basic FCMP-FCSEL instruction pair detection and function alignment (single/double precision)
+; 2. Loop block alignment for instruction pairs in loops (simple, nested, multi-block)
+; 3. Multiple instruction pairs in same function (also tests different predicates)
+; 4. FCMP with immediate operand (#0.0) is excluded from optimization
+; 5. Instruction pairs with function calls
+; 6. Negative tests (no false positives)
+; 7. Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment
+; 8. CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded
+
+; Test 1: Basic single-precision FCMP-FCSEL instruction pair
+; CHECK: .globl _test_basic_fcmp_fcsel_single
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_fcmp_fcsel_single:
+define float @test_basic_fcmp_fcsel_single(float %a, float %b, float %c, float %d) {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %sel = select i1 %cmp, float %c, float %d
+  ret float %sel
+}
+
+; Test 2: Basic double-precision FCMP-FCSEL instruction pair
+; CHECK: .globl _test_basic_fcmp_fcsel_double
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_fcmp_fcsel_double:
+define double @test_basic_fcmp_fcsel_double(double %a, double %b, double %c, double %d) {
+entry:
+  %cmp = fcmp oeq double %a, %b
+  %sel = select i1 %cmp, double %c, double %d
+  ret double %sel
+}
+
+; Test 3: FCMP-FCSEL instruction pair in a simple loop
+; CHECK: .globl _test_fcmp_fcsel_in_loop
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_fcmp_fcsel_in_loop:
+define float @test_fcmp_fcsel_in_loop(ptr %arr, i32 %n) {
+entry:
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %acc = phi float [ 0.0, %entry ], [ %new_acc, %loop ]
+  %ptr = getelementptr float, ptr %arr, i32 %i
+  %val = load float, ptr %ptr
+  %cmp = fcmp ogt float %val, %acc
+  %new_acc = select i1 %cmp, float %val, float %acc
+  %i.next = add i32 %i, 1
+  %exit_cond = icmp eq i32 %i.next, %n
+  br i1 %exit_cond, label %exit, label %loop
+exit:
+  ret float %new_acc
+}
+
+; Test 4: Multiple FCMP-FCSEL instruction pairs in same function
+; CHECK: .globl _test_multiple_patterns
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_multiple_patterns:
+define float @test_multiple_patterns(float %a, float %b, float %c, float %d, float %e, float %f) {
+entry:
+  %cmp1 = fcmp oeq float %a, %b
+  %sel1 = select i1 %cmp1, float %c, float %d
+  %cmp2 = fcmp ogt float %sel1, %e
+  %sel2 = select i1 %cmp2, float %sel1, float %f
+  ret float %sel2
+}
+
+; Test 5: FCMP with comparison to zero (immediate) - excluded from optimization
+; FCMP #0.0 uses the ri-form opcode which is not in the detection list
+; CHECK: .globl _test_fcmp_immediate
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_fcmp_immediate:
+define float @test_fcmp_immediate(float %a, float %b) {
+entry:
+  %cmp = fcmp oeq float %a, 0.0
+  %sel = select i1 %cmp, float %a, float %b
+  ret float %sel
+}
+
+; Test 6: Nested loops with FCMP-FCSEL instruction pair
+; CHECK: .globl _test_nested_loops
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_nested_loops:
+define double @test_nested_loops(ptr %arr, i32 %rows, i32 %cols) {
+entry:
+  br label %outer_loop
+outer_loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %outer_loop_latch ]
+  %outer_acc = phi double [ 0.0, %entry ], [ %inner_result, %outer_loop_latch ]
+  br label %inner_loop
+inner_loop:
+  %j = phi i32 [ 0, %outer_loop ], [ %j.next, %inner_loop ]
+  %acc = phi double [ %outer_acc, %outer_loop ], [ %new_acc, %inner_loop ]
+  %offset = mul i32 %i, %cols
+  %idx = add i32 %offset, %j
+  %ptr = getelementptr double, ptr %arr, i32 %idx
+  %val = load double, ptr %ptr
+  %cmp = fcmp ogt double %val, %acc
+  %new_acc = select i1 %cmp, double %val, double %acc
+  %j.next = add i32 %j, 1
+  %inner_exit = icmp eq i32 %j.next, %cols
+  br i1 %inner_exit, label %outer_loop_latch, label %inner_loop
+outer_loop_latch:
+  %inner_result = phi double [ %new_acc, %inner_loop ]
+  %i.next = add i32 %i, 1
+  %outer_exit = icmp eq i32 %i.next, %rows
+  br i1 %outer_exit, label %exit, label %outer_loop
+exit:
+  %result = phi double [ %inner_result, %outer_loop_latch ]
+  ret double %result
+}
+
+; Test 7: Mixed single and double precision in same function
+; CHECK: .globl _test_mixed_precision
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_mixed_precision:
+define float @test_mixed_precision(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp_single = fcmp ogt float %a, %b
+  %sel_single = select i1 %cmp_single, float %a, float %b
+  %cmp_double = fcmp olt double %c, %d
+  %sel_double = select i1 %cmp_double, double %c, double %d
+  %trunc = fptrunc double %sel_double to float
+  %final = fadd float %sel_single, %trunc
+  ret float %final
+}
+
+; Test 8: FCMP-FCSEL instruction pair with a function call present
+; CHECK: .globl _test_with_function_calls
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_with_function_calls:
+declare float @external_func(float)
+define float @test_with_function_calls(float %a, float %b, float %c, float %d) {
+entry:
+  %cmp = fcmp ogt float %a, %b
+  %sel = select i1 %cmp, float %c, float %d
+  %result = call float @external_func(float %sel)
+  ret float %result
+}
+
+; Test 9: Verify no false positives - FCMP without FCSEL
+; CHECK: .globl _test_fcmp_without_fcsel
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_fcmp_without_fcsel:
+define i32 @test_fcmp_without_fcsel(float %a, float %b) {
+entry:
+  %cmp = fcmp ogt float %a, %b
+  %result = zext i1 %cmp to i32
+  ret i32 %result
+}
+
+; Test 10: Verify no false positives - FCSEL without preceding FCMP
+; CHECK: .globl _test_fcsel_without_fcmp
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_fcsel_without_fcmp:
+define float @test_fcsel_without_fcmp(i1 %cond, float %a, float %b) {
+entry:
+  %result = select i1 %cond, float %a, float %b
+  ret float %result
+}
+
+;------------------------------------------------------------------------------
+; CMP/CMN-CSEL tests (bit 1 of -aarch64-code-layout-opt)
+;------------------------------------------------------------------------------
+
+; Test 11: Basic CMP-CSEL instruction pair (integer register comparison)
+; CHECK: .globl _test_basic_cmp_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_cmp_csel:
+define i32 @test_basic_cmp_csel(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  %cmp = icmp eq i32 %a, %b
+  %sel = select i1 %cmp, i32 %c, i32 %d
+  ret i32 %sel
+}
+
+; Test 12: CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization)
+; CHECK: .globl _test_cmp_small_imm_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_cmp_small_imm_csel:
+define i32 @test_cmp_small_imm_csel(i32 %a, i32 %b, i32 %c) {
+entry:
+  %cmp = icmp eq i32 %a, 7
+  %sel = select i1 %cmp, i32 %b, i32 %c
+  ret i32 %sel
+}
+
+; Test 13: CMP-CSEL with immediate > 15 - excluded from optimization
+; CHECK: .globl _test_cmp_large_imm_csel
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_cmp_large_imm_csel:
+define i32 @test_cmp_large_imm_csel(i32 %a, i32 %b, i32 %c) {
+entry:
+  %cmp = icmp eq i32 %a, 100
+  %sel = select i1 %cmp, i32 %b, i32 %c
+  ret i32 %sel
+}
+
+; Test 14: Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination)
+; CHECK: .globl _test_basic_cmn_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_cmn_csel:
+define i32 @test_basic_cmn_csel(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  %sum = add i32 %a, %b
+  %cmp = icmp eq i32 %sum, 0
+  %sel = select i1 %cmp, i32 %c, i32 %d
+  ret i32 %sel
+}
+
+; Test 15: CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies)
+; CHECK: .globl _test_cmn_small_imm_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_cmn_small_imm_csel:
+define i32 @test_cmn_small_imm_csel(i32 %a, i32 %b, i32 %c) {
+entry:
+  %cmp = icmp eq i32 %a, -7
+  %sel = select i1 %cmp, i32 %b, i32 %c
+  ret i32 %sel
+}
+
+; Test 16: CMP without CSEL - no false positive
+; CHECK: .globl _test_cmp_without_csel
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_cmp_without_csel:
+define i32 @test_cmp_without_csel(i32 %a, i32 %b) {
+entry:
+  %cmp = icmp eq i32 %a, %b
+  %result = zext i1 %cmp to i32
+  ret i32 %result
+}

>From 6859fc32e60ce904f45a1c6e10d02b6c3dc403b0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 4 Mar 2026 23:31:21 +0200
Subject: [PATCH 2/4] tests: remove loop-specific tests & rename main function

---
 .../Target/AArch64/AArch64CodeLayoutOpt.cpp   |  8 +-
 llvm/test/CodeGen/AArch64/code-layout-opt.ll  | 93 ++++---------------
 2 files changed, 23 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
index 00810ed2198ad..f20765e372421 100644
--- a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
@@ -77,7 +77,7 @@ class AArch64CodeLayoutOpt : public MachineFunctionPass {
   // Returns true if MBB contains at least one layout-sensitive pattern.
   bool detectLayoutSensitivePattern(MachineBasicBlock *MBB);
 
-  bool optimizeForCodeAlignment(MachineFunction &MF);
+  bool optimizeForCodeLayout(MachineFunction &MF);
 };
 
 } // end anonymous namespace
@@ -108,7 +108,7 @@ bool AArch64CodeLayoutOpt::runOnMachineFunction(MachineFunction &MF) {
       !((Mask & 0x2) && Subtarget->hasFuseCmpCSel()))
     return false;
 
-  return optimizeForCodeAlignment(MF);
+  return optimizeForCodeLayout(MF);
 }
 
 // Returns true if MBB contains at least one layout-sensitive pair.
@@ -199,8 +199,8 @@ bool AArch64CodeLayoutOpt::detectLayoutSensitivePattern(
   return false;
 }
 
-bool AArch64CodeLayoutOpt::optimizeForCodeAlignment(MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << DEBUG_TYPE ": optimizeForCodeAlignment: " << MF.getName()
+bool AArch64CodeLayoutOpt::optimizeForCodeLayout(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE ": optimizeForCodeLayout: " << MF.getName()
                     << "\n");
 
   for (auto &MBB : MF) {
diff --git a/llvm/test/CodeGen/AArch64/code-layout-opt.ll b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
index 604522b5b9623..30215ec2fb059 100644
--- a/llvm/test/CodeGen/AArch64/code-layout-opt.ll
+++ b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
@@ -1,15 +1,14 @@
 ; NOTE: Test cases for FCMP-FCSEL and CMP/CMN-CSEL code layout optimization
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-apple-darwin -mcpu=apple-m4 -aarch64-code-layout-opt=3 | FileCheck %s
 
-; Test coverage for optimizeForCodeAlignment function:
+; Test coverage for optimizeForCodeLayout function:
 ; 1. Basic FCMP-FCSEL instruction pair detection and function alignment (single/double precision)
-; 2. Loop block alignment for instruction pairs in loops (simple, nested, multi-block)
-; 3. Multiple instruction pairs in same function (also tests different predicates)
-; 4. FCMP with immediate operand (#0.0) is excluded from optimization
-; 5. Instruction pairs with function calls
-; 6. Negative tests (no false positives)
-; 7. Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment
-; 8. CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded
+; 2. Multiple instruction pairs in same function (also tests different predicates)
+; 3. FCMP with immediate operand (#0.0) is excluded from optimization
+; 4. Instruction pairs with function calls
+; 5. Negative tests (no false positives)
+; 6. Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment
+; 7. CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded
 
 ; Test 1: Basic single-precision FCMP-FCSEL instruction pair
 ; CHECK: .globl _test_basic_fcmp_fcsel_single
@@ -33,28 +32,7 @@ entry:
   ret double %sel
 }
 
-; Test 3: FCMP-FCSEL instruction pair in a simple loop
-; CHECK: .globl _test_fcmp_fcsel_in_loop
-; CHECK-NEXT: .p2align 6
-; CHECK-LABEL: _test_fcmp_fcsel_in_loop:
-define float @test_fcmp_fcsel_in_loop(ptr %arr, i32 %n) {
-entry:
-  br label %loop
-loop:
-  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
-  %acc = phi float [ 0.0, %entry ], [ %new_acc, %loop ]
-  %ptr = getelementptr float, ptr %arr, i32 %i
-  %val = load float, ptr %ptr
-  %cmp = fcmp ogt float %val, %acc
-  %new_acc = select i1 %cmp, float %val, float %acc
-  %i.next = add i32 %i, 1
-  %exit_cond = icmp eq i32 %i.next, %n
-  br i1 %exit_cond, label %exit, label %loop
-exit:
-  ret float %new_acc
-}
-
-; Test 4: Multiple FCMP-FCSEL instruction pairs in same function
+; Test 3: Multiple FCMP-FCSEL instruction pairs in same function
 ; CHECK: .globl _test_multiple_patterns
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_multiple_patterns:
@@ -67,7 +45,7 @@ entry:
   ret float %sel2
 }
 
-; Test 5: FCMP with comparison to zero (immediate) - excluded from optimization
+; Test 4: FCMP with comparison to zero (immediate) - excluded from optimization
 ; FCMP #0.0 uses the ri-form opcode which is not in the detection list
 ; CHECK: .globl _test_fcmp_immediate
 ; CHECK-NEXT: .p2align 2
@@ -79,40 +57,7 @@ entry:
   ret float %sel
 }
 
-; Test 6: Nested loops with FCMP-FCSEL instruction pair
-; CHECK: .globl _test_nested_loops
-; CHECK-NEXT: .p2align 6
-; CHECK-LABEL: _test_nested_loops:
-define double @test_nested_loops(ptr %arr, i32 %rows, i32 %cols) {
-entry:
-  br label %outer_loop
-outer_loop:
-  %i = phi i32 [ 0, %entry ], [ %i.next, %outer_loop_latch ]
-  %outer_acc = phi double [ 0.0, %entry ], [ %inner_result, %outer_loop_latch ]
-  br label %inner_loop
-inner_loop:
-  %j = phi i32 [ 0, %outer_loop ], [ %j.next, %inner_loop ]
-  %acc = phi double [ %outer_acc, %outer_loop ], [ %new_acc, %inner_loop ]
-  %offset = mul i32 %i, %cols
-  %idx = add i32 %offset, %j
-  %ptr = getelementptr double, ptr %arr, i32 %idx
-  %val = load double, ptr %ptr
-  %cmp = fcmp ogt double %val, %acc
-  %new_acc = select i1 %cmp, double %val, double %acc
-  %j.next = add i32 %j, 1
-  %inner_exit = icmp eq i32 %j.next, %cols
-  br i1 %inner_exit, label %outer_loop_latch, label %inner_loop
-outer_loop_latch:
-  %inner_result = phi double [ %new_acc, %inner_loop ]
-  %i.next = add i32 %i, 1
-  %outer_exit = icmp eq i32 %i.next, %rows
-  br i1 %outer_exit, label %exit, label %outer_loop
-exit:
-  %result = phi double [ %inner_result, %outer_loop_latch ]
-  ret double %result
-}
-
-; Test 7: Mixed single and double precision in same function
+; Test 5: Mixed single and double precision in same function
 ; CHECK: .globl _test_mixed_precision
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_mixed_precision:
@@ -127,7 +72,7 @@ entry:
   ret float %final
 }
 
-; Test 8: FCMP-FCSEL instruction pair with a function call present
+; Test 6: FCMP-FCSEL instruction pair with a function call present
 ; CHECK: .globl _test_with_function_calls
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_with_function_calls:
@@ -140,7 +85,7 @@ entry:
   ret float %result
 }
 
-; Test 9: Verify no false positives - FCMP without FCSEL
+; Test 7: Verify no false positives - FCMP without FCSEL
 ; CHECK: .globl _test_fcmp_without_fcsel
 ; CHECK-NEXT: .p2align 2
 ; CHECK-LABEL: _test_fcmp_without_fcsel:
@@ -151,7 +96,7 @@ entry:
   ret i32 %result
 }
 
-; Test 10: Verify no false positives - FCSEL without preceding FCMP
+; Test 8: Verify no false positives - FCSEL without preceding FCMP
 ; CHECK: .globl _test_fcsel_without_fcmp
 ; CHECK-NEXT: .p2align 2
 ; CHECK-LABEL: _test_fcsel_without_fcmp:
@@ -165,7 +110,7 @@ entry:
 ; CMP/CMN-CSEL tests (bit 1 of -aarch64-code-layout-opt)
 ;------------------------------------------------------------------------------
 
-; Test 11: Basic CMP-CSEL instruction pair (integer register comparison)
+; Test 9: Basic CMP-CSEL instruction pair (integer register comparison)
 ; CHECK: .globl _test_basic_cmp_csel
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_basic_cmp_csel:
@@ -176,7 +121,7 @@ entry:
   ret i32 %sel
 }
 
-; Test 12: CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization)
+; Test 10: CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization)
 ; CHECK: .globl _test_cmp_small_imm_csel
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_cmp_small_imm_csel:
@@ -187,7 +132,7 @@ entry:
   ret i32 %sel
 }
 
-; Test 13: CMP-CSEL with immediate > 15 - excluded from optimization
+; Test 11: CMP-CSEL with immediate > 15 - excluded from optimization
 ; CHECK: .globl _test_cmp_large_imm_csel
 ; CHECK-NEXT: .p2align 2
 ; CHECK-LABEL: _test_cmp_large_imm_csel:
@@ -198,7 +143,7 @@ entry:
   ret i32 %sel
 }
 
-; Test 14: Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination)
+; Test 12: Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination)
 ; CHECK: .globl _test_basic_cmn_csel
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_basic_cmn_csel:
@@ -210,7 +155,7 @@ entry:
   ret i32 %sel
 }
 
-; Test 15: CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies)
+; Test 13: CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies)
 ; CHECK: .globl _test_cmn_small_imm_csel
 ; CHECK-NEXT: .p2align 6
 ; CHECK-LABEL: _test_cmn_small_imm_csel:
@@ -221,7 +166,7 @@ entry:
   ret i32 %sel
 }
 
-; Test 16: CMP without CSEL - no false positive
+; Test 14: CMP without CSEL - no false positive
 ; CHECK: .globl _test_cmp_without_csel
 ; CHECK-NEXT: .p2align 2
 ; CHECK-LABEL: _test_cmp_without_csel:

>From f09c66307aa8e55a021981f37529a511a1d32b35 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 4 Mar 2026 23:34:24 +0200
Subject: [PATCH 3/4] fix: AnalysisUsage setPreservesAll()

---
 llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
index f20765e372421..703c595887db1 100644
--- a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
@@ -88,7 +88,7 @@ INITIALIZE_PASS(AArch64CodeLayoutOpt, "aarch64-code-layout-opt",
                 AARCH64_CODE_LAYOUT_OPT_NAME, false, false)
 
 void AArch64CodeLayoutOpt::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesCFG();
+  AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 

>From 02b6739046206a4e15011df70a32381857c14e65 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 11 Mar 2026 11:50:44 +0200
Subject: [PATCH 4/4] Revert changes introducing FeatureFuseFCmpFCSel

---
 llvm/lib/Target/AArch64/AArch64Features.td   | 4 ----
 llvm/lib/Target/AArch64/AArch64Processors.td | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 7e3c8097ef830..faee640a910d0 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -792,10 +792,6 @@ def FeatureFuseCmpCSet : SubtargetFeature<
     "fuse-cset", "HasFuseCmpCSet", "true",
     "CPU can fuse CMP and CSET operations">;
 
-def FeatureFuseFCmpFCSel : SubtargetFeature<
-    "fuse-fcsel", "HasFuseFCmpFCSel", "true",
-    "CPU can fuse FCMP and FCSEL operations">;
-
 def FeatureFuseCryptoEOR : SubtargetFeature<
     "fuse-crypto-eor", "HasFuseCryptoEOR", "true",
     "CPU fuses AES/PMULL and EOR operations">;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 94617ca1ad244..7791e556f3b13 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -527,7 +527,6 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
                                     FeatureFuseCmpCSel,
-                                    FeatureFuseFCmpFCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
@@ -551,7 +550,6 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
                                     FeatureFuseCmpCSel,
-                                    FeatureFuseFCmpFCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
@@ -575,7 +573,6 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      FeatureFuseAES,
                                      FeatureFuseArithmeticLogic,
                                      FeatureFuseCmpCSel,
-                                     FeatureFuseFCmpFCSel,
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseLiterals,
                                      FeatureZCRegMoveGPR64,
@@ -598,7 +595,6 @@ def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
                                     FeatureFuseCmpCSel,
-                                    FeatureFuseFCmpFCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureZCRegMoveGPR64,



More information about the llvm-commits mailing list