[llvm] [AArch64] New pass for code layout optimizations. (PR #184434)
Ahmad Yasin via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 11 02:50:56 PDT 2026
https://github.com/ayasin-a updated https://github.com/llvm/llvm-project/pull/184434
>From c06142a4b8027a315975b134a119a17e9b18a6ef Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 3 Mar 2026 22:32:54 +0200
Subject: [PATCH 1/4] New AArch64CodeLayoutOpt pass for code layout
optimizations.
For simplicity, this initial version induces function alignment when a pair is
detected for two cases:
I. FCMP-FCSEL
II. CMP/CMN-CSEL, 32-bit only
Beyond performance improvement, this optimization helps reduce noise due to code layout,
thus stabilizing performance. For example, knock-on effects on a "sensitive function" won't
be triggered by codegen changes outside it.
Each case can be enabled/disabled individually through -aarch64-code-layout-opt bit-mask.
---
llvm/lib/Target/AArch64/AArch64.h | 2 +
.../Target/AArch64/AArch64CodeLayoutOpt.cpp | 225 +++++++++++++++++
llvm/lib/Target/AArch64/AArch64Features.td | 4 +
llvm/lib/Target/AArch64/AArch64Processors.td | 4 +
.../Target/AArch64/AArch64TargetMachine.cpp | 3 +
llvm/lib/Target/AArch64/CMakeLists.txt | 1 +
llvm/test/CodeGen/AArch64/code-layout-opt.ll | 233 ++++++++++++++++++
7 files changed, 472 insertions(+)
create mode 100644 llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
create mode 100644 llvm/test/CodeGen/AArch64/code-layout-opt.ll
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 40983714ddf1d..4683104ab7633 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -55,6 +55,7 @@ FunctionPass *createFalkorHWPFFixPass();
FunctionPass *createFalkorMarkStridedAccessesPass();
FunctionPass *createAArch64PointerAuthPass();
FunctionPass *createAArch64BranchTargetsPass();
+FunctionPass *createAArch64CodeLayoutOptPass();
FunctionPass *createAArch64MIPeepholeOptPass();
FunctionPass *createAArch64PostCoalescerPass();
@@ -96,6 +97,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
void initializeAArch64ExpandPseudoPass(PassRegistry &);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64CodeLayoutOptPass(PassRegistry &);
void initializeAArch64MIPeepholeOptPass(PassRegistry &);
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PostCoalescerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
new file mode 100644
index 0000000000000..00810ed2198ad
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
@@ -0,0 +1,225 @@
+//===-- AArch64CodeLayoutOpt.cpp - Code Layout Optimizations --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass runs after instruction scheduling and employs code layout
+// optimizations for certain patterns.
+//
+// Option -aarch64-code-layout-opt is a bitmask enable for instruction pairs of:
+// Bit 0 (0x1): Enable FCMP-FCSEL code layout optimization
+// Bit 1 (0x2): Enable CMP/CMN-CSEL code layout optimization
+//
+// The initial implementation induces function alignment to help optimize
+// code layout for the detected patterns.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-code-layout-opt"
+#define AARCH64_CODE_LAYOUT_OPT_NAME "AArch64 Code Layout Optimization"
+
+// Bitmask option for code alignment optimization:
+// Bit 0 (0x1): Enable FCMP-FCSEL code layout optimization (requires
+// hasFuseFCmpFCSel)
+// Bit 1 (0x2): Enable CMP-CSEL code layout optimization,
+// 32-bit only (requires hasFuseCmpCSel)
+static cl::opt<unsigned> EnableCodeAlignment(
+    "aarch64-code-layout-opt", cl::Hidden,
+    cl::desc("Enable code alignment optimization for instruction pairs "
+             "(bitmask: bit 0 = FCMP-FCSEL, bit 1 = CMP-CSEL)"),
+    // Defaults to 0, i.e. the pass is a no-op unless explicitly enabled.
+    cl::init(0));
+
+// Alignment (in bytes) applied to functions containing a detected pair.
+// Validated eagerly via cl::callback so a bad value fails at option parsing
+// rather than asserting later inside Align's constructor (0 is also rejected,
+// since isPowerOf2_32(0) is false).
+static cl::opt<unsigned> FunctionAlignBytes(
+    "aarch64-code-layout-opt-align-functions", cl::Hidden,
+    cl::desc("Function alignment in bytes for code layout optimization "
+             "(must be a power of 2)"),
+    cl::init(64), cl::callback([](const unsigned &Val) {
+      if (!isPowerOf2_32(Val))
+        report_fatal_error(
+            "aarch64-code-layout-opt-align-functions must be a power of 2");
+    }));
+
+// Pass statistics, reported with -stats.
+STATISTIC(NumFunctionsAligned,
+          "Number of functions aligned (to 64 bytes by default)");
+STATISTIC(NumFcmpFcselPairsDetected,
+          "Number of FCMP-FCSEL pairs detected for alignment");
+STATISTIC(NumCmpCselPairsDetected,
+          "Number of CMP/CMN-CSEL pairs detected for alignment");
+
+namespace {
+
+/// Late machine-function pass that applies code-layout optimizations for
+/// the instruction pairs selected via -aarch64-code-layout-opt (see the
+/// file header for the bitmask meaning).
+class AArch64CodeLayoutOpt : public MachineFunctionPass {
+public:
+  static char ID;
+  AArch64CodeLayoutOpt() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return AARCH64_CODE_LAYOUT_OPT_NAME;
+  }
+
+private:
+  // NOTE(review): TII is assigned in runOnMachineFunction but never read
+  // anywhere in this file — consider dropping the member.
+  const AArch64InstrInfo *TII = nullptr;
+
+  // Returns true if MBB contains at least one layout-sensitive pattern.
+  bool detectLayoutSensitivePattern(MachineBasicBlock *MBB);
+
+  // Raises the function alignment when a pattern is found; returns true if
+  // the function was modified.
+  bool optimizeForCodeAlignment(MachineFunction &MF);
+};
+
+} // end anonymous namespace
+
+char AArch64CodeLayoutOpt::ID = 0;
+
+INITIALIZE_PASS(AArch64CodeLayoutOpt, "aarch64-code-layout-opt",
+                AARCH64_CODE_LAYOUT_OPT_NAME, false, false)
+
+// The pass only inspects instructions and (possibly) bumps the function
+// alignment; it never adds, removes, or moves instructions, so the CFG is
+// preserved.
+void AArch64CodeLayoutOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createAArch64CodeLayoutOptPass() {
+  return new AArch64CodeLayoutOpt();
+}
+
+// Entry point. Bails out cheaply when the option mask is zero, or when none
+// of the enabled bits is backed by the matching fusion subtarget feature.
+bool AArch64CodeLayoutOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (!EnableCodeAlignment)
+    return false;
+
+  const auto *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+  TII = Subtarget->getInstrInfo();
+
+  // NOTE(review): PATCH 4/4 of this series removes FeatureFuseFCmpFCSel from
+  // the .td files while this call to hasFuseFCmpFCSel() remains — confirm the
+  // series still builds when applied as a whole.
+  const unsigned Mask = EnableCodeAlignment;
+  if (!((Mask & 0x1) && Subtarget->hasFuseFCmpFCSel()) &&
+      !((Mask & 0x2) && Subtarget->hasFuseCmpCSel()))
+    return false;
+
+  return optimizeForCodeAlignment(MF);
+}
+
+// Returns true if MBB contains at least one layout-sensitive pair.
+// A pair is: a qualifying lead instruction immediately followed by its
+// consumer (FCMP→FCSEL or CMP/CMN→CSEL), with no intervening instructions.
+// Debug and meta instructions are skipped: they emit no code, so they cannot
+// split a pair in the final layout.
+// NOTE(review): detection keys off the raw EnableCodeAlignment bits rather
+// than the per-bit subtarget features checked in runOnMachineFunction; with
+// both bits set but only one fusion feature present, pairs of the
+// unsupported kind still trigger alignment — confirm this is intended.
+bool AArch64CodeLayoutOpt::detectLayoutSensitivePattern(
+    MachineBasicBlock *MBB) {
+  // Set when the previous code-emitting instruction was a qualifying lead;
+  // these are only ever tested for null, never dereferenced.
+  MachineInstr *PendingFCMPInstr = nullptr;
+  MachineInstr *PendingCMPInstr = nullptr;
+
+  for (auto &MI : instructionsWithoutDebug(MBB->begin(), MBB->end())) {
+    if (MI.isMetaInstruction())
+      continue;
+
+    unsigned Opc = MI.getOpcode();
+
+    // --- FCMP-FCSEL detection (bit 0) ---
+    if (EnableCodeAlignment & 0x1) {
+      switch (Opc) {
+      // Register-register FCMP/FCMPE in all precisions; the immediate
+      // (compare-with-zero) forms intentionally do not qualify.
+      case AArch64::FCMPSrr:
+      case AArch64::FCMPDrr:
+      case AArch64::FCMPESrr:
+      case AArch64::FCMPEDrr:
+      case AArch64::FCMPHrr:
+      case AArch64::FCMPEHrr:
+        PendingFCMPInstr = &MI;
+        break;
+      case AArch64::FCSELSrrr:
+      case AArch64::FCSELDrrr:
+      case AArch64::FCSELHrrr:
+        if (PendingFCMPInstr) {
+          ++NumFcmpFcselPairsDetected;
+          return true;
+        }
+        PendingFCMPInstr = nullptr;
+        break;
+      default:
+        // Any other instruction breaks adjacency.
+        PendingFCMPInstr = nullptr;
+        break;
+      }
+    }
+
+    // --- CMP/CMN-CSEL detection (bit 1) ---
+    // CMP is encoded as SUBS with WZR destination (32-bit only).
+    // CMN is encoded as ADDS with WZR destination (32-bit only).
+    // Only simple variants (no shifted/extended reg) qualify.
+    if (EnableCodeAlignment & 0x2) {
+      bool IsCMP = false;
+      switch (Opc) {
+      case AArch64::SUBSWrr:
+      case AArch64::ADDSWrr:
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr);
+        break;
+      case AArch64::SUBSWri:
+      case AArch64::ADDSWri:
+        // Only CMP/CMN #imm (no LSL #12 shift) with small immediates (<=15)
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) &&
+                MI.getOperand(3).getImm() == 0 &&
+                MI.getOperand(2).getImm() <= 15;
+        break;
+      case AArch64::SUBSWrs:
+      case AArch64::ADDSWrs:
+        // Shifted-register form qualifies only when the shift is a no-op.
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) &&
+                !AArch64InstrInfo::hasShiftedReg(MI);
+        break;
+      case AArch64::SUBSWrx:
+        // Extended-register form qualifies only without an actual extend.
+        IsCMP = MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) &&
+                !AArch64InstrInfo::hasExtendedReg(MI);
+        break;
+      case AArch64::CSELWr:
+        if (PendingCMPInstr) {
+          ++NumCmpCselPairsDetected;
+          return true;
+        }
+        PendingCMPInstr = nullptr;
+        break;
+      default:
+        break;
+      }
+
+      // Record a newly seen lead, or break adjacency on anything else.
+      // (CSELWr already reset the pending state in its case above.)
+      if (IsCMP)
+        PendingCMPInstr = &MI;
+      else if (Opc != AArch64::CSELWr)
+        PendingCMPInstr = nullptr;
+    }
+  }
+
+  return false;
+}
+
+// Scans the function block-by-block for a layout-sensitive pair. On the
+// first hit, raises the function alignment to FunctionAlignBytes (64 by
+// default) unless it is already at least that aligned. Returns true iff the
+// alignment was changed.
+bool AArch64CodeLayoutOpt::optimizeForCodeAlignment(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE ": optimizeForCodeAlignment: " << MF.getName()
+                    << "\n");
+
+  for (auto &MBB : MF) {
+    if (!detectLayoutSensitivePattern(&MBB))
+      continue;
+
+    // The first block with a pattern decides the outcome; no further scan.
+    // NOTE(review): this alignment check could be hoisted above the loop to
+    // skip the per-block detection entirely for already-aligned functions.
+    if (MF.getAlignment() >= Align(FunctionAlignBytes)) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE ": Function " << MF.getName()
+                        << " already has sufficient alignment\n");
+      return false;
+    }
+
+    MF.setAlignment(Align(FunctionAlignBytes));
+    ++NumFunctionsAligned;
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE ": Set " << FunctionAlignBytes
+                      << "-byte alignment for function " << MF.getName()
+                      << "\n");
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index faee640a910d0..7e3c8097ef830 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -792,6 +792,10 @@ def FeatureFuseCmpCSet : SubtargetFeature<
"fuse-cset", "HasFuseCmpCSet", "true",
"CPU can fuse CMP and CSET operations">;
+def FeatureFuseFCmpFCSel : SubtargetFeature<
+ "fuse-fcsel", "HasFuseFCmpFCSel", "true",
+ "CPU can fuse FCMP and FCSEL operations">;
+
def FeatureFuseCryptoEOR : SubtargetFeature<
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
"CPU fuses AES/PMULL and EOR operations">;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 7791e556f3b13..94617ca1ad244 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -527,6 +527,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
+ FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
@@ -550,6 +551,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
+ FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
@@ -573,6 +575,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
+ FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
@@ -595,6 +598,7 @@ def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
+ FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 4255ebd4cc557..4e23e72672cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -245,6 +245,7 @@ LLVMInitializeAArch64Target() {
initializeGlobalISel(PR);
initializeAArch64A53Fix835769Pass(PR);
initializeAArch64A57FPLoadBalancingPass(PR);
+ initializeAArch64CodeLayoutOptPass(PR);
initializeAArch64AdvSIMDScalarPass(PR);
initializeAArch64AsmPrinterPass(PR);
initializeAArch64BranchTargetsPass(PR);
@@ -869,6 +870,8 @@ void AArch64PassConfig::addPostRegAlloc() {
}
void AArch64PassConfig::addPreSched2() {
+ // Apply code layout optimizations for instruction pairs.
+ addPass(createAArch64CodeLayoutOptPass());
// Lower homogeneous frame instructions
if (EnableHomogeneousPrologEpilog)
addPass(createAArch64LowerHomogeneousPrologEpilogPass());
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 2fe554217c1ba..1abff4f5add29 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -42,6 +42,7 @@ add_llvm_target(AArch64CodeGen
GISel/AArch64PostSelectOptimize.cpp
GISel/AArch64RegisterBankInfo.cpp
AArch64A57FPLoadBalancing.cpp
+ AArch64CodeLayoutOpt.cpp
AArch64AdvSIMDScalarPass.cpp
AArch64Arm64ECCallLowering.cpp
AArch64AsmPrinter.cpp
diff --git a/llvm/test/CodeGen/AArch64/code-layout-opt.ll b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
new file mode 100644
index 0000000000000..604522b5b9623
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
@@ -0,0 +1,233 @@
+; NOTE: Test cases for FCMP-FCSEL and CMP/CMN-CSEL code layout optimization
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-apple-darwin -mcpu=apple-m4 -aarch64-code-layout-opt=3 | FileCheck %s
+
+; Test coverage for optimizeForCodeAlignment function:
+; 1. Basic FCMP-FCSEL instruction pair detection and function alignment (single/double precision)
+; 2. Loop block alignment for instruction pairs in loops (simple, nested, multi-block)
+; 3. Multiple instruction pairs in same function (also tests different predicates)
+; 4. FCMP with immediate operand (#0.0) is excluded from optimization
+; 5. Instruction pairs with function calls
+; 6. Negative tests (no false positives)
+; 7. Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment
+; 8. CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded
+
+; Test 1: Basic single-precision FCMP-FCSEL instruction pair
+; CHECK: .globl _test_basic_fcmp_fcsel_single
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_fcmp_fcsel_single:
+define float @test_basic_fcmp_fcsel_single(float %a, float %b, float %c, float %d) {
+entry:
+ %cmp = fcmp oeq float %a, %b
+ %sel = select i1 %cmp, float %c, float %d
+ ret float %sel
+}
+
+; Test 2: Basic double-precision FCMP-FCSEL instruction pair
+; CHECK: .globl _test_basic_fcmp_fcsel_double
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_fcmp_fcsel_double:
+define double @test_basic_fcmp_fcsel_double(double %a, double %b, double %c, double %d) {
+entry:
+ %cmp = fcmp oeq double %a, %b
+ %sel = select i1 %cmp, double %c, double %d
+ ret double %sel
+}
+
+; Test 3: FCMP-FCSEL instruction pair in a simple loop
+; CHECK: .globl _test_fcmp_fcsel_in_loop
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_fcmp_fcsel_in_loop:
+define float @test_fcmp_fcsel_in_loop(ptr %arr, i32 %n) {
+entry:
+ br label %loop
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ %acc = phi float [ 0.0, %entry ], [ %new_acc, %loop ]
+ %ptr = getelementptr float, ptr %arr, i32 %i
+ %val = load float, ptr %ptr
+ %cmp = fcmp ogt float %val, %acc
+ %new_acc = select i1 %cmp, float %val, float %acc
+ %i.next = add i32 %i, 1
+ %exit_cond = icmp eq i32 %i.next, %n
+ br i1 %exit_cond, label %exit, label %loop
+exit:
+ ret float %new_acc
+}
+
+; Test 4: Multiple FCMP-FCSEL instruction pairs in same function
+; CHECK: .globl _test_multiple_patterns
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_multiple_patterns:
+define float @test_multiple_patterns(float %a, float %b, float %c, float %d, float %e, float %f) {
+entry:
+ %cmp1 = fcmp oeq float %a, %b
+ %sel1 = select i1 %cmp1, float %c, float %d
+ %cmp2 = fcmp ogt float %sel1, %e
+ %sel2 = select i1 %cmp2, float %sel1, float %f
+ ret float %sel2
+}
+
+; Test 5: FCMP with comparison to zero (immediate) - excluded from optimization
+; FCMP #0.0 uses the ri-form opcode which is not in the detection list
+; CHECK: .globl _test_fcmp_immediate
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_fcmp_immediate:
+define float @test_fcmp_immediate(float %a, float %b) {
+entry:
+ %cmp = fcmp oeq float %a, 0.0
+ %sel = select i1 %cmp, float %a, float %b
+ ret float %sel
+}
+
+; Test 6: Nested loops with FCMP-FCSEL instruction pair
+; CHECK: .globl _test_nested_loops
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_nested_loops:
+define double @test_nested_loops(ptr %arr, i32 %rows, i32 %cols) {
+entry:
+ br label %outer_loop
+outer_loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %outer_loop_latch ]
+ %outer_acc = phi double [ 0.0, %entry ], [ %inner_result, %outer_loop_latch ]
+ br label %inner_loop
+inner_loop:
+ %j = phi i32 [ 0, %outer_loop ], [ %j.next, %inner_loop ]
+ %acc = phi double [ %outer_acc, %outer_loop ], [ %new_acc, %inner_loop ]
+ %offset = mul i32 %i, %cols
+ %idx = add i32 %offset, %j
+ %ptr = getelementptr double, ptr %arr, i32 %idx
+ %val = load double, ptr %ptr
+ %cmp = fcmp ogt double %val, %acc
+ %new_acc = select i1 %cmp, double %val, double %acc
+ %j.next = add i32 %j, 1
+ %inner_exit = icmp eq i32 %j.next, %cols
+ br i1 %inner_exit, label %outer_loop_latch, label %inner_loop
+outer_loop_latch:
+ %inner_result = phi double [ %new_acc, %inner_loop ]
+ %i.next = add i32 %i, 1
+ %outer_exit = icmp eq i32 %i.next, %rows
+ br i1 %outer_exit, label %exit, label %outer_loop
+exit:
+ %result = phi double [ %inner_result, %outer_loop_latch ]
+ ret double %result
+}
+
+; Test 7: Mixed single and double precision in same function
+; CHECK: .globl _test_mixed_precision
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_mixed_precision:
+define float @test_mixed_precision(float %a, float %b, double %c, double %d) {
+entry:
+ %cmp_single = fcmp ogt float %a, %b
+ %sel_single = select i1 %cmp_single, float %a, float %b
+ %cmp_double = fcmp olt double %c, %d
+ %sel_double = select i1 %cmp_double, double %c, double %d
+ %trunc = fptrunc double %sel_double to float
+ %final = fadd float %sel_single, %trunc
+ ret float %final
+}
+
+; Test 8: FCMP-FCSEL instruction pair with a function call present
+; CHECK: .globl _test_with_function_calls
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_with_function_calls:
+declare float @external_func(float)
+define float @test_with_function_calls(float %a, float %b, float %c, float %d) {
+entry:
+ %cmp = fcmp ogt float %a, %b
+ %sel = select i1 %cmp, float %c, float %d
+ %result = call float @external_func(float %sel)
+ ret float %result
+}
+
+; Test 9: Verify no false positives - FCMP without FCSEL
+; CHECK: .globl _test_fcmp_without_fcsel
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_fcmp_without_fcsel:
+define i32 @test_fcmp_without_fcsel(float %a, float %b) {
+entry:
+ %cmp = fcmp ogt float %a, %b
+ %result = zext i1 %cmp to i32
+ ret i32 %result
+}
+
+; Test 10: Verify no false positives - FCSEL without preceding FCMP
+; CHECK: .globl _test_fcsel_without_fcmp
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_fcsel_without_fcmp:
+define float @test_fcsel_without_fcmp(i1 %cond, float %a, float %b) {
+entry:
+ %result = select i1 %cond, float %a, float %b
+ ret float %result
+}
+
+;------------------------------------------------------------------------------
+; CMP/CMN-CSEL tests (bit 1 of -aarch64-code-layout-opt)
+;------------------------------------------------------------------------------
+
+; Test 11: Basic CMP-CSEL instruction pair (integer register comparison)
+; CHECK: .globl _test_basic_cmp_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_cmp_csel:
+define i32 @test_basic_cmp_csel(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sel = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %sel
+}
+
+; Test 12: CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization)
+; CHECK: .globl _test_cmp_small_imm_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_cmp_small_imm_csel:
+define i32 @test_cmp_small_imm_csel(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cmp = icmp eq i32 %a, 7
+ %sel = select i1 %cmp, i32 %b, i32 %c
+ ret i32 %sel
+}
+
+; Test 13: CMP-CSEL with immediate > 15 - excluded from optimization
+; CHECK: .globl _test_cmp_large_imm_csel
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_cmp_large_imm_csel:
+define i32 @test_cmp_large_imm_csel(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cmp = icmp eq i32 %a, 100
+ %sel = select i1 %cmp, i32 %b, i32 %c
+ ret i32 %sel
+}
+
+; Test 14: Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination)
+; CHECK: .globl _test_basic_cmn_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_basic_cmn_csel:
+define i32 @test_basic_cmn_csel(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ %sum = add i32 %a, %b
+ %cmp = icmp eq i32 %sum, 0
+ %sel = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %sel
+}
+
+; Test 15: CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies)
+; CHECK: .globl _test_cmn_small_imm_csel
+; CHECK-NEXT: .p2align 6
+; CHECK-LABEL: _test_cmn_small_imm_csel:
+define i32 @test_cmn_small_imm_csel(i32 %a, i32 %b, i32 %c) {
+entry:
+ %cmp = icmp eq i32 %a, -7
+ %sel = select i1 %cmp, i32 %b, i32 %c
+ ret i32 %sel
+}
+
+; Test 16: CMP without CSEL - no false positive
+; CHECK: .globl _test_cmp_without_csel
+; CHECK-NEXT: .p2align 2
+; CHECK-LABEL: _test_cmp_without_csel:
+define i32 @test_cmp_without_csel(i32 %a, i32 %b) {
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %result = zext i1 %cmp to i32
+ ret i32 %result
+}
>From 6859fc32e60ce904f45a1c6e10d02b6c3dc403b0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 4 Mar 2026 23:31:21 +0200
Subject: [PATCH 2/4] tests: remove loop-specific tests & rename main function
---
.../Target/AArch64/AArch64CodeLayoutOpt.cpp | 8 +-
llvm/test/CodeGen/AArch64/code-layout-opt.ll | 93 ++++---------------
2 files changed, 23 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
index 00810ed2198ad..f20765e372421 100644
--- a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
@@ -77,7 +77,7 @@ class AArch64CodeLayoutOpt : public MachineFunctionPass {
// Returns true if MBB contains at least one layout-sensitive pattern.
bool detectLayoutSensitivePattern(MachineBasicBlock *MBB);
- bool optimizeForCodeAlignment(MachineFunction &MF);
+ bool optimizeForCodeLayout(MachineFunction &MF);
};
} // end anonymous namespace
@@ -108,7 +108,7 @@ bool AArch64CodeLayoutOpt::runOnMachineFunction(MachineFunction &MF) {
!((Mask & 0x2) && Subtarget->hasFuseCmpCSel()))
return false;
- return optimizeForCodeAlignment(MF);
+ return optimizeForCodeLayout(MF);
}
// Returns true if MBB contains at least one layout-sensitive pair.
@@ -199,8 +199,8 @@ bool AArch64CodeLayoutOpt::detectLayoutSensitivePattern(
return false;
}
-bool AArch64CodeLayoutOpt::optimizeForCodeAlignment(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE ": optimizeForCodeAlignment: " << MF.getName()
+bool AArch64CodeLayoutOpt::optimizeForCodeLayout(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE ": optimizeForCodeLayout: " << MF.getName()
<< "\n");
for (auto &MBB : MF) {
diff --git a/llvm/test/CodeGen/AArch64/code-layout-opt.ll b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
index 604522b5b9623..30215ec2fb059 100644
--- a/llvm/test/CodeGen/AArch64/code-layout-opt.ll
+++ b/llvm/test/CodeGen/AArch64/code-layout-opt.ll
@@ -1,15 +1,14 @@
; NOTE: Test cases for FCMP-FCSEL and CMP/CMN-CSEL code layout optimization
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-apple-darwin -mcpu=apple-m4 -aarch64-code-layout-opt=3 | FileCheck %s
-; Test coverage for optimizeForCodeAlignment function:
+; Test coverage for optimizeForCodeLayout function:
; 1. Basic FCMP-FCSEL instruction pair detection and function alignment (single/double precision)
-; 2. Loop block alignment for instruction pairs in loops (simple, nested, multi-block)
-; 3. Multiple instruction pairs in same function (also tests different predicates)
-; 4. FCMP with immediate operand (#0.0) is excluded from optimization
-; 5. Instruction pairs with function calls
-; 6. Negative tests (no false positives)
-; 7. Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment
-; 8. CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded
+; 2. Multiple instruction pairs in same function (also tests different predicates)
+; 3. FCMP with immediate operand (#0.0) is excluded from optimization
+; 4. Instruction pairs with function calls
+; 5. Negative tests (no false positives)
+; 6. Basic CMP-CSEL and CMN-CSEL instruction pair detection and function alignment
+; 7. CMP/CMN with immediate <=15 qualifies; immediate >15 is excluded
; Test 1: Basic single-precision FCMP-FCSEL instruction pair
; CHECK: .globl _test_basic_fcmp_fcsel_single
@@ -33,28 +32,7 @@ entry:
ret double %sel
}
-; Test 3: FCMP-FCSEL instruction pair in a simple loop
-; CHECK: .globl _test_fcmp_fcsel_in_loop
-; CHECK-NEXT: .p2align 6
-; CHECK-LABEL: _test_fcmp_fcsel_in_loop:
-define float @test_fcmp_fcsel_in_loop(ptr %arr, i32 %n) {
-entry:
- br label %loop
-loop:
- %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
- %acc = phi float [ 0.0, %entry ], [ %new_acc, %loop ]
- %ptr = getelementptr float, ptr %arr, i32 %i
- %val = load float, ptr %ptr
- %cmp = fcmp ogt float %val, %acc
- %new_acc = select i1 %cmp, float %val, float %acc
- %i.next = add i32 %i, 1
- %exit_cond = icmp eq i32 %i.next, %n
- br i1 %exit_cond, label %exit, label %loop
-exit:
- ret float %new_acc
-}
-
-; Test 4: Multiple FCMP-FCSEL instruction pairs in same function
+; Test 3: Multiple FCMP-FCSEL instruction pairs in same function
; CHECK: .globl _test_multiple_patterns
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_multiple_patterns:
@@ -67,7 +45,7 @@ entry:
ret float %sel2
}
-; Test 5: FCMP with comparison to zero (immediate) - excluded from optimization
+; Test 4: FCMP with comparison to zero (immediate) - excluded from optimization
; FCMP #0.0 uses the ri-form opcode which is not in the detection list
; CHECK: .globl _test_fcmp_immediate
; CHECK-NEXT: .p2align 2
@@ -79,40 +57,7 @@ entry:
ret float %sel
}
-; Test 6: Nested loops with FCMP-FCSEL instruction pair
-; CHECK: .globl _test_nested_loops
-; CHECK-NEXT: .p2align 6
-; CHECK-LABEL: _test_nested_loops:
-define double @test_nested_loops(ptr %arr, i32 %rows, i32 %cols) {
-entry:
- br label %outer_loop
-outer_loop:
- %i = phi i32 [ 0, %entry ], [ %i.next, %outer_loop_latch ]
- %outer_acc = phi double [ 0.0, %entry ], [ %inner_result, %outer_loop_latch ]
- br label %inner_loop
-inner_loop:
- %j = phi i32 [ 0, %outer_loop ], [ %j.next, %inner_loop ]
- %acc = phi double [ %outer_acc, %outer_loop ], [ %new_acc, %inner_loop ]
- %offset = mul i32 %i, %cols
- %idx = add i32 %offset, %j
- %ptr = getelementptr double, ptr %arr, i32 %idx
- %val = load double, ptr %ptr
- %cmp = fcmp ogt double %val, %acc
- %new_acc = select i1 %cmp, double %val, double %acc
- %j.next = add i32 %j, 1
- %inner_exit = icmp eq i32 %j.next, %cols
- br i1 %inner_exit, label %outer_loop_latch, label %inner_loop
-outer_loop_latch:
- %inner_result = phi double [ %new_acc, %inner_loop ]
- %i.next = add i32 %i, 1
- %outer_exit = icmp eq i32 %i.next, %rows
- br i1 %outer_exit, label %exit, label %outer_loop
-exit:
- %result = phi double [ %inner_result, %outer_loop_latch ]
- ret double %result
-}
-
-; Test 7: Mixed single and double precision in same function
+; Test 5: Mixed single and double precision in same function
; CHECK: .globl _test_mixed_precision
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_mixed_precision:
@@ -127,7 +72,7 @@ entry:
ret float %final
}
-; Test 8: FCMP-FCSEL instruction pair with a function call present
+; Test 6: FCMP-FCSEL instruction pair with a function call present
; CHECK: .globl _test_with_function_calls
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_with_function_calls:
@@ -140,7 +85,7 @@ entry:
ret float %result
}
-; Test 9: Verify no false positives - FCMP without FCSEL
+; Test 7: Verify no false positives - FCMP without FCSEL
; CHECK: .globl _test_fcmp_without_fcsel
; CHECK-NEXT: .p2align 2
; CHECK-LABEL: _test_fcmp_without_fcsel:
@@ -151,7 +96,7 @@ entry:
ret i32 %result
}
-; Test 10: Verify no false positives - FCSEL without preceding FCMP
+; Test 8: Verify no false positives - FCSEL without preceding FCMP
; CHECK: .globl _test_fcsel_without_fcmp
; CHECK-NEXT: .p2align 2
; CHECK-LABEL: _test_fcsel_without_fcmp:
@@ -165,7 +110,7 @@ entry:
; CMP/CMN-CSEL tests (bit 1 of -aarch64-code-layout-opt)
;------------------------------------------------------------------------------
-; Test 11: Basic CMP-CSEL instruction pair (integer register comparison)
+; Test 9: Basic CMP-CSEL instruction pair (integer register comparison)
; CHECK: .globl _test_basic_cmp_csel
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_basic_cmp_csel:
@@ -176,7 +121,7 @@ entry:
ret i32 %sel
}
-; Test 12: CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization)
+; Test 10: CMP-CSEL instruction pair with small immediate (<=15, qualifies for optimization)
; CHECK: .globl _test_cmp_small_imm_csel
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_cmp_small_imm_csel:
@@ -187,7 +132,7 @@ entry:
ret i32 %sel
}
-; Test 13: CMP-CSEL with immediate > 15 - excluded from optimization
+; Test 11: CMP-CSEL with immediate > 15 - excluded from optimization
; CHECK: .globl _test_cmp_large_imm_csel
; CHECK-NEXT: .p2align 2
; CHECK-LABEL: _test_cmp_large_imm_csel:
@@ -198,7 +143,7 @@ entry:
ret i32 %sel
}
-; Test 14: Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination)
+; Test 12: Basic CMN-CSEL instruction pair (ADDSWrr with WZR destination)
; CHECK: .globl _test_basic_cmn_csel
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_basic_cmn_csel:
@@ -210,7 +155,7 @@ entry:
ret i32 %sel
}
-; Test 15: CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies)
+; Test 13: CMN-CSEL instruction pair with small immediate (ADDSWri imm=7, qualifies)
; CHECK: .globl _test_cmn_small_imm_csel
; CHECK-NEXT: .p2align 6
; CHECK-LABEL: _test_cmn_small_imm_csel:
@@ -221,7 +166,7 @@ entry:
ret i32 %sel
}
-; Test 16: CMP without CSEL - no false positive
+; Test 14: CMP without CSEL - no false positive
; CHECK: .globl _test_cmp_without_csel
; CHECK-NEXT: .p2align 2
; CHECK-LABEL: _test_cmp_without_csel:
>From f09c66307aa8e55a021981f37529a511a1d32b35 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 4 Mar 2026 23:34:24 +0200
Subject: [PATCH 3/4] fix: AnalysisUsage setPreservesAll()
---
llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
index f20765e372421..703c595887db1 100644
--- a/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CodeLayoutOpt.cpp
@@ -88,7 +88,7 @@ INITIALIZE_PASS(AArch64CodeLayoutOpt, "aarch64-code-layout-opt",
AARCH64_CODE_LAYOUT_OPT_NAME, false, false)
void AArch64CodeLayoutOpt::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
+ AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
>From 02b6739046206a4e15011df70a32381857c14e65 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 11 Mar 2026 11:50:44 +0200
Subject: [PATCH 4/4] revert changes introducing FeatureFuseFCmpFCSel
---
llvm/lib/Target/AArch64/AArch64Features.td | 4 ----
llvm/lib/Target/AArch64/AArch64Processors.td | 4 ----
2 files changed, 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 7e3c8097ef830..faee640a910d0 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -792,10 +792,6 @@ def FeatureFuseCmpCSet : SubtargetFeature<
"fuse-cset", "HasFuseCmpCSet", "true",
"CPU can fuse CMP and CSET operations">;
-def FeatureFuseFCmpFCSel : SubtargetFeature<
- "fuse-fcsel", "HasFuseFCmpFCSel", "true",
- "CPU can fuse FCMP and FCSEL operations">;
-
def FeatureFuseCryptoEOR : SubtargetFeature<
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
"CPU fuses AES/PMULL and EOR operations">;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 94617ca1ad244..7791e556f3b13 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -527,7 +527,6 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
- FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
@@ -551,7 +550,6 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
- FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
@@ -575,7 +573,6 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
- FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
@@ -598,7 +595,6 @@ def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCmpCSel,
- FeatureFuseFCmpFCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
More information about the llvm-commits
mailing list