[llvm] [AArch64] Allow splitting bitmasks for EOR/ORR. (PR #150394)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 24 02:42:10 PDT 2025
https://github.com/rj-jesus created https://github.com/llvm/llvm-project/pull/150394
This patch extends #149095 for EOR and ORR.
It uses a simple partition scheme to try to find two suitable disjoint
bitmasks that can be used with EOR/ORR to reconstruct the original mask.
It also restructures the original code to allow reusing its
infrastructure and more easily adding other partition schemes in the
future.
Fixes: #148987.
>From 4f0ec2236a85ac5777aecd7cc9e3e2306aa9e04e Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 23 Jul 2025 08:47:13 -0700
Subject: [PATCH 1/2] Add tests.
---
... aarch64-split-logic-bitmask-immediate.ll} | 178 ++++++++++++++++++
1 file changed, 178 insertions(+)
rename llvm/test/CodeGen/AArch64/{aarch64-split-and-bitmask-immediate.ll => aarch64-split-logic-bitmask-immediate.ll} (70%)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
similarity index 70%
rename from llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
rename to llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
index 113eb14ca4803..3644a78e2b713 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
@@ -370,3 +370,181 @@ entry:
%r = select i1 %c, i64 %a, i64 %ands
ret i64 %r
}
+
+; Test EOR.
+define i32 @test1_eor(i32 %a) {
+; CHECK-LABEL: test1_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #32, lsl #16
+; CHECK-NEXT: eor w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i32 %a, 2098176
+ ret i32 %eor
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_eor(i32 %a) {
+; CHECK-LABEL: test2_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: eor w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i32 %a, 135
+ ret i32 %eor
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i32 @test3_eor(i32 %a) {
+; CHECK-LABEL: test3_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: eor w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i32 %a, 2163712
+ ret i32 %eor
+}
+
+define i64 @test4_eor(i64 %a) {
+; CHECK-LABEL: test4_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #32, lsl #16
+; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 2098176
+ ret i64 %eor
+}
+
+define i64 @test5_eor(i64 %a) {
+; CHECK-LABEL: test5_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #16384 // =0x4000
+; CHECK-NEXT: movk x8, #2, lsl #32
+; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 8589950976
+ ret i64 %eor
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_eor(i64 %a) {
+; CHECK-LABEL: test6_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 135
+ ret i64 %eor
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i64 @test7_eor(i64 %a) {
+; CHECK-LABEL: test7_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 2163712
+ ret i64 %eor
+}
+
+; Test ORR.
+define i32 @test1_orr(i32 %a) {
+; CHECK-LABEL: test1_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #32, lsl #16
+; CHECK-NEXT: orr w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i32 %a, 2098176
+ ret i32 %orr
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_orr(i32 %a) {
+; CHECK-LABEL: test2_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: orr w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i32 %a, 135
+ ret i32 %orr
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i32 @test3_orr(i32 %a) {
+; CHECK-LABEL: test3_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: orr w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i32 %a, 2163712
+ ret i32 %orr
+}
+
+define i64 @test4_orr(i64 %a) {
+; CHECK-LABEL: test4_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #32, lsl #16
+; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 2098176
+ ret i64 %orr
+}
+
+define i64 @test5_orr(i64 %a) {
+; CHECK-LABEL: test5_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #16384 // =0x4000
+; CHECK-NEXT: movk x8, #2, lsl #32
+; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 8589950976
+ ret i64 %orr
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_orr(i64 %a) {
+; CHECK-LABEL: test6_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 135
+ ret i64 %orr
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i64 @test7_orr(i64 %a) {
+; CHECK-LABEL: test7_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 2163712
+ ret i64 %orr
+}
>From 68a008d8db8bcd41955cf2c4904d4fdd5b302cad Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 16 Jul 2025 03:07:27 -0700
Subject: [PATCH 2/2] [AArch64] Allow splitting bitmasks for EOR/ORR.
This patch extends #149095 for EOR and ORR.
It uses a simple partition scheme to try to find two suitable disjoint
bitmasks that can be used with EOR/ORR to reconstruct the original mask.
It also restructures the original code to allow reusing its
infrastructure and more easily adding other partition schemes in the
future.
---
.../Target/AArch64/AArch64MIPeepholeOpt.cpp | 119 ++++++++++++++----
.../aarch64-split-logic-bitmask-immediate.ll | 30 ++---
2 files changed, 105 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index abcd5505f735b..68a6f931c42d8 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,11 +8,11 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
-// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
+// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
@@ -125,8 +125,14 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
template <typename T>
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
+ // Strategy used to split logical immediate bitmasks.
+ enum class SplitStrategy {
+ Intersect,
+ Disjoint,
+ };
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
+ bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -158,14 +164,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
- if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
- return false;
-
- // If this immediate can be handled by one instruction, do not split it.
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
- AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
- if (Insn.size() == 1)
- return false;
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,23 +192,72 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
- unsigned OtherOpc) {
- // Try below transformation.
+static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
+ T &Imm2Enc) {
+ // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
+ // two disjoint masks such as 0b00000000011000000000000000000000 and
+ // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
+ // new masks match the original mask.
+ unsigned LowestBitSet = llvm::countr_zero(Imm);
+ unsigned LowestGapBitUnset =
+ LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
+
+ // Create a mask for the least significant group of consecutive ones.
+ T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
+ (static_cast<T>(1) << LowestBitSet);
+ // Create a disjoint mask for the remaining ones.
+ T NewImm2 = Imm & ~NewImm1;
+ assert(((NewImm1 & NewImm2) == 0) && "Non-disjoint immediates!");
+
+ if (AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) {
+ assert(((NewImm1 | NewImm2) == Imm) && "Invalid immediates!");
+ Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+ Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+ return true;
+ }
+
+ return false;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy,
+ unsigned OtherOpc) {
+ // Try the transformations below.
//
- // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
- // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+ // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+ // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
- // bitmask immediates. It makes only two AND instructions instead of multiple
- // mov + and instructions.
+ // bitmask immediates based on the given split strategy. It makes only two
+ // logical instructions instead of multiple mov + logic instructions.
return splitTwoPartImm<T>(
MI,
- [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
- if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+ [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
+ // If this immediate is already a suitable bitmask, don't do anything.
+ // TODO: Should we just combine the two instructions in this case?
+ if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return std::nullopt;
+
+ // If this immediate can be handled by one instruction, do not split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return std::nullopt;
+
+ bool SplitSucc = false;
+ switch (Strategy) {
+ case SplitStrategy::Intersect:
+ SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ case SplitStrategy::Disjoint:
+ SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ }
+ if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
@@ -859,16 +906,36 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= visitINSERT(MI);
break;
case AArch64::ANDWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDSWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ Changed |= trySplitLogicalImm<uint32_t>(
+ AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri);
break;
case AArch64::ANDSXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ Changed |= trySplitLogicalImm<uint64_t>(
+ AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
+ break;
+ case AArch64::EORWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::EORXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
+ SplitStrategy::Disjoint);
break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
index 3644a78e2b713..4db9db9185206 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
@@ -375,9 +375,8 @@ entry:
define i32 @test1_eor(i32 %a) {
; CHECK-LABEL: test1_eor:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1024 // =0x400
-; CHECK-NEXT: movk w8, #32, lsl #16
-; CHECK-NEXT: eor w0, w0, w8
+; CHECK-NEXT: eor w8, w0, #0x400
+; CHECK-NEXT: eor w0, w8, #0x200000
; CHECK-NEXT: ret
entry:
%eor = xor i32 %a, 2098176
@@ -413,9 +412,8 @@ entry:
define i64 @test4_eor(i64 %a) {
; CHECK-LABEL: test4_eor:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1024 // =0x400
-; CHECK-NEXT: movk w8, #32, lsl #16
-; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: eor x8, x0, #0x400
+; CHECK-NEXT: eor x0, x8, #0x200000
; CHECK-NEXT: ret
entry:
%eor = xor i64 %a, 2098176
@@ -425,9 +423,8 @@ entry:
define i64 @test5_eor(i64 %a) {
; CHECK-LABEL: test5_eor:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, #16384 // =0x4000
-; CHECK-NEXT: movk x8, #2, lsl #32
-; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: eor x8, x0, #0x4000
+; CHECK-NEXT: eor x0, x8, #0x200000000
; CHECK-NEXT: ret
entry:
%eor = xor i64 %a, 8589950976
@@ -464,9 +461,8 @@ entry:
define i32 @test1_orr(i32 %a) {
; CHECK-LABEL: test1_orr:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1024 // =0x400
-; CHECK-NEXT: movk w8, #32, lsl #16
-; CHECK-NEXT: orr w0, w0, w8
+; CHECK-NEXT: orr w8, w0, #0x400
+; CHECK-NEXT: orr w0, w8, #0x200000
; CHECK-NEXT: ret
entry:
%orr = or i32 %a, 2098176
@@ -502,9 +498,8 @@ entry:
define i64 @test4_orr(i64 %a) {
; CHECK-LABEL: test4_orr:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1024 // =0x400
-; CHECK-NEXT: movk w8, #32, lsl #16
-; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: orr x8, x0, #0x400
+; CHECK-NEXT: orr x0, x8, #0x200000
; CHECK-NEXT: ret
entry:
%orr = or i64 %a, 2098176
@@ -514,9 +509,8 @@ entry:
define i64 @test5_orr(i64 %a) {
; CHECK-LABEL: test5_orr:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, #16384 // =0x4000
-; CHECK-NEXT: movk x8, #2, lsl #32
-; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: orr x8, x0, #0x4000
+; CHECK-NEXT: orr x0, x8, #0x200000000
; CHECK-NEXT: ret
entry:
%orr = or i64 %a, 8589950976
More information about the llvm-commits
mailing list