[llvm] [AArch64] Allow splitting bitmasks for EOR/ORR. (PR #150394)

Ricardo Jesus via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 6 04:05:59 PDT 2025


https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/150394

>From cfce02ae3b30417d34c328f342b5de874534731e Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 23 Jul 2025 08:47:13 -0700
Subject: [PATCH 1/4] Add tests.

---
 ... aarch64-split-logic-bitmask-immediate.ll} | 178 ++++++++++++++++++
 1 file changed, 178 insertions(+)
 rename llvm/test/CodeGen/AArch64/{aarch64-split-and-bitmask-immediate.ll => aarch64-split-logic-bitmask-immediate.ll} (70%)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
similarity index 70%
rename from llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
rename to llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
index 113eb14ca4803..3644a78e2b713 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
@@ -370,3 +370,181 @@ entry:
   %r = select i1 %c, i64 %a, i64 %ands
   ret i64 %r
 }
+
+; Test EOR.
+define i32 @test1_eor(i32 %a) {
+; CHECK-LABEL: test1_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #32, lsl #16
+; CHECK-NEXT:    eor w0, w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i32 %a, 2098176
+  ret i32 %eor
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_eor(i32 %a) {
+; CHECK-LABEL: test2_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135 // =0x87
+; CHECK-NEXT:    eor w0, w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i32 %a, 135
+  ret i32 %eor
+}
+
+; This constant should not be split because the split immediate is not a
+; valid bitmask immediate.
+define i32 @test3_eor(i32 %a) {
+; CHECK-LABEL: test3_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    eor w0, w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i32 %a, 2163712
+  ret i32 %eor
+}
+
+define i64 @test4_eor(i64 %a) {
+; CHECK-LABEL: test4_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #32, lsl #16
+; CHECK-NEXT:    eor x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i64 %a, 2098176
+  ret i64 %eor
+}
+
+define i64 @test5_eor(i64 %a) {
+; CHECK-LABEL: test5_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, #16384 // =0x4000
+; CHECK-NEXT:    movk x8, #2, lsl #32
+; CHECK-NEXT:    eor x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i64 %a, 8589950976
+  ret i64 %eor
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_eor(i64 %a) {
+; CHECK-LABEL: test6_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135 // =0x87
+; CHECK-NEXT:    eor x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i64 %a, 135
+  ret i64 %eor
+}
+
+; This constant should not be split because the split immediate is not a
+; valid bitmask immediate.
+define i64 @test7_eor(i64 %a) {
+; CHECK-LABEL: test7_eor:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    eor x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %eor = xor i64 %a, 2163712
+  ret i64 %eor
+}
+
+; Test ORR.
+define i32 @test1_orr(i32 %a) {
+; CHECK-LABEL: test1_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #32, lsl #16
+; CHECK-NEXT:    orr w0, w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i32 %a, 2098176
+  ret i32 %orr
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_orr(i32 %a) {
+; CHECK-LABEL: test2_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135 // =0x87
+; CHECK-NEXT:    orr w0, w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i32 %a, 135
+  ret i32 %orr
+}
+
+; This constant should not be split because the split immediate is not a
+; valid bitmask immediate.
+define i32 @test3_orr(i32 %a) {
+; CHECK-LABEL: test3_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    orr w0, w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i32 %a, 2163712
+  ret i32 %orr
+}
+
+define i64 @test4_orr(i64 %a) {
+; CHECK-LABEL: test4_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #32, lsl #16
+; CHECK-NEXT:    orr x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i64 %a, 2098176
+  ret i64 %orr
+}
+
+define i64 @test5_orr(i64 %a) {
+; CHECK-LABEL: test5_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, #16384 // =0x4000
+; CHECK-NEXT:    movk x8, #2, lsl #32
+; CHECK-NEXT:    orr x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i64 %a, 8589950976
+  ret i64 %orr
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_orr(i64 %a) {
+; CHECK-LABEL: test6_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135 // =0x87
+; CHECK-NEXT:    orr x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i64 %a, 135
+  ret i64 %orr
+}
+
+; This constant should not be split because the split immediate is not a
+; valid bitmask immediate.
+define i64 @test7_orr(i64 %a) {
+; CHECK-LABEL: test7_orr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024 // =0x400
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    orr x0, x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %orr = or i64 %a, 2163712
+  ret i64 %orr
+}

>From a2424d81f68075ce5b3aff5bbfc08fb0c3a21547 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 16 Jul 2025 03:07:27 -0700
Subject: [PATCH 2/4] [AArch64] Allow splitting bitmasks for EOR/ORR.

This patch extends #149095 for EOR and ORR.

It uses a simple partition scheme to try to find two suitable disjoint
bitmasks that can be used with EOR/ORR to reconstruct the original mask.
---
 .../Target/AArch64/AArch64MIPeepholeOpt.cpp   | 63 ++++++++++++++++---
 .../aarch64-split-logic-bitmask-immediate.ll  | 30 ++++-----
 2 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index b97d6229b1d01..a32c21617860c 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
 //
 // This pass performs below peephole optimizations on MIR level.
 //
-// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-//    MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+//    MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
 //
 // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
 //    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
@@ -128,6 +128,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   // Strategy used to split logical immediate bitmasks.
   enum class SplitStrategy {
     Intersect,
+    Disjoint,
   };
   template <typename T>
   bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
@@ -190,19 +191,48 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
   return true;
 }
 
+template <typename T>
+static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
+                                    T &Imm2Enc) {
+  // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
+  // two disjoint masks such as 0b00000000011000000000000000000000 and
+  // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
+  // new masks match the original mask.
+  unsigned LowestBitSet = llvm::countr_zero(Imm);
+  unsigned LowestGapBitUnset =
+      LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
+
+  // Create a mask for the least significant group of consecutive ones.
+  T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
+              (static_cast<T>(1) << LowestBitSet);
+  // Create a disjoint mask for the remaining ones.
+  T NewImm2 = Imm & ~NewImm1;
+  assert(((NewImm1 & NewImm2) == 0) && "Non-disjoint immediates!");
+
+  if (AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) {
+    assert(((NewImm1 | NewImm2) == Imm) && "Invalid immediates!");
+    assert(((NewImm1 ^ NewImm2) == Imm) && "Invalid immediates!");
+    Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+    Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+    return true;
+  }
+
+  return false;
+}
+
 template <typename T>
 bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
                                               SplitStrategy Strategy,
                                               unsigned OtherOpc) {
-  // Try below transformation.
+  // Try below transformations.
   //
-  // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-  // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+  // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+  // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
   //
   // The mov pseudo instruction could be expanded to multiple mov instructions
   // later. Let's try to split the constant operand of mov instruction into two
-  // bitmask immediates. It makes only two AND instructions instead of multiple
-  // mov + and instructions.
+  // bitmask immediates based on the given split strategy. It makes only two
+  // logical instructions instead of multiple mov + logic instructions.
 
   return splitTwoPartImm<T>(
       MI,
@@ -224,6 +254,9 @@ bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
         case SplitStrategy::Intersect:
           SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
           break;
+        case SplitStrategy::Disjoint:
+          SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
+          break;
         }
         if (SplitSucc)
           return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
@@ -889,6 +922,22 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
         Changed |= trySplitLogicalImm<uint64_t>(
             AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
         break;
+      case AArch64::EORWrr:
+        Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
+      case AArch64::EORXrr:
+        Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
+      case AArch64::ORRWrr:
+        Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
+      case AArch64::ORRXrr:
+        Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
+                                                SplitStrategy::Disjoint);
+        break;
       case AArch64::ORRWrs:
         Changed |= visitORR(MI);
         break;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
index 3644a78e2b713..4db9db9185206 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
@@ -375,9 +375,8 @@ entry:
 define i32 @test1_eor(i32 %a) {
 ; CHECK-LABEL: test1_eor:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEXT:    movk w8, #32, lsl #16
-; CHECK-NEXT:    eor w0, w0, w8
+; CHECK-NEXT:    eor w8, w0, #0x400
+; CHECK-NEXT:    eor w0, w8, #0x200000
 ; CHECK-NEXT:    ret
 entry:
   %eor = xor i32 %a, 2098176
@@ -413,9 +412,8 @@ entry:
 define i64 @test4_eor(i64 %a) {
 ; CHECK-LABEL: test4_eor:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEXT:    movk w8, #32, lsl #16
-; CHECK-NEXT:    eor x0, x0, x8
+; CHECK-NEXT:    eor x8, x0, #0x400
+; CHECK-NEXT:    eor x0, x8, #0x200000
 ; CHECK-NEXT:    ret
 entry:
   %eor = xor i64 %a, 2098176
@@ -425,9 +423,8 @@ entry:
 define i64 @test5_eor(i64 %a) {
 ; CHECK-LABEL: test5_eor:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #16384 // =0x4000
-; CHECK-NEXT:    movk x8, #2, lsl #32
-; CHECK-NEXT:    eor x0, x0, x8
+; CHECK-NEXT:    eor x8, x0, #0x4000
+; CHECK-NEXT:    eor x0, x8, #0x200000000
 ; CHECK-NEXT:    ret
 entry:
   %eor = xor i64 %a, 8589950976
@@ -464,9 +461,8 @@ entry:
 define i32 @test1_orr(i32 %a) {
 ; CHECK-LABEL: test1_orr:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEXT:    movk w8, #32, lsl #16
-; CHECK-NEXT:    orr w0, w0, w8
+; CHECK-NEXT:    orr w8, w0, #0x400
+; CHECK-NEXT:    orr w0, w8, #0x200000
 ; CHECK-NEXT:    ret
 entry:
   %orr = or i32 %a, 2098176
@@ -502,9 +498,8 @@ entry:
 define i64 @test4_orr(i64 %a) {
 ; CHECK-LABEL: test4_orr:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEXT:    movk w8, #32, lsl #16
-; CHECK-NEXT:    orr x0, x0, x8
+; CHECK-NEXT:    orr x8, x0, #0x400
+; CHECK-NEXT:    orr x0, x8, #0x200000
 ; CHECK-NEXT:    ret
 entry:
   %orr = or i64 %a, 2098176
@@ -514,9 +509,8 @@ entry:
 define i64 @test5_orr(i64 %a) {
 ; CHECK-LABEL: test5_orr:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #16384 // =0x4000
-; CHECK-NEXT:    movk x8, #2, lsl #32
-; CHECK-NEXT:    orr x0, x0, x8
+; CHECK-NEXT:    orr x8, x0, #0x4000
+; CHECK-NEXT:    orr x0, x8, #0x200000000
 ; CHECK-NEXT:    ret
 entry:
   %orr = or i64 %a, 8589950976

>From 96275002a38ae7c4426ac3261809c2107726e0d9 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 31 Jul 2025 10:49:42 -0700
Subject: [PATCH 3/4] Improve asserts and restructure code flow.

---
 .../Target/AArch64/AArch64MIPeepholeOpt.cpp   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index a32c21617860c..fb7a16de20673 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -164,6 +164,7 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
 template <typename T>
 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
   T UImm = static_cast<T>(Imm);
+  assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!");
 
   // The bitmask immediate consists of consecutive ones.  Let's say there is
   // constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,6 +195,8 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
 template <typename T>
 static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
                                     T &Imm2Enc) {
+  assert(Imm && (Imm != ~static_cast<T>(0)) && "Invalid immediate!");
+
   // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
   // two disjoint masks such as 0b00000000011000000000000000000000 and
   // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
@@ -207,17 +210,14 @@ static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
               (static_cast<T>(1) << LowestBitSet);
   // Create a disjoint mask for the remaining ones.
   T NewImm2 = Imm & ~NewImm1;
-  assert(((NewImm1 & NewImm2) == 0) && "Non-disjoint immediates!");
-
-  if (AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) {
-    assert(((NewImm1 | NewImm2) == Imm) && "Invalid immediates!");
-    assert(((NewImm1 ^ NewImm2) == Imm) && "Invalid immediates!");
-    Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
-    Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
-    return true;
-  }
 
-  return false;
+  // Do not split if NewImm2 is not a valid bitmask immediate.
+  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+    return false;
+
+  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+  return true;
 }
 
 template <typename T>

>From 7ca15a7ed1a1c5b3b1e271476ed3c58cfcfb4fd9 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 6 Aug 2025 02:57:17 -0700
Subject: [PATCH 4/4] Guard code against UB.

---
 llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index fb7a16de20673..fd4ef2aa28f8a 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -206,6 +206,7 @@ static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
       LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
 
   // Create a mask for the least significant group of consecutive ones.
+  assert(LowestGapBitUnset < sizeof(T) * CHAR_BIT && "Undefined behaviour!");
   T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
               (static_cast<T>(1) << LowestBitSet);
   // Create a disjoint mask for the remaining ones.



More information about the llvm-commits mailing list