[llvm] [AArch64] Change how we do bit computations for bfi (PR #142646)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 4 07:49:04 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/142646
From b10979eda2cc66d1bd08602eed8dc82c99e04632 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Sat, 24 May 2025 20:59:22 -0400
Subject: [PATCH 1/2] Test known zero with regressions
---
llvm/test/CodeGen/AArch64/bitfield-insert.ll | 29 ++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index eefb862c5313c..b778eb2d0ac58 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -753,3 +753,32 @@ entry:
store i16 %or, ptr %p
ret i16 %and1
}
+
+define i32 @test1_known_not_zero(i32 %a) {
+; CHECK-LABEL: test1_known_not_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #12288 // =0x3000
+; CHECK-NEXT: and w9, w0, #0xfffffffe
+; CHECK-NEXT: movk w8, #7, lsl #16
+; CHECK-NEXT: and w9, w9, #0xfff00fff
+; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: ret
+ %and1 = and i32 %a, -1044482
+ %or = or disjoint i32 %and1, 471040
+ ret i32 %or
+}
+
+define i32 @test1_known_not_zero_mul(i32 %a) {
+; CHECK-LABEL: test1_known_not_zero_mul:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mul w9, w0, w0
+; CHECK-NEXT: mov w8, #12288 // =0x3000
+; CHECK-NEXT: movk w8, #7, lsl #16
+; CHECK-NEXT: and w9, w9, #0xfff03fff
+; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: ret
+  %mul = mul i32 %a, %a ; Squaring always clears bit 1, so it is known zero.
+ %and = and i32 %mul, -1044483
+ %or = or disjoint i32 %and, 471040
+ ret i32 %or
+}
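For reference, the masks in test1_known_not_zero decode as follows. This is a quick compile-time sanity check, assuming i32 two's-complement arithmetic; it is editorial and not part of the patch:

#include <cstdint>

// -1044482 == 0xFFF00FFE: the AND clears bit 0 and bits 12..19.
static_assert(static_cast<uint32_t>(-1044482) == 0xFFF00FFEu, "");
// 471040 == 0x00073000: the OR then sets bits 12, 13, 16, 17 and 18.
static_assert(471040u == 0x00073000u, "");

Bit 0 is cleared but never set again, so the changed bits span [0, 20) while bits 1..11 inside that span stay unknown: no single BFI/BFXIL covers it, hence the mov/and/orr sequence in the CHECK lines above.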
From 03387c99785fd982fb2900bc1b28d541b3211c1c Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Sun, 25 May 2025 19:27:17 -0400
Subject: [PATCH 2/2] [AArch64] Change how we do bit computations for bfi
Instead of relying on known zeros, we can focus on what we do know about the final result and BFXIL that.
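In outline: take the known bits of the AND's first operand, apply the AND/OR constants to them, diff the known bits before and after, and emit a single BFI/BFXIL over the smallest contiguous span covering every changed bit, bailing out if any bit inside that span is still unknown.

Below is a minimal standalone sketch of that bit arithmetic for the 32-bit case. It uses plain uint32_t fields as stand-ins for LLVM's KnownBits, and findInsertSpan is a made-up name for illustration, not the actual hook:

#include <cstdint>
#include <cstdio>

// Stand-in for LLVM's KnownBits: a bit is known-one if set in One,
// known-zero if set in Zero, and unknown otherwise.
struct Known {
  uint32_t Zero = 0, One = 0;
};

// Model (x & MaskImm) | OrImm on top of what is already known about x, then
// find the smallest contiguous span covering every bit whose known state
// changed. Returns false if nothing changed (the patch instead folds the
// whole AND/OR away in that case) or if some bit inside the span is still
// unknown, meaning no single BFI/BFXIL reproduces the result.
static bool findInsertSpan(Known K, uint32_t MaskImm, uint32_t OrImm,
                           int &LSB, int &Width, uint32_t &BFIImm) {
  uint32_t OldOne = K.One, OldZero = K.Zero;
  K.Zero |= ~MaskImm; // AND with a constant grows the known zeros...
  K.One &= MaskImm;   // ...and shrinks the known ones.
  K.One |= OrImm;     // OR with a constant grows the known ones...
  K.Zero &= ~OrImm;   // ...and shrinks the known zeros.

  uint32_t Changed = (K.One ^ OldOne) | (K.Zero ^ OldZero);
  if (Changed == 0)
    return false;

  LSB = __builtin_ctz(Changed);
  int MSB = 32 - __builtin_clz(Changed);
  Width = MSB - LSB;
  uint32_t SpanMask = (Width == 32) ? ~0u : (((1u << Width) - 1u) << LSB);

  if (~(K.One | K.Zero) & SpanMask) // some bit in the run is unknown
    return false;
  BFIImm = (K.One & SpanMask) >> LSB;
  return true;
}

int main() {
  // Mirrors hole_test below: x*x always has bit 1 clear, so the hole the
  // AND punches at bit 1 changes nothing and no longer widens the span.
  Known K;
  K.Zero = 1u << 1;
  int LSB, Width;
  uint32_t Imm;
  if (!findInsertSpan(K, 0xFFC0FFFDu, 0x00F50000u, LSB, Width, Imm))
    return 1;
  // Expect span [16, 24) with insert immediate 0xF5.
  printf("span [%d, %d), insert imm 0x%X\n", LSB, LSB + Width, Imm);
  return 0;
}

The real hook keeps going after this point: the hunk below still checks whether the insert immediate is a logical immediate and how cheaply it can be materialized, which is why hole_test still compiles to an orr/and pair with a materialized mask rather than a bfi.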
---
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 54 ++++++++++++-------
llvm/test/CodeGen/AArch64/bitfield-insert.ll | 22 ++++----
2 files changed, 47 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 34f6db9374cb5..63ecba17e5028 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -3316,28 +3316,46 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
!isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
return false;
- // Compute the Known Zero for the AND as this allows us to catch more general
- // cases than just looking for AND with imm.
- KnownBits Known = CurDAG->computeKnownBits(And);
+  // In the event that computeKnownBits gives us more bits than we bargained
+  // for, causing the result to not be a shifted mask, we should just
+  // focus on MaskImm.
- // Non-zero in the sense that they're not provably zero, which is the key
- // point if we want to use this value.
- uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
+  // We can optimize any chain of bitwise operations that results in a
+  // contiguous run of bits being set, provided we know the end result
+  // will have those bits set.
- // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
- if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
- return false;
+ KnownBits Known = CurDAG->computeKnownBits(And.getOperand(0));
+ uint64_t OldOne = Known.One.getZExtValue();
+ uint64_t OldZero = Known.Zero.getZExtValue();
- // The bits being inserted must only set those bits that are known to be zero.
- if ((OrImm & NotKnownZero) != 0) {
- // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
- // currently handle this case.
- return false;
+ Known &= KnownBits::makeConstant(APInt(BitWidth, MaskImm));
+ Known |= KnownBits::makeConstant(APInt(BitWidth, OrImm));
+
+  // Now see whether any of the known bits changed.
+ uint64_t NewOneVal = Known.One.getZExtValue();
+ uint64_t NewZeroVal = Known.Zero.getZExtValue();
+
+ uint64_t ChangedBits = (NewOneVal ^ OldOne) | (NewZeroVal ^ OldZero);
+
+  // Find the smallest bitfield insert that encompasses all changed bits and
+  // any known bits between them.
+ if (ChangedBits == 0) {
+    // Nothing changed: just replace the node with the first AND operand.
+ CurDAG->ReplaceAllUsesWith(SDValue(N, 0), And->getOperand(0));
+ return true;
}
- // BFI/BFXIL dst, src, #lsb, #width.
- int LSB = llvm::countr_one(NotKnownZero);
- int Width = BitWidth - APInt(BitWidth, NotKnownZero).popcount();
+ // Find the span:
+ APInt ChangedBitsAPInt = APInt(BitWidth, ChangedBits);
+ int LSB = ChangedBitsAPInt.countr_zero();
+ int MSB = BitWidth - ChangedBitsAPInt.countl_zero();
+ int Width = MSB - LSB;
+ uint64_t SpanMask = ((1ULL << Width) - 1) << LSB;
+
+ // Bail if *any* bit in that run is still unknown:
+ uint64_t Unknown = ~(NewOneVal | NewZeroVal);
+ if ((Unknown & SpanMask) != 0)
+ return false;
// BFI/BFXIL is an alias of BFM, so translate to BFM operands.
unsigned ImmR = (BitWidth - LSB) % BitWidth;
@@ -3348,7 +3366,7 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
// ORR. A BFXIL will use the same constant as the original ORR, so the code
// should be no worse in this case.
bool IsBFI = LSB != 0;
- uint64_t BFIImm = OrImm >> LSB;
+ uint64_t BFIImm = (NewOneVal & SpanMask) >> LSB;
if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
// We have a BFI instruction and we know the constant can't be materialized
// with a ORR-immediate with the zero register.
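Decoding the constants used in main() above and in the updated test below (again, just a compile-time check, not part of the patch):

#include <cstdint>

// -4128771 == 0xFFC0FFFD: the AND clears bit 1 and bits 16..21.
static_assert(static_cast<uint32_t>(-4128771) == 0xFFC0FFFDu, "");
// 16056320 == 0x00F50000: the OR sets bits 16, 18 and 20..23.
static_assert(16056320u == 0x00F50000u, "");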
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index b778eb2d0ac58..556c7ecde792f 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -768,17 +768,17 @@ define i32 @test1_known_not_zero(i32 %a) {
ret i32 %or
}
-define i32 @test1_known_not_zero_mul(i32 %a) {
-; CHECK-LABEL: test1_known_not_zero_mul:
+define i32 @hole_test(i32 %a) {
+; CHECK-LABEL: hole_test:
; CHECK: // %bb.0:
; CHECK-NEXT: mul w9, w0, w0
-; CHECK-NEXT: mov w8, #12288 // =0x3000
-; CHECK-NEXT: movk w8, #7, lsl #16
-; CHECK-NEXT: and w9, w9, #0xfff03fff
-; CHECK-NEXT: orr w0, w9, w8
-; CHECK-NEXT: ret
-  %mul = mul i32 %a, %a ; Squaring always clears bit 1, so it is known zero.
- %and = and i32 %mul, -1044483
- %or = or disjoint i32 %and, 471040
- ret i32 %or
+; CHECK-NEXT: mov w8, #65533 // =0xfffd
+; CHECK-NEXT: movk w8, #65525, lsl #16
+; CHECK-NEXT: orr w9, w9, #0xff0000
+; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: ret
+ %2 = mul i32 %a, %a
+ %3 = and i32 %2, -4128771
+ %4 = or i32 %3, 16056320
+ ret i32 %4
}
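As a footnote on the "BFI/BFXIL is an alias of BFM" translation in the hunk above: for a hypothetical insert at LSB 12 of width 8 into a 32-bit value, the BFM operands work out as below. The ImmS = Width - 1 line follows surrounding code the hunk does not show, so treat this as a sketch of the arithmetic rather than the patch itself:

#include <cassert>

int main() {
  const unsigned BitWidth = 32, LSB = 12, Width = 8;
  unsigned ImmR = (BitWidth - LSB) % BitWidth; // right-rotate amount
  unsigned ImmS = Width - 1;                   // index of the top bit copied
  assert(ImmR == 20 && ImmS == 7);             // i.e. bfi w0, w1, #12, #8
  return 0;
}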