[llvm] 0f9ef8b - [AArch64] Select BFI/BFXIL to ORR with shifted operand when one operand is the left or right shift of another operand

Mingming Liu via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 11 14:01:50 PST 2022


Author: Mingming Liu
Date: 2022-11-11T14:01:02-08:00
New Revision: 0f9ef8b18055c9f7ca534fab24f74266331ec3e5

URL: https://github.com/llvm/llvm-project/commit/0f9ef8b18055c9f7ca534fab24f74266331ec3e5
DIFF: https://github.com/llvm/llvm-project/commit/0f9ef8b18055c9f7ca534fab24f74266331ec3e5.diff

LOG: [AArch64] Select BFI/BFXIL to ORR with shifted operand when one operand is the left or right shift of another operand

Using the right shift case [1] as an example:
- Before, a bfxil is generated (https://godbolt.org/z/EfzWMszPn)
- After, an orr with a right-shifted operand is generated (see the test cases in `CodeGen/AArch64/bitfield-insert.ll`, updated below); a before/after codegen sketch follows [1]

[1]
```
define i64 @test_orr_not_bfxil_i64(i64 %0) {
  %2 = and i64 %0, 1044480 ; 0xff000
  %3 = lshr i64 %2, 12
  %4 = or i64 %2, %3
  ret i64 %4
}
```
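
For concreteness, a rough sketch of the codegen change for [1], taken from the before/after CHECK lines of `orr_not_bfxil_test2_i64` in the test diff below (which exercises the same pattern):

```
// before: bfxil plus a register move
        and   x8, x0, #0xff000
        bfxil x8, x0, #12, #8
        mov   x0, x8
        ret

// after: a single orr with a right-shifted operand
        and   x8, x0, #0xff000
        orr   x0, x8, x8, lsr #12
        ret
```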

Differential Revision: https://reviews.llvm.org/D137689

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/test/CodeGen/AArch64/bitfield-insert.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7b7817650a29..ab9bbe124033 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2875,10 +2875,17 @@ static bool isWorthFoldingIntoOrrWithShift(SDValue Dst, SelectionDAG *CurDAG,
   return false;
 }
 
+// Given an 'ISD::OR' node that is going to be selected as BFM, analyze its
+// operands and select it as an AArch64::ORR with a shifted register operand
+// if that is more efficient. Returns true iff selection to AArch64::ORR happens.
 static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
                             SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
                             const bool BiggerPattern) {
   EVT VT = N->getValueType(0);
+  assert(N->getOpcode() == ISD::OR && "Expect N to be an OR node");
+  assert(((N->getOperand(0) == OrOpd0 && N->getOperand(1) == OrOpd1) ||
+          (N->getOperand(1) == OrOpd0 && N->getOperand(0) == OrOpd1)) &&
+         "Expect OrOpd0 and OrOpd1 to be operands of ISD::OR");
   assert((VT == MVT::i32 || VT == MVT::i64) &&
          "Expect result type to be i32 or i64 since N is combinable to BFM");
   SDLoc DL(N);
@@ -2887,6 +2894,7 @@ static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
   if (OrOpd1 != Dst)
     return false;
 
+  const unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
   // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
   // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
   if (BiggerPattern) {
@@ -2903,7 +2911,6 @@ static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
       uint64_t EncodedShiftImm;
       if (isWorthFoldingIntoOrrWithShift(Dst, CurDAG, ShiftedOperand,
                                          EncodedShiftImm)) {
-        unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
         SDValue Ops[] = {OrOpd0, ShiftedOperand,
                          CurDAG->getTargetConstant(EncodedShiftImm, DL, VT)};
         CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
@@ -2915,16 +2922,58 @@ static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
 
   assert((!BiggerPattern) && "BiggerPattern should be handled above");
 
+  SDValue Op;
   uint64_t ShlImm;
-  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
-      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
-    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
-    SDValue Ops[] = {
-        Dst, Src,
-        CurDAG->getTargetConstant(
-            AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
-    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
-    return true;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm)) {
+    if (OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+      SDValue Ops[] = {
+          Dst, Src,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+
+    // Select the following pattern to ORR with left-shifted operand, not BFI.
+    // %val1 = op ..
+    // %val2 = shl %val1, #imm
+    // %res = or %val1, %val2
+    //
+    // If N is selected to be BFI, we know that
+    // 1) OrOpd0 would be the operand from which bits are extracted (i.e., folded into BFI)
+    // 2) OrOpd1 would be the destination operand (i.e., preserved)
+    //
+    // Instead of selecting N to BFI, fold OrOpd0 as a left shift directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+  }
+
+  uint64_t SrlImm;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SRL, SrlImm)) {
+    // Select the following pattern to ORR with right-shifted operand, not BFXIL.
+    // %val1 = op ..
+    // %val2 = lshr %val1, #imm
+    // %res = or %val1, %val2
+    //
+    // If N is selected to be BFXIL, we know that
+    // 1) OrOpd0 would be the operand from which bits are extracted (i.e., folded into BFXIL)
+    // 2) OrOpd1 would be the destination operand (i.e., preserved)
+    //
+    // Instead of selecting N to BFXIL, fold OrOpd0 as a right shift directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSR, SrlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
   }
 
   return false;

diff  --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 8a383e85a366..eeb1b544f57b 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -638,13 +638,12 @@ define i32 @test_orr_not_bfxil_i32(i32 %0) {
 }
 
 ; For or operation, one operand is a left shift of another operand.
-; Use orr with left-shifted operand is better than bfi.
+; So orr with a left-shifted operand is generated (not bfi).
 define i64 @test_orr_not_bfi_i64(i64 %0) {
 ; CHECK-LABEL: test_orr_not_bfi_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    bfi x8, x0, #8, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 255
   %3 = shl i64 %2, 8
@@ -668,14 +667,13 @@ define i32 @test_bfi_not_orr_i32(i32 %0, i32 %1) {
   ret i32 %or_res
 }
 
-; orr is better than bfi, since both simplify away one instruction (%3)
+; orr is generated (not bfi), since both simplify away one instruction (%3)
 ; while orr has shorter latency and higher throughput.
 define i32 @test_orr_not_bfi_i32(i32 %0) {
 ; CHECK-LABEL: test_orr_not_bfi_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    bfi w8, w0, #8, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 255
   %3 = shl i32 %2, 8
@@ -698,14 +696,13 @@ define i64 @test_bfxil_not_orr_i64(i64 %0, i64 %1) {
   ret i64 %or_res
 }
 
-; orr is better than bfxil, since one operand is the right shift of  another
+; orr is generated (not bfxil), since one operand is the right shift of another
 ; operand.
 define i64 @orr_not_bfxil_test2_i64(i64 %0) {
 ; CHECK-LABEL: orr_not_bfxil_test2_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff000
-; CHECK-NEXT:    bfxil x8, x0, #12, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 1044480 ; 0xff000
   %3 = lshr i64 %2, 12
@@ -729,13 +726,12 @@ define i32 @test_bfxil_not_orr_i32(i32 %0, i32 %1) {
   ret i32 %or_res
 }
 
-; one operand is the shift of another operand, so orr is better.
+; one operand is the shift of another operand, so orr is generated (not bfxil).
 define i32 @orr_not_bfxil_test2_i32(i32 %0) {
 ; CHECK-LABEL: orr_not_bfxil_test2_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff000
-; CHECK-NEXT:    bfxil w8, w0, #12, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 1044480  ; 0xff000
   %3 = lshr i32 %2, 12

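The sibling left-shift (BFI) case named in the commit title follows the same shape: for a pattern like `%2 = and i64 %0, 255`, `%3 = shl i64 %2, 8`, `%4 = or i64 %2, %3`, the before/after CHECK lines of `test_orr_not_bfi_i64` in the diff above show:

```
// before: bfi plus a register move
        and x8, x0, #0xff
        bfi x8, x0, #8, #8
        mov x0, x8
        ret

// after: a single orr with a left-shifted operand
        and x8, x0, #0xff
        orr x0, x8, x8, lsl #8
        ret
```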

More information about the llvm-commits mailing list