[llvm] ba6485e - [SDAG] add demanded bits transform for bswap

Mon Jan 17 15:33:55 PST 2022

Author: Sanjay Patel
Date: 2022-01-17T18:25:42-05:00
New Revision: ba6485e25fc56468f34cc8a6938d66d3c5f46596

URL: https://github.com/llvm/llvm-project/commit/ba6485e25fc56468f34cc8a6938d66d3c5f46596
DIFF: https://github.com/llvm/llvm-project/commit/ba6485e25fc56468f34cc8a6938d66d3c5f46596.diff

LOG: [SDAG] add demanded bits transform for bswap

A possible codegen regression for PowerPC is noted in D117406
because we don't recognize a pattern that demands only 1 byte
from a bswap.

This fold has existed in IR since close to the beginning of LLVM:
https://github.com/llvm/llvm-project/blame/main/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp#L794
...so this patch copies that code as much as possible and adapts
it for SDAG.

The test for PowerPC that would change in D117406 is over-reduced
with undefs, so I recreated it for AArch64 and x86 by passing in
pointer args and renamed the values to make the logic clearer.

Differential Revision: https://reviews.llvm.org/D117508

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/bswap-known-bits.ll
    llvm/test/CodeGen/X86/combine-bswap.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dde4cb7820e0d..ea6a7e16bcdbd 100644

--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1806,6 +1806,35 @@ bool TargetLowering::SimplifyDemandedBits(
   }
   case ISD::BSWAP: {
     SDValue Src = Op.getOperand(0);
+
+    // If the only bits demanded come from one byte of the bswap result,
+    // just shift the input byte into position to eliminate the bswap.
+    unsigned NLZ = DemandedBits.countLeadingZeros();
+    unsigned NTZ = DemandedBits.countTrailingZeros();
+
+    // Round NTZ down to the next byte.  If we have 11 trailing zeros, then
+    // we need all the bits down to bit 8.  Likewise, round NLZ.  If we
+    // have 14 leading zeros, round to 8.
+    NLZ &= ~7;
+    NTZ &= ~7;
+    // If we need exactly one byte, we can do this transformation.
+    if (BitWidth - NLZ - NTZ == 8) {
+      unsigned ResultBit = NTZ;
+      unsigned InputBit = BitWidth - NTZ - 8;
+
+      // Replace this with either a left or right shift to get the byte into
+      // the right place.
+      unsigned ShiftOpcode = InputBit > ResultBit ? ISD::SRL : ISD::SHL;
+      if (!TLO.LegalOperations() || isOperationLegal(ShiftOpcode, VT)) {
+        EVT ShiftAmtTy = getShiftAmountTy(VT, DL);
+        unsigned ShiftAmount =
+            InputBit > ResultBit ? InputBit - ResultBit : ResultBit - InputBit;
+        SDValue ShAmt = TLO.DAG.getConstant(ShiftAmount, dl, ShiftAmtTy);
+        SDValue NewOp = TLO.DAG.getNode(ShiftOpcode, dl, VT, Src, ShAmt);
+        return TLO.CombineTo(Op, NewOp);
+      }
+    }
+
     APInt DemandedSrcBits = DemandedBits.byteSwap();
     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
                              Depth + 1))

diff  --git a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
index 9048fc9aa72c4..442caf7d9b8e8 100644
--- a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
+++ b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -66,7 +66,7 @@ define i8 @demand_one_byte0(i32 %x) {
 define i32 @demand_one_byte1(i32 %x) {
 ; CHECK-LABEL: demand_one_byte1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    rev w8, w0
+; CHECK-NEXT:    lsr w8, w0, #8
 ; CHECK-NEXT:    and w0, w8, #0xff00
 ; CHECK-NEXT:    ret
   %b = call i32 @llvm.bswap.i32(i32 %x)
@@ -77,7 +77,7 @@ define i32 @demand_one_byte1(i32 %x) {
 define i32 @demand_one_byte2(i32 %x) {
 ; CHECK-LABEL: demand_one_byte2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    rev w8, w0
+; CHECK-NEXT:    lsl w8, w0, #8
 ; CHECK-NEXT:    orr w0, w8, #0xff00ffff
 ; CHECK-NEXT:    ret
   %b = call i32 @llvm.bswap.i32(i32 %x)
@@ -88,8 +88,7 @@ define i32 @demand_one_byte2(i32 %x) {
 define i64 @demand_one_byte3(i64 %x) {
 ; CHECK-LABEL: demand_one_byte3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    rev x8, x0
-; CHECK-NEXT:    lsr x0, x8, #56
+; CHECK-NEXT:    and x0, x0, #0xff
 ; CHECK-NEXT:    ret
   %b = call i64 @llvm.bswap.i64(i64 %x)
   %r = lshr i64 %b, 56
@@ -99,9 +98,7 @@ define i64 @demand_one_byte3(i64 %x) {
 define void @demand_one_loaded_byte(i64* %xp, i32* %yp) {
 ; CHECK-LABEL: demand_one_loaded_byte:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    lsr x8, x8, #8
-; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    ldrb w8, [x0, #4]
 ; CHECK-NEXT:    strb w8, [x1]
 ; CHECK-NEXT:    ret
   %x = load i64, i64* %xp, align 8

diff  --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
index 1d02e30fa2b41..4fbb3bf98171f 100644
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -62,18 +62,13 @@ define void @demand_one_loaded_byte(i64* %xp, i32* %yp) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    shldl $24, %edx, %ecx
-; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movb 4(%ecx), %cl
 ; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: demand_one_loaded_byte:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    shrq $8, %rax
-; X64-NEXT:    bswapl %eax
+; X64-NEXT:    movb 4(%rdi), %al
 ; X64-NEXT:    movb %al, (%rsi)
 ; X64-NEXT:    retq
   %x = load i64, i64* %xp, align 8