[llvm] 370ebc9 - [DAG] Attempt to fold bswap(shl(x,c)) -> zext(bswap(trunc(shl(x,c-bw/2))))

Thu Feb 24 11:34:00 PST 2022

Author: Simon Pilgrim
Date: 2022-02-24T19:33:51Z
New Revision: 370ebc9d9a573d6a0bcb0d7c3e0a57d55b7a7673

URL: https://github.com/llvm/llvm-project/commit/370ebc9d9a573d6a0bcb0d7c3e0a57d55b7a7673
DIFF: https://github.com/llvm/llvm-project/commit/370ebc9d9a573d6a0bcb0d7c3e0a57d55b7a7673.diff

LOG: [DAG] Attempt to fold bswap(shl(x,c)) -> zext(bswap(trunc(shl(x,c-bw/2))))

If the shl amount is at least half the bitwidth (i.e. the lower half of the bswap source is known to be zero), then we can reduce the shift amount by half the bitwidth, perform the bswap at half the bitwidth, and just zero-extend the result.

Based off PR51391 + PR53867

Differential Revision: https://reviews.llvm.org/D120192

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
    llvm/test/CodeGen/AArch64/load-combine.ll
    llvm/test/CodeGen/X86/combine-bswap.ll
    llvm/test/CodeGen/X86/load-combine.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 483496a53a6e1..c3bb838aac5c0 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9610,6 +9610,26 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
     return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
   }
 
+  // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
+  // iff c >= bw/2 (i.e. lower half is known zero)
+  unsigned BW = VT.getScalarSizeInBits();
+  if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
+    auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
+    if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
+        ShAmt->getZExtValue() >= (BW / 2) &&
+        (ShAmt->getZExtValue() % 16) == 0 && TLI.isTruncateFree(VT, HalfVT) &&
+        (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
+      SDValue Res = N0.getOperand(0);
+      if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
+        Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+                          DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
+      Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
+      Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
+      return DAG.getZExtOrTrunc(Res, DL, VT);
+    }
+  }
+
   return SDValue();
 }
 

diff  --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
index 14a0162d5269c..7579d550efb8d 100644
--- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -442,8 +442,8 @@ define i32 @zext_load_i32_by_i8(i32* %arg) {
 ; CHECK-LABEL: zext_load_i32_by_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    rev w0, w8
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0

diff  --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
index 066ecb21dc107..d743ab3f643ce 100644
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -499,8 +499,8 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
 ; CHECK-LABEL: zext_load_i32_by_i8_bswap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    rev w0, w8
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
 
   %tmp = bitcast i32* %arg to i8*

diff  --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
index c20f54d3e3582..f4814a9adc35d 100644
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -87,17 +87,16 @@ define void @demand_one_loaded_byte(i64* %xp, i32* %yp) {
 define i64 @test_bswap64_shift48_zext(i16 %a0) {
 ; X86-LABEL: test_bswap64_shift48_zext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bswap64_shift48_zext:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlq $48, %rax
-; X64-NEXT:    bswapq %rax
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    retq
   %z = zext i16 %a0 to i64
   %s = shl i64 %z, 48
@@ -109,16 +108,15 @@ define i64 @test_bswap64_shift48(i64 %a0) {
 ; X86-LABEL: test_bswap64_shift48:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    bswapl %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bswap64_shift48:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $48, %rax
-; X64-NEXT:    bswapq %rax
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    retq
   %s = shl i64 %a0, 48
   %b = call i64 @llvm.bswap.i64(i64 %s)

diff  --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
index a046b55aab333..6f5c60d20b4c6 100644
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -1209,20 +1209,33 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
 ; i8* p;
 ; (i32) p[1] | ((i32) p[0] << 8)
 define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
-; CHECK-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    retl
+; BSWAP-LABEL: zext_load_i32_by_i8_bswap:
+; BSWAP:       # %bb.0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movzwl (%eax), %eax
+; BSWAP-NEXT:    rolw $8, %ax
+; BSWAP-NEXT:    movzwl %ax, %eax
+; BSWAP-NEXT:    retl
 ;
-; CHECK64-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movzwl (%rdi), %eax
-; CHECK64-NEXT:    shll $16, %eax
-; CHECK64-NEXT:    bswapl %eax
-; CHECK64-NEXT:    retq
+; MOVBE-LABEL: zext_load_i32_by_i8_bswap:
+; MOVBE:       # %bb.0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbew (%eax), %ax
+; MOVBE-NEXT:    movzwl %ax, %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: zext_load_i32_by_i8_bswap:
+; BSWAP64:       # %bb.0:
+; BSWAP64-NEXT:    movzwl (%rdi), %eax
+; BSWAP64-NEXT:    rolw $8, %ax
+; BSWAP64-NEXT:    movzwl %ax, %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: zext_load_i32_by_i8_bswap:
+; MOVBE64:       # %bb.0:
+; MOVBE64-NEXT:    movbew (%rdi), %ax
+; MOVBE64-NEXT:    movzwl %ax, %eax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
   %tmp2 = load i8, i8* %tmp1, align 1