[llvm] [SDAG] Don't treat ISD::SHL as a uniform binary operator in `ShrinkDemandedOp` (PR #92753)

Yingwei Zheng via llvm-commits llvm-commits at lists.llvm.org
Wed May 22 04:31:27 PDT 2024


https://github.com/dtcxzyw updated https://github.com/llvm/llvm-project/pull/92753

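Summary of the change, for orientation: `ShrinkDemandedOp` truncates both operands of a binary node to a narrower type, which assumes the two operands have the same width (patch 1 adds an assert to that effect). `ISD::SHL` does not fit that model because its shift-amount operand may use a different type, so the `SHL` case in `SimplifyDemandedBits` now performs its own narrowing: it truncates only the value operand and rebuilds the shift amount with `getShiftAmountConstant`. The new X86 test (pr92720.ll) covers a case where the old path could crash before legalization. Below is a rough, illustrative sketch of the kind of input the SHL-specific path narrows; it is not part of the patch and the function name is made up:

  define i32 @narrow_shl_sketch(i64 %x) {
    ; Only the low 32 bits of %shl are demanded by the trunc, and the shift
    ; amount (3) stays in range, so on targets where the checks in the new
    ; path succeed (isNarrowingProfitable, isTruncateFree, ...), the 64-bit
    ; shl can be rewritten as roughly (any_extend (shl (trunc %x), 3)).
    %shl = shl i64 %x, 3
    %t = trunc i64 %shl to i32
    ret i32 %t
  }
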
>From a9dcfd813ebe24d5c958ac9e73ee943a8463246b Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Mon, 20 May 2024 21:37:41 +0800
Subject: [PATCH 1/2] [SDAG] Don't treat ISD::SHL as a uniform binary operator
 in `ShrinkDemandedOp`

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  31 ++++-
 llvm/test/CodeGen/AArch64/bitfield-insert.ll  |  11 +-
 llvm/test/CodeGen/AArch64/trunc-to-tbl.ll     | 118 +++++++++---------
 llvm/test/CodeGen/X86/pr92720.ll              |  15 +++
 4 files changed, 104 insertions(+), 71 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr92720.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3ec6b9b795079..83dee6db3787e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -587,6 +587,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
   if (VT.isVector())
     return false;
 
+  assert(Op.getOperand(0).getValueType().getScalarSizeInBits() == BitWidth &&
+         Op.getOperand(1).getValueType().getScalarSizeInBits() == BitWidth &&
+         "ShrinkDemandedOp only supports operands that have the same size!");
+
   // Don't do this if the node has another user, which may require the
   // full value.
   if (!Op.getNode()->hasOneUse())
@@ -1834,9 +1838,30 @@ bool TargetLowering::SimplifyDemandedBits(
 
       // Try shrinking the operation as long as the shift amount will still be
       // in range.
-      if ((ShAmt < DemandedBits.getActiveBits()) &&
-          ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
-        return true;
+      if (ShAmt < DemandedBits.getActiveBits() && !VT.isVector() &&
+          Op.getNode()->hasOneUse()) {
+        // Search for the smallest integer type with free casts to and from
+        // Op's type. For expedience, just check power-of-2 integer types.
+        unsigned DemandedSize = DemandedBits.getActiveBits();
+        for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
+             SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+          EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
+          if (isNarrowingProfitable(VT, SmallVT) &&
+              isTypeDesirableForOp(ISD::SHL, SmallVT) &&
+              isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
+              (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
+            assert(DemandedSize <= SmallVTBits &&
+                   "Narrowed below demanded bits?");
+            // We found a type with free casts.
+            SDValue NarrowShl = TLO.DAG.getNode(
+                ISD::SHL, dl, SmallVT,
+                TLO.DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+                TLO.DAG.getShiftAmountConstant(ShAmt, SmallVT, dl));
+            return TLO.CombineTo(
+                Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
+          }
+        }
+      }
 
       // Narrow shift to lower half - similar to ShrinkDemandedOp.
       // (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 30b5e86c1e6dc..14a594e8028d5 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -193,11 +193,10 @@ define void @test_64bit_badmask(ptr %existing, ptr %new) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    mov w10, #135 // =0x87
-; CHECK-NEXT:    mov w11, #664 // =0x298
-; CHECK-NEXT:    lsl w9, w9, #3
-; CHECK-NEXT:    and x8, x8, x10
-; CHECK-NEXT:    and x9, x9, x11
+; CHECK-NEXT:    mov w10, #664 // =0x298
+; CHECK-NEXT:    mov w11, #135 // =0x87
+; CHECK-NEXT:    and x9, x10, x9, lsl #3
+; CHECK-NEXT:    and x8, x8, x11
 ; CHECK-NEXT:    orr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
@@ -579,7 +578,6 @@ define <2 x i32> @test_complex_type(ptr %addr, i64 %in, ptr %bf ) {
 define i64 @test_truncated_shift(i64 %x, i64 %y) {
 ; CHECK-LABEL: test_truncated_shift:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w1 killed $w1 killed $x1 def $x1
 ; CHECK-NEXT:    bfi x0, x1, #25, #5
 ; CHECK-NEXT:    ret
 entry:
@@ -593,7 +591,6 @@ entry:
 define i64 @test_and_extended_shift_with_imm(i64 %0) {
 ; CHECK-LABEL: test_and_extended_shift_with_imm:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0 def $x0
 ; CHECK-NEXT:    ubfiz x0, x0, #7, #8
 ; CHECK-NEXT:    ret
   %2 = shl i64 %0, 7
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 18cd4cc2111a4..c4a58ba12dc6b 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -571,29 +571,27 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB5_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp x10, x9, [x0]
-; CHECK-NEXT:    ldrb w13, [x0, #18]
-; CHECK-NEXT:    ldrh w14, [x0, #16]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldrb w14, [x0, #18]
+; CHECK-NEXT:    ldrh w15, [x0, #16]
 ; CHECK-NEXT:    add x0, x0, #32
-; CHECK-NEXT:    ubfx x12, x9, #12, #20
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    lsr x11, x10, #19
-; CHECK-NEXT:    lsr x15, x9, #31
-; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    lsr x12, x9, #50
-; CHECK-NEXT:    mov.s v0[1], w11
-; CHECK-NEXT:    orr w11, w14, w13, lsl #16
-; CHECK-NEXT:    lsr x13, x10, #38
-; CHECK-NEXT:    lsr x10, x10, #57
-; CHECK-NEXT:    mov.s v1[1], w15
-; CHECK-NEXT:    orr w12, w12, w11, lsl #14
-; CHECK-NEXT:    orr w9, w10, w9, lsl #7
-; CHECK-NEXT:    lsr w10, w11, #5
-; CHECK-NEXT:    mov.s v0[2], w13
+; CHECK-NEXT:    ubfx x12, x10, #12, #20
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    lsr x11, x9, #19
+; CHECK-NEXT:    lsr x13, x10, #31
+; CHECK-NEXT:    fmov s0, w12
+; CHECK-NEXT:    lsr x12, x9, #38
+; CHECK-NEXT:    extr x9, x10, x9, #57
+; CHECK-NEXT:    mov.s v1[1], w11
+; CHECK-NEXT:    orr x11, x15, x14, lsl #16
+; CHECK-NEXT:    mov.s v0[1], w13
+; CHECK-NEXT:    extr x13, x11, x10, #50
+; CHECK-NEXT:    ubfx x10, x11, #5, #27
 ; CHECK-NEXT:    mov.s v1[2], w12
-; CHECK-NEXT:    mov.s v0[3], w9
-; CHECK-NEXT:    mov.s v1[3], w10
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    mov.s v0[2], w13
+; CHECK-NEXT:    mov.s v1[3], w9
+; CHECK-NEXT:    mov.s v0[3], w10
+; CHECK-NEXT:    uzp1.8h v0, v1, v0
 ; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1, x8, lsl #3]
 ; CHECK-NEXT:    add x8, x8, #1
@@ -608,35 +606,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB5_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ldp x10, x9, [x0]
-; CHECK-BE-NEXT:    ldrb w16, [x0, #18]
-; CHECK-BE-NEXT:    lsr x11, x9, #40
-; CHECK-BE-NEXT:    ubfx x12, x9, #33, #7
-; CHECK-BE-NEXT:    lsr x15, x10, #45
-; CHECK-BE-NEXT:    lsr x13, x10, #40
-; CHECK-BE-NEXT:    ubfx x14, x10, #26, #14
-; CHECK-BE-NEXT:    orr w11, w12, w11, lsl #7
-; CHECK-BE-NEXT:    ldrh w12, [x0, #16]
-; CHECK-BE-NEXT:    fmov s0, w15
-; CHECK-BE-NEXT:    orr w13, w14, w13, lsl #14
-; CHECK-BE-NEXT:    ubfx x14, x9, #14, #18
+; CHECK-BE-NEXT:    ldrh w16, [x0, #16]
+; CHECK-BE-NEXT:    ldrb w17, [x0, #18]
 ; CHECK-BE-NEXT:    add x0, x0, #32
-; CHECK-BE-NEXT:    fmov s1, w11
-; CHECK-BE-NEXT:    orr w11, w16, w12, lsl #8
-; CHECK-BE-NEXT:    lsl x12, x9, #24
-; CHECK-BE-NEXT:    mov v0.s[1], w13
+; CHECK-BE-NEXT:    lsl x11, x9, #24
+; CHECK-BE-NEXT:    lsr x12, x9, #40
+; CHECK-BE-NEXT:    lsr x13, x10, #45
+; CHECK-BE-NEXT:    lsl x14, x10, #24
+; CHECK-BE-NEXT:    lsr x15, x10, #40
+; CHECK-BE-NEXT:    extr x12, x12, x11, #57
+; CHECK-BE-NEXT:    fmov s0, w13
 ; CHECK-BE-NEXT:    ubfx x13, x10, #7, #25
+; CHECK-BE-NEXT:    extr x14, x15, x14, #50
+; CHECK-BE-NEXT:    ubfx x15, x9, #14, #18
 ; CHECK-BE-NEXT:    extr x9, x10, x9, #40
-; CHECK-BE-NEXT:    orr w12, w11, w12
-; CHECK-BE-NEXT:    mov v1.s[1], w14
-; CHECK-BE-NEXT:    lsr w12, w12, #19
+; CHECK-BE-NEXT:    fmov s1, w12
+; CHECK-BE-NEXT:    orr w12, w17, w16, lsl #8
+; CHECK-BE-NEXT:    mov v0.s[1], w14
 ; CHECK-BE-NEXT:    ubfx x9, x9, #12, #20
+; CHECK-BE-NEXT:    orr w11, w12, w11
+; CHECK-BE-NEXT:    mov v1.s[1], w15
+; CHECK-BE-NEXT:    lsr w11, w11, #19
 ; CHECK-BE-NEXT:    mov v0.s[2], w13
-; CHECK-BE-NEXT:    mov v1.s[2], w12
+; CHECK-BE-NEXT:    mov v1.s[2], w11
 ; CHECK-BE-NEXT:    mov v0.s[3], w9
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
 ; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    cmp x8, #1000
-; CHECK-BE-NEXT:    mov v1.s[3], w11
+; CHECK-BE-NEXT:    mov v1.s[3], w12
 ; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-BE-NEXT:    st1 { v0.8b }, [x9]
@@ -650,35 +647,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-DISABLE-NEXT:  .LBB5_1: // %loop
 ; CHECK-DISABLE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-DISABLE-NEXT:    ldp x10, x9, [x0]
-; CHECK-DISABLE-NEXT:    ldrb w16, [x0, #18]
-; CHECK-DISABLE-NEXT:    lsr x11, x9, #40
-; CHECK-DISABLE-NEXT:    ubfx x12, x9, #33, #7
-; CHECK-DISABLE-NEXT:    lsr x15, x10, #45
-; CHECK-DISABLE-NEXT:    lsr x13, x10, #40
-; CHECK-DISABLE-NEXT:    ubfx x14, x10, #26, #14
-; CHECK-DISABLE-NEXT:    orr w11, w12, w11, lsl #7
-; CHECK-DISABLE-NEXT:    ldrh w12, [x0, #16]
-; CHECK-DISABLE-NEXT:    fmov s0, w15
-; CHECK-DISABLE-NEXT:    orr w13, w14, w13, lsl #14
-; CHECK-DISABLE-NEXT:    ubfx x14, x9, #14, #18
+; CHECK-DISABLE-NEXT:    ldrh w16, [x0, #16]
+; CHECK-DISABLE-NEXT:    ldrb w17, [x0, #18]
 ; CHECK-DISABLE-NEXT:    add x0, x0, #32
-; CHECK-DISABLE-NEXT:    fmov s1, w11
-; CHECK-DISABLE-NEXT:    orr w11, w16, w12, lsl #8
-; CHECK-DISABLE-NEXT:    lsl x12, x9, #24
-; CHECK-DISABLE-NEXT:    mov v0.s[1], w13
+; CHECK-DISABLE-NEXT:    lsl x11, x9, #24
+; CHECK-DISABLE-NEXT:    lsr x12, x9, #40
+; CHECK-DISABLE-NEXT:    lsr x13, x10, #45
+; CHECK-DISABLE-NEXT:    lsl x14, x10, #24
+; CHECK-DISABLE-NEXT:    lsr x15, x10, #40
+; CHECK-DISABLE-NEXT:    extr x12, x12, x11, #57
+; CHECK-DISABLE-NEXT:    fmov s0, w13
 ; CHECK-DISABLE-NEXT:    ubfx x13, x10, #7, #25
+; CHECK-DISABLE-NEXT:    extr x14, x15, x14, #50
+; CHECK-DISABLE-NEXT:    ubfx x15, x9, #14, #18
 ; CHECK-DISABLE-NEXT:    extr x9, x10, x9, #40
-; CHECK-DISABLE-NEXT:    orr w12, w11, w12
-; CHECK-DISABLE-NEXT:    mov v1.s[1], w14
-; CHECK-DISABLE-NEXT:    lsr w12, w12, #19
+; CHECK-DISABLE-NEXT:    fmov s1, w12
+; CHECK-DISABLE-NEXT:    orr w12, w17, w16, lsl #8
+; CHECK-DISABLE-NEXT:    mov v0.s[1], w14
 ; CHECK-DISABLE-NEXT:    ubfx x9, x9, #12, #20
+; CHECK-DISABLE-NEXT:    orr w11, w12, w11
+; CHECK-DISABLE-NEXT:    mov v1.s[1], w15
+; CHECK-DISABLE-NEXT:    lsr w11, w11, #19
 ; CHECK-DISABLE-NEXT:    mov v0.s[2], w13
-; CHECK-DISABLE-NEXT:    mov v1.s[2], w12
+; CHECK-DISABLE-NEXT:    mov v1.s[2], w11
 ; CHECK-DISABLE-NEXT:    mov v0.s[3], w9
 ; CHECK-DISABLE-NEXT:    add x9, x1, x8, lsl #3
 ; CHECK-DISABLE-NEXT:    add x8, x8, #1
 ; CHECK-DISABLE-NEXT:    cmp x8, #1000
-; CHECK-DISABLE-NEXT:    mov v1.s[3], w11
+; CHECK-DISABLE-NEXT:    mov v1.s[3], w12
 ; CHECK-DISABLE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-DISABLE-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-DISABLE-NEXT:    st1 { v0.8b }, [x9]
diff --git a/llvm/test/CodeGen/X86/pr92720.ll b/llvm/test/CodeGen/X86/pr92720.ll
new file mode 100644
index 0000000000000..b2543c08328c7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr92720.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; Make sure we don't crash when shrinking the shift amount before legalization.
+define i64 @pr92720(i64 %x) {
+; CHECK-LABEL: pr92720:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $8589934592, %rax # imm = 0x200000000
+; CHECK-NEXT:    retq
+  %or = or i64 %x, 255
+  %sub = sub i64 0, %or
+  %shl = shl i64 1, %sub
+  %sext = shl i64 %shl, 32
+  ret i64 %sext
+}

>From 5ba8a4fd138b16b4c671b058ce4cdb7c65d7b7a0 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Wed, 22 May 2024 19:31:03 +0800
Subject: [PATCH 2/2] [SDAG] Add todo. NFC.

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 83dee6db3787e..5064527045262 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1836,6 +1836,7 @@ bool TargetLowering::SimplifyDemandedBits(
         }
       }
 
+      // TODO: Can we merge this fold with the one below?
       // Try shrinking the operation as long as the shift amount will still be
       // in range.
       if (ShAmt < DemandedBits.getActiveBits() && !VT.isVector() &&


