[llvm] aa2dac4 - [DAG] SimplifyDemandedBits - fold FSHR(X,Y,Amt) -> SRL(Y,Amt) (#182294)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 19 10:29:39 PST 2026


Author: Simon Pilgrim
Date: 2026-02-19T18:29:34Z
New Revision: aa2dac40de5c1d9c4e1d00d445d90621fe4996fc

URL: https://github.com/llvm/llvm-project/commit/aa2dac40de5c1d9c4e1d00d445d90621fe4996fc
DIFF: https://github.com/llvm/llvm-project/commit/aa2dac40de5c1d9c4e1d00d445d90621fe4996fc.diff

LOG: [DAG] SimplifyDemandedBits - fold FSHR(X,Y,Amt) -> SRL(Y,Amt) (#182294)

If a FSHR node's DemandedBits mask and maximum shift amount don't
demand any bits from the upper register X, then simplify to a SRL node.

FSHL is less useful, but we could add it in a future patch if there's
interest.

Based off a discussion on #182021

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/X86/add-sub-bool.ll
    llvm/test/CodeGen/X86/bittest-big-integer.ll
    llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
    llvm/test/CodeGen/X86/known-pow2.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index de0ec493aba7d..9dacc28f439d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2256,11 +2256,31 @@ bool TargetLowering::SimplifyDemandedBits(
       }
     }
 
-    // For pow-2 bitwidths we only demand the bottom modulo amt bits.
     if (isPowerOf2_32(BitWidth)) {
+      // Fold FSHR(Op0,Op1,Op2) -> SRL(Op1,Op2)
+      // iff we're guaranteed not to use Op0.
+      // TODO: Add FSHL equivalent?
+      if (!IsFSHL && !DemandedBits.isAllOnes() &&
+          (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT))) {
+        KnownBits KnownAmt =
+            TLO.DAG.computeKnownBits(Op2, DemandedElts, Depth + 1);
+        unsigned MaxShiftAmt =
+            KnownAmt.getMaxValue().getLimitedValue(BitWidth - 1);
+        // Check we don't demand any shifted bits outside Op1.
+        if (DemandedBits.countl_zero() >= MaxShiftAmt) {
+          EVT AmtVT = Op2.getValueType();
+          SDValue NewAmt =
+              TLO.DAG.getNode(ISD::AND, dl, AmtVT, Op2,
+                              TLO.DAG.getConstant(BitWidth - 1, dl, AmtVT));
+          SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, Op1, NewAmt);
+          return TLO.CombineTo(Op, NewOp);
+        }
+      }
+
+      // For pow-2 bitwidths we only demand the bottom modulo amt bits.
       APInt DemandedAmtBits(Op2.getScalarValueSizeInBits(), BitWidth - 1);
-      if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts,
-                               Known2, TLO, Depth + 1))
+      if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts, Known2, TLO,
+                               Depth + 1))
         return true;
     }
     break;

diff  --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll
index 1df284fb9fe2c..85c14d1e0ac04 100644
--- a/llvm/test/CodeGen/X86/add-sub-bool.ll
+++ b/llvm/test/CodeGen/X86/add-sub-bool.ll
@@ -392,30 +392,27 @@ define i32 @test_i32_add_add_commute_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwin
 define i64 @test_i64_add_add_var(i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
 ; X86-LABEL: test_i64_add_add_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    shrl %cl, %edi
-; X86-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    jne .LBB15_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:  .LBB15_2:
-; X86-NEXT:    andl $1, %edi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_i64_add_add_var:

diff  --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 023fb5065b892..7070848e3fe3e 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1039,14 +1039,14 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movq 8(%rdi), %rax
-; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq 8(%rdi), %rdx
 ; SSE2-NEXT:    movl %esi, %ecx
 ; SSE2-NEXT:    andb $32, %cl
-; SSE2-NEXT:    shrdq %cl, %rax, %rdx
+; SSE2-NEXT:    shrq %cl, %rdx
 ; SSE2-NEXT:    shrq %cl, %rax
 ; SSE2-NEXT:    testb $64, %sil
-; SSE2-NEXT:    cmoveq %rdx, %rax
+; SSE2-NEXT:    cmovneq %rdx, %rax
 ; SSE2-NEXT:    btcl %esi, %eax
 ; SSE2-NEXT:    andl $96, %esi
 ; SSE2-NEXT:    shrl $3, %esi
@@ -1057,14 +1057,14 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    # kill: def $esi killed $esi def $rsi
 ; SSE4-NEXT:    movdqa (%rdi), %xmm0
-; SSE4-NEXT:    pextrq $1, %xmm0, %rax
-; SSE4-NEXT:    movq %xmm0, %rdx
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    pextrq $1, %xmm0, %rdx
 ; SSE4-NEXT:    movl %esi, %ecx
 ; SSE4-NEXT:    andb $32, %cl
-; SSE4-NEXT:    shrdq %cl, %rax, %rdx
+; SSE4-NEXT:    shrq %cl, %rdx
 ; SSE4-NEXT:    shrq %cl, %rax
 ; SSE4-NEXT:    testb $64, %sil
-; SSE4-NEXT:    cmoveq %rdx, %rax
+; SSE4-NEXT:    cmovneq %rdx, %rax
 ; SSE4-NEXT:    btcl %esi, %eax
 ; SSE4-NEXT:    andl $96, %esi
 ; SSE4-NEXT:    shrl $3, %esi
@@ -1075,14 +1075,14 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX-NEXT:    vmovq %xmm0, %rdx
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $32, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX-NEXT:    movl %esi, %edx
+; AVX-NEXT:    andb $32, %dl
+; AVX-NEXT:    shrxq %rdx, %rcx, %rcx
+; AVX-NEXT:    shrxq %rdx, %rax, %rax
 ; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
+; AVX-NEXT:    cmovneq %rcx, %rax
 ; AVX-NEXT:    btcl %esi, %eax
 ; AVX-NEXT:    andl $96, %esi
 ; AVX-NEXT:    shrl $3, %esi
@@ -1386,12 +1386,12 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; SSE-NEXT:    xorq (%rdi), %r9
 ; SSE-NEXT:    movl %edx, %ecx
 ; SSE-NEXT:    andb $32, %cl
-; SSE-NEXT:    movq %r9, %rax
-; SSE-NEXT:    shrdq %cl, %rsi, %rax
-; SSE-NEXT:    movq %rsi, %r11
+; SSE-NEXT:    movq %rsi, %rax
+; SSE-NEXT:    shrq %cl, %rax
+; SSE-NEXT:    movq %r9, %r11
 ; SSE-NEXT:    shrq %cl, %r11
 ; SSE-NEXT:    testb $64, %dl
-; SSE-NEXT:    cmoveq %rax, %r11
+; SSE-NEXT:    cmovneq %rax, %r11
 ; SSE-NEXT:    btl %edx, %r11d
 ; SSE-NEXT:    setae %al
 ; SSE-NEXT:    orq %r10, %rsi
@@ -1415,23 +1415,22 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; AVX2-NEXT:    xorl %r11d, %r11d
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    shldq %cl, %r10, %r11
-; AVX2-NEXT:    shlxq %rax, %r10, %r10
+; AVX2-NEXT:    shlxq %rax, %r10, %rcx
 ; AVX2-NEXT:    testb $64, %al
-; AVX2-NEXT:    cmovneq %r10, %r11
-; AVX2-NEXT:    cmovneq %r9, %r10
+; AVX2-NEXT:    cmovneq %rcx, %r11
+; AVX2-NEXT:    cmovneq %r9, %rcx
 ; AVX2-NEXT:    xorq 8(%rdi), %rsi
 ; AVX2-NEXT:    xorq (%rdi), %r8
-; AVX2-NEXT:    movl %edx, %ecx
-; AVX2-NEXT:    andb $32, %cl
-; AVX2-NEXT:    movq %r8, %rax
-; AVX2-NEXT:    shrdq %cl, %rsi, %rax
-; AVX2-NEXT:    shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    andb $32, %al
+; AVX2-NEXT:    shrxq %rax, %rsi, %r9
+; AVX2-NEXT:    shrxq %rax, %r8, %rax
 ; AVX2-NEXT:    testb $64, %dl
-; AVX2-NEXT:    cmoveq %rax, %rcx
-; AVX2-NEXT:    btl %edx, %ecx
+; AVX2-NEXT:    cmovneq %r9, %rax
+; AVX2-NEXT:    btl %edx, %eax
 ; AVX2-NEXT:    setae %al
 ; AVX2-NEXT:    orq %r11, %rsi
-; AVX2-NEXT:    orq %r10, %r8
+; AVX2-NEXT:    orq %rcx, %r8
 ; AVX2-NEXT:    movq %r8, (%rdi)
 ; AVX2-NEXT:    movq %rsi, 8(%rdi)
 ; AVX2-NEXT:    retq
@@ -1451,23 +1450,22 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; AVX512-NEXT:    xorl %r11d, %r11d
 ; AVX512-NEXT:    movl %eax, %ecx
 ; AVX512-NEXT:    shldq %cl, %r9, %r11
-; AVX512-NEXT:    shlxq %rax, %r9, %r9
+; AVX512-NEXT:    shlxq %rax, %r9, %rcx
 ; AVX512-NEXT:    testb $64, %al
-; AVX512-NEXT:    cmovneq %r9, %r11
-; AVX512-NEXT:    cmovneq %r10, %r9
+; AVX512-NEXT:    cmovneq %rcx, %r11
+; AVX512-NEXT:    cmovneq %r10, %rcx
 ; AVX512-NEXT:    xorq 8(%rdi), %rsi
 ; AVX512-NEXT:    xorq (%rdi), %r8
-; AVX512-NEXT:    movl %edx, %ecx
-; AVX512-NEXT:    andb $32, %cl
-; AVX512-NEXT:    movq %r8, %rax
-; AVX512-NEXT:    shrdq %cl, %rsi, %rax
-; AVX512-NEXT:    shrxq %rcx, %rsi, %rcx
+; AVX512-NEXT:    movl %edx, %eax
+; AVX512-NEXT:    andb $32, %al
+; AVX512-NEXT:    shrxq %rax, %rsi, %r9
+; AVX512-NEXT:    shrxq %rax, %r8, %rax
 ; AVX512-NEXT:    testb $64, %dl
-; AVX512-NEXT:    cmoveq %rax, %rcx
-; AVX512-NEXT:    btl %edx, %ecx
+; AVX512-NEXT:    cmovneq %r9, %rax
+; AVX512-NEXT:    btl %edx, %eax
 ; AVX512-NEXT:    setae %al
 ; AVX512-NEXT:    orq %r11, %rsi
-; AVX512-NEXT:    orq %r9, %r8
+; AVX512-NEXT:    orq %rcx, %r8
 ; AVX512-NEXT:    movq %r8, (%rdi)
 ; AVX512-NEXT:    movq %rsi, 8(%rdi)
 ; AVX512-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index fd58efc235b1f..274b31173e6ab 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -468,22 +468,22 @@ define i1 @scalar_i128_lowestbit_eq(i128 %x, i128 %y) nounwind {
 ; X64-BMI1:       # %bb.0:
 ; X64-BMI1-NEXT:    movl %edx, %ecx
 ; X64-BMI1-NEXT:    andb $32, %cl
-; X64-BMI1-NEXT:    shrdq %cl, %rsi, %rdi
 ; X64-BMI1-NEXT:    shrq %cl, %rsi
+; X64-BMI1-NEXT:    shrq %cl, %rdi
 ; X64-BMI1-NEXT:    testb $64, %dl
-; X64-BMI1-NEXT:    cmoveq %rdi, %rsi
-; X64-BMI1-NEXT:    btl %edx, %esi
+; X64-BMI1-NEXT:    cmovneq %rsi, %rdi
+; X64-BMI1-NEXT:    btl %edx, %edi
 ; X64-BMI1-NEXT:    setae %al
 ; X64-BMI1-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: scalar_i128_lowestbit_eq:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movl %edx, %ecx
-; X64-BMI2-NEXT:    andb $32, %cl
-; X64-BMI2-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-BMI2-NEXT:    shrxq %rcx, %rsi, %rax
+; X64-BMI2-NEXT:    movl %edx, %eax
+; X64-BMI2-NEXT:    andb $32, %al
+; X64-BMI2-NEXT:    shrxq %rax, %rsi, %rcx
+; X64-BMI2-NEXT:    shrxq %rax, %rdi, %rax
 ; X64-BMI2-NEXT:    testb $64, %dl
-; X64-BMI2-NEXT:    cmoveq %rdi, %rax
+; X64-BMI2-NEXT:    cmovneq %rcx, %rax
 ; X64-BMI2-NEXT:    btl %edx, %eax
 ; X64-BMI2-NEXT:    setae %al
 ; X64-BMI2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index 2ef5def9c0fd8..2662767b9e2db 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -877,11 +877,11 @@ define i1 @pow2_and_i128(i128 %num, i128 %shift) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edx, %ecx
 ; CHECK-NEXT:    andb $32, %cl
-; CHECK-NEXT:    shrdq %cl, %rsi, %rdi
 ; CHECK-NEXT:    shrq %cl, %rsi
+; CHECK-NEXT:    shrq %cl, %rdi
 ; CHECK-NEXT:    testb $64, %dl
-; CHECK-NEXT:    cmoveq %rdi, %rsi
-; CHECK-NEXT:    btl %edx, %esi
+; CHECK-NEXT:    cmovneq %rsi, %rdi
+; CHECK-NEXT:    btl %edx, %edi
 ; CHECK-NEXT:    setae %al
 ; CHECK-NEXT:    retq
   %mask = shl nuw i128 1, %shift


        


More information about the llvm-commits mailing list