[llvm] [GISel] funnel shift combiner port from SelectionDAG ISel to GlobalISel (PR #135132)
Axel Sorenson via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 16 01:11:06 PDT 2025
================
@@ -105,3 +105,55 @@ define i16 @test_shl_i48_2(i48 %x, i48 %y) {
%trunc = trunc i48 %shl to i16
ret i16 %trunc
}
+
+define i16 @test_fshl_i32(i32 %x, i32 %_, i32 %y) {
+; RV32-LABEL: test_fshl_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: not a3, a2
+; RV32-NEXT: sll a0, a0, a2
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: srl a1, a1, a3
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_fshl_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: not a3, a2
+; RV64-NEXT: sllw a0, a0, a2
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: srlw a1, a1, a3
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
+
+ %fshl = call i32 @llvm.fshl.i32(i32 %x, i32 %_, i32 %y)
+ %shl = shl i32 %x, %y
+ %or = or i32 %fshl, %shl
+ %trunc = trunc i32 %or to i16
+ ret i16 %trunc
+}
+
+define i16 @test_fshr_i32(i32 %_, i32 %x, i32 %y) {
+; RV32-LABEL: test_fshr_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: not a3, a2
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: sll a0, a0, a3
+; RV32-NEXT: srl a1, a1, a2
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_fshr_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: not a3, a2
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: sllw a0, a0, a3
+; RV64-NEXT: srlw a1, a1, a2
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
+
+ %fshr = call i32 @llvm.fshr.i32(i32 %_, i32 %x, i32 %y)
+ %lshr = lshr i32 %x, %y
+ %or = or i32 %fshr, %lshr
+ %trunc = trunc i32 %or to i16
+ ret i16 %trunc
+}
----------------
axelcool1234 wrote:
> That's going to be true in either case. As long as it actually compiles, it's not a problem and gives a point of reference for whenever the code improves for illegal types.
Before I go ahead and add i48 funnel shift calls to the original tests (in order to also cover the multi-use case and verify that the intermediate instructions are still emitted), I want to clarify what I meant. Below are two tests, one using an i32 fshl and one using an i48 fshl. The difference in the generated code (via `update_llc_test_checks`) is quite large; I assume that isn't much of a concern here? Once I add these calls to the original tests, their generated check lines will also explode in size.
```
define i32 @test_fshl_i32(i32 %x, i32 %y) {
; RV32-LABEL: test_fshl_i32:
; RV32: # %bb.0:
; RV32-NEXT: not a2, a1
; RV32-NEXT: sll a0, a0, a1
; RV32-NEXT: li a1, 8
; RV32-NEXT: srl a1, a1, a2
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: test_fshl_i32:
; RV64: # %bb.0:
; RV64-NEXT: not a2, a1
; RV64-NEXT: sllw a0, a0, a1
; RV64-NEXT: li a1, 8
; RV64-NEXT: srlw a1, a1, a2
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: ret
%fshl = call i32 @llvm.fshl.i32(i32 %x, i32 16, i32 %y)
ret i32 %fshl
}
define i48 @test_fshl_i48(i48 %x, i48 %y) {
; RV32-LABEL: test_fshl_i48:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: .cfi_offset s1, -12
; RV32-NEXT: .cfi_offset s2, -16
; RV32-NEXT: mv s1, a0
; RV32-NEXT: mv s0, a1
; RV32-NEXT: mv a0, a2
; RV32-NEXT: li s2, 47
; RV32-NEXT: slli a1, a3, 16
; RV32-NEXT: srli a1, a1, 16
; RV32-NEXT: li a2, 48
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __umoddi3
; RV32-NEXT: li a2, 32
; RV32-NEXT: bltu a0, a2, .LBB2_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 0
; RV32-NEXT: sll a4, s1, a0
; RV32-NEXT: sub a3, s2, a0
; RV32-NEXT: bnez a0, .LBB2_3
; RV32-NEXT: j .LBB2_4
; RV32-NEXT: .LBB2_2:
; RV32-NEXT: sll a1, s1, a0
; RV32-NEXT: neg a3, a0
; RV32-NEXT: srl a3, s1, a3
; RV32-NEXT: sll a4, s0, a0
; RV32-NEXT: or a4, a3, a4
; RV32-NEXT: sub a3, s2, a0
; RV32-NEXT: beqz a0, .LBB2_4
; RV32-NEXT: .LBB2_3:
; RV32-NEXT: mv s0, a4
; RV32-NEXT: .LBB2_4:
; RV32-NEXT: li a0, 8
; RV32-NEXT: bltu a3, a2, .LBB2_6
; RV32-NEXT: # %bb.5:
; RV32-NEXT: li a2, 0
; RV32-NEXT: bnez a3, .LBB2_7
; RV32-NEXT: j .LBB2_8
; RV32-NEXT: .LBB2_6:
; RV32-NEXT: srl a2, a0, a3
; RV32-NEXT: beqz a3, .LBB2_8
; RV32-NEXT: .LBB2_7:
; RV32-NEXT: mv a0, a2
; RV32-NEXT: .LBB2_8:
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: mv a1, s0
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_fshl_i48:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -32
; RV64-NEXT: .cfi_def_cfa_offset 32
; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: mv s0, a0
; RV64-NEXT: li s1, 47
; RV64-NEXT: slli a0, a1, 16
; RV64-NEXT: srli a0, a0, 16
; RV64-NEXT: li a1, 48
; RV64-NEXT: call __umoddi3
; RV64-NEXT: subw s1, s1, a0
; RV64-NEXT: sll a0, s0, a0
; RV64-NEXT: li a1, 8
; RV64-NEXT: srl a1, a1, s1
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: addi sp, sp, 32
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%fshl = call i48 @llvm.fshl.i48(i48 %x, i48 16, i48 %y)
ret i48 %fshl
}
```
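For context on why the i48 checks balloon: 48 isn't a power of two, so the modulo on the shift amount can't be lowered to a simple mask and instead becomes a `__umoddi3` libcall, and on RV32 the 48-bit shifts additionally legalize into branchy multi-register shift sequences. Roughly, the expansion computes something like this (my own paraphrase in IR, not the legalizer's literal output):
```
; Paraphrase of the generic fshl expansion for a non-power-of-two width.
define i48 @fshl_i48_expanded(i48 %x, i48 %y, i48 %z) {
  %amt = urem i48 %z, 48    ; 48 is not a power of two -> __umoddi3 libcall
  %inv = sub i48 47, %amt
  %hi  = shl i48 %x, %amt
  %pre = lshr i48 %y, 1     ; pre-shift keeps the second amount in [0, 47]
  %lo  = lshr i48 %pre, %inv
  %res = or i48 %hi, %lo
  ret i48 %res
}
```
With `%y` being the constant 16 in the test above, `%pre` folds to 8, which is where the `li a1, 8` / `li a0, 8` in the generated checks comes from.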
> you also wouldn't need to think too hard about that if reusing the original test, which I hope covers some vectors
There aren't any vector tests here, but I'll gladly add some myself!
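Something along these lines is what I have in mind (an untested sketch mirroring the scalar multi-use tests):
```
define <4 x i32> @test_fshl_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
  ; same multi-use pattern as the scalar tests, just vectorized
  %fshl = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  %shl = shl <4 x i32> %x, %z
  %or = or <4 x i32> %fshl, %shl
  ret <4 x i32> %or
}
```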
https://github.com/llvm/llvm-project/pull/135132
More information about the llvm-commits mailing list