[llvm] [LLVM] treat `@llvm.ssub.sat` the same as `@llvm.aarch64.neon.sqsub` (PR #140454)

Sun May 18 07:54:52 PDT 2025

folkertdev wrote:

Yeah, two tests fail in `llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll`; I believe this is a regression:

```llvm
define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
; CHECK-LABEL: optimize_dup:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
; CHECK-NEXT:    fmls v0.4s, v2.4s, v3.s[3]
; CHECK-NEXT:    ret
entry:
  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
	%1 = fmul <4 x float> %lane2, %c
	%s = fsub <4 x float> %0, %1
  ret <4 x float> %s
}

define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
; CHECK-LABEL: no_optimize_dup:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fmla v0.4s, v1.4s, v3.s[3]
; CHECK-NEXT:    fmls v0.4s, v2.4s, v3.s[1]
; CHECK-NEXT:    ret
entry:
  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
	%1 = fmul <4 x float> %lane2, %c
	%s = fsub <4 x float> %0, %1
  ret <4 x float> %s
}
```

```
        3738:  .type optimize_dup,@function 
        3739: optimize_dup: // @optimize_dup 
        3740:  .cfi_startproc 
        3741: // %bb.0: // %entry 
        3742:  fmul v2.4s, v2.4s, v3.s[3] 
        3743:  fmla v0.4s, v1.4s, v3.s[3] 
next:4507      !~~~~~~~~~~~~~~~~~~~~~~~~~  error: match on wrong line
        3744:  fsub v0.4s, v0.4s, v2.4s 
        3745:  ret 
        3746: .Lfunc_end298: 
        3747:  .size optimize_dup, .Lfunc_end298-optimize_dup 
        3748:  .cfi_endproc 
        3749:  // -- End function 
        3750:  .globl no_optimize_dup // -- Begin function no_optimize_dup 
        3751:  .p2align 2 
        3752:  .type no_optimize_dup,@function 
        3753: no_optimize_dup: // @no_optimize_dup 
        3754:  .cfi_startproc 
        3755: // %bb.0: // %entry 
        3756:  fmul v2.4s, v2.4s, v3.s[1] 
        3757:  fmla v0.4s, v1.4s, v3.s[3] 
next:4522      !~~~~~~~~~~~~~~~~~~~~~~~~~  error: match on wrong line
        3758:  fsub v0.4s, v0.4s, v2.4s 
        3759:  ret 
        3760: .Lfunc_end299: 
        3761:  .size no_optimize_dup, .Lfunc_end299-no_optimize_dup 
        3762:  .cfi_endproc 
```

It looks like the test expects the `fmul` + `fsub` pair to be fused into a single `fmls`, and with these changes they no longer are: a separate `fmul` and `fsub` are emitted instead.

Does that sound right? I guess there is some other rule with fma that we've missed?

https://github.com/llvm/llvm-project/pull/140454