[llvm] [X86] Add rewrite pattern for SSE41/AVX1 roundss/sd + blendps/pd (PR #172056)

via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 12 09:54:33 PST 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-x86

Author: Gergo Stomfai (stomfaig)

<details>
<summary>Changes</summary>

Due to a previous PR (https://github.com/llvm/llvm-project/pull/171227), intrinsics such as `_mm_ceil_sd` now compile to suboptimal assembly:
```asm
roundsd xmm1, xmm1, 10
blendpd xmm0, xmm1, 1
```
This PR introduces a rewrite pattern that mitigates this by fusing the corresponding operations.
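
For reference, a minimal reproducer along these lines (the helper name is mine) should show the sequence above when built with `-msse4.1` on a target without AVX:

```c
#include <smmintrin.h>

// Round the low lane of b up and merge it into a; the upper lane is taken
// from a. Per the description above, this currently lowers to the
// roundsd + blendpd pair instead of a single roundsd.
__m128d ceil_low(__m128d a, __m128d b) {
  return _mm_ceil_sd(a, b);
}
```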

However, note that since `ROUNDSSri_INT` is defined via:

https://github.com/llvm/llvm-project/blob/26ff16663777fc995e8c6b46fa2433610dab4f64/llvm/lib/Target/X86/X86InstrSSE.td#L5692C1-L5694C63

in some cases we still end up with two instructions, for example (see also the diff):

```asm
roundsd $9, %xmm0, %xmm1
movapd %xmm1, %xmm0
```

I propose rewriting the definition of `ROUNDSSri_INT` (or adding another record, maybe `ROUNDSS_rri_INT`) to something like
```
defm ROUND  : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>, VVVV;
```
but I would like to discuss this first before implementing it.



---
Full diff: https://github.com/llvm/llvm-project/pull/172056.diff


2 Files Affected:

- (modified) llvm/lib/Target/X86/X86InstrSSE.td (+25) 
- (modified) llvm/test/CodeGen/X86/vec_floor.ll (+12-16) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e4aaa1e1b594a..6c6e8386e4b58 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5707,6 +5707,31 @@ let Predicates = [UseSSE41, OptForSize] in {
             (ROUNDSDmi addr:$src1, timm:$src2)>;
 }
 
+multiclass scalar_unary_math_patterns_with_immediate<
+    SDPatternOperator OpNode, string OpcPrefix, SDNode Move, ValueType VT> {
+  let Predicates = [UseSSE41] in {
+    def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+                                    (extractelt VT:$src, (i64 0)),
+                                    i32:$imm)))),
+              (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src,
+                  i32:$imm)>;
+  }
+
+  // Repeat for AVX versions of the instructions.
+  let Predicates = [UseAVX] in {
+    def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+                                    (extractelt VT:$src, (i64 0)),
+                                    i32:$imm)))),
+              (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src,
+                  i32:$imm)>;
+  }
+}
+
+defm : scalar_unary_math_patterns_with_immediate<X86any_VRndScale, "ROUNDSS",
+                                                 X86Movss, v4f32>;
+defm : scalar_unary_math_patterns_with_immediate<X86any_VRndScale, "ROUNDSD",
+                                                 X86Movsd, v2f64>;
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Packed Bit Test
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 7f4ed3394d10d..ffe493cdac1a8 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -821,14 +821,13 @@ define <4 x float> @const_trunc_v4f32() {
 define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; SSE41-LABEL: floor_ss:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    roundss $9, %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: floor_ss:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vroundss $9, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_ss:
@@ -846,14 +845,13 @@ declare float @llvm.floor.f32(float %s)
 define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; SSE41-LABEL: floor_sd:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    roundsd $9, %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: floor_sd:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vroundsd $9, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_sd:
@@ -1811,14 +1809,13 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
 define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; SSE41-LABEL: ceil_ss:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    roundss $10, %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: ceil_ss:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vroundss $10, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_ss:
@@ -1836,14 +1833,13 @@ declare float @llvm.ceil.f32(float %s)
 define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; SSE41-LABEL: ceil_sd:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    roundsd $10, %xmm0, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: ceil_sd:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vroundsd $10, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_sd:

``````````

</details>


https://github.com/llvm/llvm-project/pull/172056

