[llvm] [X86] Add rewrite pattern for SSE41/AVX1 roundss/sd + blendps/pd (PR #172056)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 09:54:33 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Gergo Stomfai (stomfaig)
<details>
<summary>Changes</summary>
Due to a previous PR (https://github.com/llvm/llvm-project/pull/171227), operations like `_mm_ceil_sd` compile to suboptimal assembly:
```asm
roundsd xmm1, xmm1, 10
blendpd xmm0, xmm1, 1
```
This PR introduces a rewrite pattern to mitigate this, and fuse the corresponding operations.
However, note that since `ROUNDSSri_INT` is defined via:
https://github.com/llvm/llvm-project/blob/26ff16663777fc995e8c6b46fa2433610dab4f64/llvm/lib/Target/X86/X86InstrSSE.td#L5692C1-L5694C63
in some cases we still end up with two instructions, for example (see also the diff below):
```asm
roundsd $9, %xmm0, %xmm1
movapd %xmm1, %xmm0
```
I propose rewriting the definition of `ROUNDSSri_INT` (or adding another record, maybe `ROUNDSS_rri_INT`) to
```
defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales, 0>, VVVV;
```
but I would like to discuss this first before implementing it.
---
Full diff: https://github.com/llvm/llvm-project/pull/172056.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86InstrSSE.td (+25)
- (modified) llvm/test/CodeGen/X86/vec_floor.ll (+12-16)
``````````diff
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e4aaa1e1b594a..6c6e8386e4b58 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5707,6 +5707,31 @@ let Predicates = [UseSSE41, OptForSize] in {
(ROUNDSDmi addr:$src1, timm:$src2)>;
}
+multiclass scalar_unary_math_patterns_with_immediate<
+ SDPatternOperator OpNode, string OpcPrefix, SDNode Move, ValueType VT> {
+ let Predicates = [UseSSE41] in {
+ def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+ (extractelt VT:$src, (i64 0)),
+ i32:$imm)))),
+ (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src,
+ i32:$imm)>;
+ }
+
+ // Repeat for AVX versions of the instructions.
+ let Predicates = [UseAVX] in {
+ def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode
+ (extractelt VT:$src, (i64 0)),
+ i32:$imm)))),
+ (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src,
+ i32:$imm)>;
+ }
+}
+
+defm : scalar_unary_math_patterns_with_immediate<X86any_VRndScale, "ROUNDSS",
+ X86Movss, v4f32>;
+defm : scalar_unary_math_patterns_with_immediate<X86any_VRndScale, "ROUNDSD",
+ X86Movsd, v2f64>;
+
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 7f4ed3394d10d..ffe493cdac1a8 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -821,14 +821,13 @@ define <4 x float> @const_trunc_v4f32() {
define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: floor_ss:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundss $9, %xmm0, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: roundss $9, %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_ss:
; AVX: ## %bb.0:
-; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vroundss $9, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_ss:
@@ -846,14 +845,13 @@ declare float @llvm.floor.f32(float %s)
define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: floor_sd:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: roundsd $9, %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_sd:
; AVX: ## %bb.0:
-; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vroundsd $9, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_sd:
@@ -1811,14 +1809,13 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
; SSE41-LABEL: ceil_ss:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundss $10, %xmm0, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: roundss $10, %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_ss:
; AVX: ## %bb.0:
-; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vroundss $10, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_ss:
@@ -1836,14 +1833,13 @@ declare float @llvm.ceil.f32(float %s)
define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
; SSE41-LABEL: ceil_sd:
; SSE41: ## %bb.0:
-; SSE41-NEXT: roundsd $10, %xmm0, %xmm0
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: roundsd $10, %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_sd:
; AVX: ## %bb.0:
-; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vroundsd $10, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_sd:
``````````
</details>
https://github.com/llvm/llvm-project/pull/172056
More information about the llvm-commits
mailing list