[llvm] [X86] Add tests showing failure to concat RCPPS + RSQRTPS intrinsics together. (PR #170098)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 1 02:56:35 PST 2025


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/170098

Can only do this for 128->256 cases as we can't safely convert to the RCP14/RSQRT14 variants

From 0e567870b1e51277ecece534048a686dab49b816 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 1 Dec 2025 10:55:48 +0000
Subject: [PATCH] [X86] Add tests showing failure to concat RCPPS + RSQRTPS
 intrinsics together.

Can only do this for 128->256 cases as we can't safely convert to the RCP14/RSQRT14 variants
---
 llvm/test/CodeGen/X86/combine-rcp.ll   | 65 ++++++++++++++++++++++++++
 llvm/test/CodeGen/X86/combine-rsqrt.ll | 65 ++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/combine-rcp.ll
 create mode 100644 llvm/test/CodeGen/X86/combine-rsqrt.ll

diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll
new file mode 100644
index 0000000000000..7de3e96d592db
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rcp.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rcp_v8f32_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rcpps %xmm0, %xmm0
+; SSE-NEXT:    rcpps %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: concat_rcp_v8f32_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpps %xmm0, %xmm0
+; AVX-NEXT:    vrcpps %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+  %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
+  %res  = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %res
+}
+
+; Ensure we don't convert rcpps to rcp14ps
+define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rcp_v16f32_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rcpps %xmm0, %xmm0
+; SSE-NEXT:    rcpps %xmm1, %xmm1
+; SSE-NEXT:    rcpps %xmm2, %xmm2
+; SSE-NEXT:    rcpps %xmm3, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vrcpps %xmm0, %xmm0
+; AVX1OR2-NEXT:    vrcpps %xmm1, %xmm1
+; AVX1OR2-NEXT:    vrcpps %xmm2, %xmm2
+; AVX1OR2-NEXT:    vrcpps %xmm3, %xmm3
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: concat_rcp_v16f32_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrcpps %xmm0, %xmm0
+; AVX512-NEXT:    vrcpps %xmm1, %xmm1
+; AVX512-NEXT:    vrcpps %xmm2, %xmm2
+; AVX512-NEXT:    vrcpps %xmm3, %xmm3
+; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+  %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
+  %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2)
+  %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3)
+  %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res  = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll b/llvm/test/CodeGen/X86/combine-rsqrt.ll
new file mode 100644
index 0000000000000..78688701f8cd3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rsqrt_v8f32_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rsqrtps %xmm0, %xmm0
+; SSE-NEXT:    rsqrtps %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: concat_rsqrt_v8f32_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrsqrtps %xmm0, %xmm0
+; AVX-NEXT:    vrsqrtps %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+  %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
+  %res  = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %res
+}
+
+; Ensure we don't convert rsqrtps to rsqrt14ps
+define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rsqrt_v16f32_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rsqrtps %xmm0, %xmm0
+; SSE-NEXT:    rsqrtps %xmm1, %xmm1
+; SSE-NEXT:    rsqrtps %xmm2, %xmm2
+; SSE-NEXT:    rsqrtps %xmm3, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vrsqrtps %xmm0, %xmm0
+; AVX1OR2-NEXT:    vrsqrtps %xmm1, %xmm1
+; AVX1OR2-NEXT:    vrsqrtps %xmm2, %xmm2
+; AVX1OR2-NEXT:    vrsqrtps %xmm3, %xmm3
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: concat_rsqrt_v16f32_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vrsqrtps %xmm0, %xmm0
+; AVX512-NEXT:    vrsqrtps %xmm1, %xmm1
+; AVX512-NEXT:    vrsqrtps %xmm2, %xmm2
+; AVX512-NEXT:    vrsqrtps %xmm3, %xmm3
+; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+  %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
+  %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2)
+  %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3)
+  %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res  = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %res
+}



More information about the llvm-commits mailing list