[llvm] 0e721b7 - [X86] Add tests showing failure to concat RCPPS + RSQRTPS intrinsics together. (#170098)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 1 03:28:38 PST 2025
Author: Simon Pilgrim
Date: 2025-12-01T11:28:34Z
New Revision: 0e721b75aaa39181c71e798d5a95102eb349bf1c
URL: https://github.com/llvm/llvm-project/commit/0e721b75aaa39181c71e798d5a95102eb349bf1c
DIFF: https://github.com/llvm/llvm-project/commit/0e721b75aaa39181c71e798d5a95102eb349bf1c.diff
LOG: [X86] Add tests showing failure to concat RCPPS + RSQRTPS intrinsics together. (#170098)
Can only do this for 128->256 cases as we can't safely convert to the RCP14/RSQRT14 variants
Added:
llvm/test/CodeGen/X86/combine-rcp.ll
llvm/test/CodeGen/X86/combine-rsqrt.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll
new file mode 100644
index 0000000000000..7de3e96d592db
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rcp.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rcp_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: rcpps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_rcp_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vrcpps %xmm0, %xmm0
+; AVX-NEXT: vrcpps %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; 128-bit rcp.ps on the first argument; selected as (v)rcpps on %xmm0 in the checks above
+ %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) ; 128-bit rcp.ps on the second argument; selected as (v)rcpps on %xmm1 in the checks above
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask 0..7 is a plain concat - %v0 fills lanes 0-3, %v1 fills lanes 4-7
+ ret <8 x float> %res ; AVX targets currently emit two xmm vrcpps plus vinsertf128 rather than a single ymm vrcpps - this is the missed 128->256 concat being recorded
+}
+
+; Ensure we don't convert rcpps to rcp14ps when building a 512-bit result - that conversion isn't safe, so any concat of these calls has to stop at 256 bits.
+define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rcp_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: rcpps %xmm1, %xmm1
+; SSE-NEXT: rcpps %xmm2, %xmm2
+; SSE-NEXT: rcpps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vrcpps %xmm0, %xmm0
+; AVX1OR2-NEXT: vrcpps %xmm1, %xmm1
+; AVX1OR2-NEXT: vrcpps %xmm2, %xmm2
+; AVX1OR2-NEXT: vrcpps %xmm3, %xmm3
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_rcp_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vrcpps %xmm0, %xmm0
+; AVX512-NEXT: vrcpps %xmm1, %xmm1
+; AVX512-NEXT: vrcpps %xmm2, %xmm2
+; AVX512-NEXT: vrcpps %xmm3, %xmm3
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; four independent 128-bit rcp.ps calls follow, one per argument
+ %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; 256-bit concat - %v0 in lanes 0-3, %v1 in lanes 4-7
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; 256-bit concat - %v2 in lanes 0-3, %v3 in lanes 4-7
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; 512-bit concat - %r01 in lanes 0-7, %r23 in lanes 8-15
+ ret <16 x float> %res ; the checks above contain only (v)rcpps on xmm registers, with no rcp14ps, for every RUN configuration
+}
diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll b/llvm/test/CodeGen/X86/combine-rsqrt.ll
new file mode 100644
index 0000000000000..78688701f8cd3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rsqrt_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rsqrtps %xmm0, %xmm0
+; SSE-NEXT: rsqrtps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_rsqrt_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vrsqrtps %xmm0, %xmm0
+; AVX-NEXT: vrsqrtps %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; 128-bit rsqrt.ps on the first argument; selected as (v)rsqrtps on %xmm0 in the checks above
+ %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) ; 128-bit rsqrt.ps on the second argument; selected as (v)rsqrtps on %xmm1 in the checks above
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask 0..7 is a plain concat - %v0 fills lanes 0-3, %v1 fills lanes 4-7
+ ret <8 x float> %res ; AVX targets currently emit two xmm vrsqrtps plus vinsertf128 rather than a single ymm vrsqrtps - this is the missed 128->256 concat being recorded
+}
+
+; Ensure we don't convert rsqrtps to rsqrt14ps when building a 512-bit result - that conversion isn't safe, so any concat of these calls has to stop at 256 bits.
+define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rsqrt_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rsqrtps %xmm0, %xmm0
+; SSE-NEXT: rsqrtps %xmm1, %xmm1
+; SSE-NEXT: rsqrtps %xmm2, %xmm2
+; SSE-NEXT: rsqrtps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vrsqrtps %xmm0, %xmm0
+; AVX1OR2-NEXT: vrsqrtps %xmm1, %xmm1
+; AVX1OR2-NEXT: vrsqrtps %xmm2, %xmm2
+; AVX1OR2-NEXT: vrsqrtps %xmm3, %xmm3
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_rsqrt_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vrsqrtps %xmm0, %xmm0
+; AVX512-NEXT: vrsqrtps %xmm1, %xmm1
+; AVX512-NEXT: vrsqrtps %xmm2, %xmm2
+; AVX512-NEXT: vrsqrtps %xmm3, %xmm3
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; four independent 128-bit rsqrt.ps calls follow, one per argument
+ %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; 256-bit concat - %v0 in lanes 0-3, %v1 in lanes 4-7
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; 256-bit concat - %v2 in lanes 0-3, %v3 in lanes 4-7
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; 512-bit concat - %r01 in lanes 0-7, %r23 in lanes 8-15
+ ret <16 x float> %res ; the checks above contain only (v)rsqrtps on xmm registers, with no rsqrt14ps, for every RUN configuration
+}
More information about the llvm-commits mailing list