[llvm] [X86] combineConcatVectorOps - require free concatenation of at least one operand of UNPCKL\H (PR #135366)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 11 06:28:36 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
Only concatenate UNPCKL/UNPCKH nodes when at least one operand can be concatenated for free, instead of just replacing 2*UNPCK+INSERT_SUBVECTOR with 2*INSERT_SUBVECTOR+UNPCK.
Currently limited to sub-64-bit element cases until we've accounted for the remaining regressions from some build_vector-style patterns.
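As a rough illustration of the gating logic this patch introduces, here is a minimal standalone C++ sketch; it is not LLVM code, and `SubOp`, `tryFreeConcat`, `forceConcat`, and `combineUnpckl` are hypothetical stand-ins for the patch's `CombineSubOperand`/`ConcatSubOperand` helpers. The point it models: the fold to one wide UNPCKL is only taken when at least one operand pair concatenates for free, or when the wide op is unconditionally cheap.

```cpp
#include <iostream>
#include <optional>
#include <string>

// One UNPCK operand pair to be widened: low and high subvector sources.
struct SubOp {
  std::string Name;
  bool FreelyConcatable; // e.g. both halves extracted from one wider vector
};

// Stand-in for CombineSubOperand: succeeds only when widening is free.
std::optional<std::string> tryFreeConcat(const SubOp &Lo, const SubOp &Hi) {
  if (Lo.FreelyConcatable && Hi.FreelyConcatable)
    return "concat(" + Lo.Name + "," + Hi.Name + ")";
  return std::nullopt;
}

// Stand-in for ConcatSubOperand: always widens, at the cost of extra inserts.
std::string forceConcat(const SubOp &Lo, const SubOp &Hi) {
  return "insert_subvector(" + Lo.Name + "," + Hi.Name + ")";
}

// Gating rule: fold to a wide UNPCKL only if at least one operand pair
// concatenates for free, or the wide op is cheap anyway (mirroring the
// patch's hasInt256 && EltSizeInBits == 64 case).
std::optional<std::string> combineUnpckl(const SubOp &A0, const SubOp &A1,
                                         const SubOp &B0, const SubOp &B1,
                                         bool WideOpAlwaysCheap) {
  auto Concat0 = tryFreeConcat(A0, A1);
  auto Concat1 = tryFreeConcat(B0, B1);
  if (!Concat0 && !Concat1 && !WideOpAlwaysCheap)
    return std::nullopt; // both sides need extra shuffles: keep 2*UNPCK
  return "unpckl(" + (Concat0 ? *Concat0 : forceConcat(A0, A1)) + ", " +
         (Concat1 ? *Concat1 : forceConcat(B0, B1)) + ")";
}

int main() {
  SubOp A0{"a0", true}, A1{"a1", true};   // free to concatenate
  SubOp B0{"b0", false}, B1{"b1", false}; // would need extra inserts
  if (auto N = combineUnpckl(A0, A1, B0, B1, false))
    std::cout << *N << '\n'; // folds: one side was free
  if (auto N = combineUnpckl(B0, B1, B0, B1, false))
    std::cout << *N << '\n'; // not printed: fold rejected
}
```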
---
Patch is 187.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135366.diff
8 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+12-5)
- (modified) llvm/test/CodeGen/X86/avx512fp16-frem.ll (+337-333)
- (modified) llvm/test/CodeGen/X86/vector-half-conversions.ll (+145-141)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll (+14-16)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+66-66)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll (+114-114)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+66-66)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+110-110)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4e94d2cef6bd8..51bb4fcdc88a4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58246,17 +58246,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
case X86ISD::UNPCKL: {
// TODO: UNPCK should use CombineSubOperand
// Don't concatenate build_vector patterns.
- if (!IsSplat && EltSizeInBits >= 32 &&
- ((VT.is256BitVector() && Subtarget.hasInt256()) ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+ if (!IsSplat &&
+ ((VT.is256BitVector() &&
+ (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
none_of(Ops, [](SDValue Op) {
return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
ISD::SCALAR_TO_VECTOR ||
peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
ISD::SCALAR_TO_VECTOR;
})) {
- return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
- ConcatSubOperand(VT, Ops, 1));
+ SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+ SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+ if (Concat0 || Concat1 ||
+ (Subtarget.hasInt256() && EltSizeInBits == 64))
+ return DAG.getNode(Opcode, DL, VT,
+ Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+ Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
}
break;
}
diff --git a/llvm/test/CodeGen/X86/avx512fp16-frem.ll b/llvm/test/CodeGen/X86/avx512fp16-frem.ll
index 57ee3468e196b..1d1bb7649edd5 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-frem.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-frem.ll
@@ -285,7 +285,7 @@ define <8 x half> @frem_vec8(<8 x half> %x, <8 x half> %y) nounwind {
define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
; CHECK-LABEL: frem_vec16:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $184, %rsp
+; CHECK-NEXT: subq $168, %rsp
; CHECK-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -293,7 +293,7 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vzeroupper
@@ -303,7 +303,7 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = mem[3,3,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
@@ -311,48 +311,48 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = mem[3,3,3,3]
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[1,0]
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = mem[1,0]
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[3,3,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
@@ -372,64 +372,64 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = mem[1,1,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = mem[1,1,3,3]
+; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[1,1,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
@@ -447,12 +447,12 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT: addq $184, %rsp
+; CHECK-NEXT: addq $168, %rsp
; CHECK-NEXT: retq
%r = frem <16 x half> %x, %y
ret <16 x half> %r
@@ -461,7 +461,7 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
; CHECK-LABEL: frem_vec32:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $408, %rsp # imm = 0x198
+; CHECK-NEXT: subq $360, %rsp # imm = 0x168
; CHECK-NEXT: vmovupd %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
@@ -475,7 +475,7 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
@@ -484,42 +484,38 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
-; CHECK-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = mem[3,3,3,3]
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[1,0]
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; CHECK-NEXT: vextractf128 $1,...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/135366
More information about the llvm-commits mailing list