[llvm] [X86] X86DAGToDAGISel - attempt to merge XMM/YMM loads with YMM/ZMM loads of the same ptr (PR #73126)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 22 06:41:08 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
If we are loading the same ptr at different vector widths, then reuse the larger load and just extract the low subvector.
This is mainly useful for better constant sharing.
Unlike the equivalent VBROADCAST_LOAD/SUBV_BROADCAST_LOAD folds which can occur in DAG, we have to wait until DAGISel otherwise we can hit infinite loops if constant folding recreates the original constant value.
---
Patch is 1.26 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73126.diff
37 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+36)
- (modified) llvm/test/CodeGen/X86/avx512-regcall-Mask.ll (+1-3)
- (modified) llvm/test/CodeGen/X86/bfloat.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll (+32-26)
- (modified) llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll (+36-30)
- (modified) llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll (+102-102)
- (modified) llvm/test/CodeGen/X86/constant-pool-sharing.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/insert-into-constant-vector.ll (+20-22)
- (modified) llvm/test/CodeGen/X86/midpoint-int-vec-512.ll (+20-20)
- (modified) llvm/test/CodeGen/X86/pr57340.ll (+31-32)
- (modified) llvm/test/CodeGen/X86/splat-for-size.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/subvector-broadcast.ll (+20-22)
- (modified) llvm/test/CodeGen/X86/vec_fabs.ll (+21-67)
- (modified) llvm/test/CodeGen/X86/vec_int_to_fp.ll (+104-111)
- (modified) llvm/test/CodeGen/X86/vector-fshl-256.ll (+16-16)
- (modified) llvm/test/CodeGen/X86/vector-fshl-512.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-256.ll (+19-20)
- (modified) llvm/test/CodeGen/X86/vector-fshr-256.ll (+167-167)
- (modified) llvm/test/CodeGen/X86/vector-fshr-512.ll (+94-94)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-256.ll (+19-20)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll (+25-25)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll (+115-118)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll (+105-105)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll (+100-102)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll (+134-137)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+573-572)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+2776-2726)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+292-286)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+1192-1206)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+709-708)
- (modified) llvm/test/CodeGen/X86/vector-sext.ll (+6-4)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/viabs.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vselect-avx.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+64-63)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll (+7-5)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+11-10)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 93e184eca9bc515..8b9393580dc45bf 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1036,6 +1036,42 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
break;
}
+ case ISD::LOAD: {
+ // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
+ // load, then just extract the lower subvector and avoid the second load.
+ auto *Ld = cast<LoadSDNode>(N);
+ MVT VT = N->getSimpleValueType(0);
+ if (ISD::isNormalLoad(Ld) && Ld->isSimple() &&
+ (VT.is128BitVector() || VT.is256BitVector())) {
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Chain = Ld->getChain();
+ for (SDNode *User : Ptr->uses()) {
+ auto *UserLd = dyn_cast<LoadSDNode>(N);
+ MVT UserVT = User->getSimpleValueType(0);
+ if (User != N && UserLd && ISD::isNormalLoad(User) &&
+ UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
+ !User->hasAnyUseOfValue(1) &&
+ (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
+ UserVT.getSizeInBits() > VT.getSizeInBits()) {
+ SDLoc dl(N);
+ unsigned NumSubElts =
+ VT.getSizeInBits() / UserVT.getScalarSizeInBits();
+ MVT SubVT = MVT::getVectorVT(UserVT.getScalarType(), NumSubElts);
+ SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
+ SDValue(User, 0),
+ CurDAG->getIntPtrConstant(0, dl));
+ SDValue Res = CurDAG->getBitcast(VT, Extract);
+ --I;
+ SDValue To[] = {Res, SDValue(UserLd, 1)};
+ CurDAG->ReplaceAllUsesWith(N, To);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+ break;
+ }
case ISD::VSELECT: {
// Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index 34a205a7baa8641..b3a0c7dffae117c 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -98,10 +98,8 @@ define dso_local i64 @caller_argv64i1() #0 {
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %edi
; X32-NEXT: subl $88, %esp
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1]
-; X32-NEXT: # xmm0 = mem[0,0]
-; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: vmovups %zmm0, (%esp)
; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
; X32-NEXT: movl $2, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 7a82515ad24b72c..4792e8343d7589f 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1401,9 +1401,9 @@ define <32 x bfloat> @pr63017_2() nounwind {
; AVXNC-NEXT: jne .LBB12_2
; AVXNC-NEXT: # %bb.1: # %cond.load
; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT: vpbroadcastw {{.*#+}} xmm0 = [49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT: .LBB12_2: # %else
; AVXNC-NEXT: xorl %eax, %eax
; AVXNC-NEXT: testb %al, %al
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index a995e93708456e7..adf7fa97b776527 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -209,8 +209,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -255,8 +256,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -303,8 +305,9 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -421,13 +424,15 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
@@ -482,13 +487,15 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
@@ -549,17 +556,16 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i16:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 544d9b21eca7b9a..50c132b6c34de98 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -265,8 +265,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -324,8 +325,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -385,8 +387,9 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -544,15 +547,17 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -623,15 +628,17 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -705,22 +712,21 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpsrlw $15, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index b69d22e04d7deb7..13f1451bbc8b027 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -123,12 +123,12 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i16:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i16:
@@ -140,12 +140,12 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
;
; AVX-64-LABEL: f32xi8_i16:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm1, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i16:
@@ -245,12 +245,13 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i128:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i128:
@@ -263,12 +264,13 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
;
; AVX-64-LABEL: f32xi8_i128:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: # ymm1 = m...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/73126
More information about the llvm-commits
mailing list