[llvm] FIXME: Revert "Revert "[X86] combineBROADCAST_LOAD - merge across chains"" (PR #128381)
Vitaly Buka via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 22 16:16:18 PST 2025
https://github.com/vitalybuka created https://github.com/llvm/llvm-project/pull/128381
Reverts llvm/llvm-project#128380
>From dde78a2e84edc0fe987266a7fcae774dd36532c3 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at gmail.com>
Date: Sat, 22 Feb 2025 16:15:48 -0800
Subject: [PATCH] Revert "Revert "[X86] combineBROADCAST_LOAD - merge across
chains" (#128380)"
This reverts commit 50b0669e8468279518ae0be27c8b6a134c4d95d1.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 16 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 193 ++++++++----------
...d_vector_inreg_of_broadcast_from_memory.ll | 74 +++----
3 files changed, 128 insertions(+), 155 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2bc76d3814792..a4357197e2843 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59360,21 +59360,14 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
}
-// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
-// from. Limit this to cases where the loads have the same input chain and the
-// output chains are unused. This avoids any memory ordering issues.
+// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
"Unknown broadcast load type");
- // Only do this if the chain result is unused.
- if (N->hasAnyUseOfValue(1))
- return SDValue();
-
auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
-
SDValue Ptr = MemIntrin->getBasePtr();
SDValue Chain = MemIntrin->getChain();
EVT VT = N->getSimpleValueType(0);
@@ -59388,12 +59381,15 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
MemVT.getSizeInBits() &&
- !User->hasAnyUseOfValue(1) &&
User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
+ assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
+ MemIntrin->isSimple() && "Illegal broadcast load type");
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
VT.getSizeInBits());
Extract = DAG.getBitcast(VT, Extract);
- return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
+ DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), Extract.getValue(1));
+ return Extract;
}
return SDValue();
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index d617cfb6aedee..ca589e63b671a 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1888,15 +1888,14 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
;
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2112,15 +2111,14 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
;
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2237,33 +2235,29 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
;
; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
+; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
+; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -2272,9 +2266,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2339,15 +2332,14 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
;
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2462,33 +2454,29 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
-; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
+; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
+; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -2497,9 +2485,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2788,14 +2775,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
;
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2990,14 +2976,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -3108,27 +3093,25 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
;
; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index dbd9df36239bd..7f206fd3bf2aa 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1906,7 +1906,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1
; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
@@ -1922,7 +1921,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
@@ -1933,14 +1931,13 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
;
; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
@@ -2891,46 +2888,43 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
;
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
More information about the llvm-commits
mailing list