[llvm] fdbc30a - [X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of widen Build Vector (#135010)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 6 00:25:14 PDT 2025
Author: Rohit Aggarwal
Date: 2025-05-06T15:25:10+08:00
New Revision: fdbc30a383973d89d738283e733ba0db98df6a77
URL: https://github.com/llvm/llvm-project/commit/fdbc30a383973d89d738283e733ba0db98df6a77
DIFF: https://github.com/llvm/llvm-project/commit/fdbc30a383973d89d738283e733ba0db98df6a77.diff
LOG: [X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of widen Build Vector (#135010)
I am working on a problem in which a kernel is SLP-vectorized, leading to the
generation of insertelements followed by a zext. On lowering, the assembly
looks like the following:
vmovd %r9d, %xmm0
vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
vpmovzxbw %xmm0, %xmm0 # xmm0 =
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
vpmaddwd (%rdx), %xmm0, %xmm0
After all the vpinsrb instructions, xmm0 looks like
xmm0 = xmm0[0],xmm0[1],xmm0[2],xmm0[3],xmm0[4],xmm0[5],xmm0[6],xmm0[7],zero,zero,zero,zero,zero,zero,zero,zero
Here vpmovzxbw performs the extension from i8 to i16, but it is an expensive
operation that I want to remove.
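For reference, here is a reduced IR sketch of the kind of input that produces
this pattern (hypothetical function name @dot_sketch, cut down to four elements
for brevity; the real test cases live in buildvec-widen-dotproduct.ll below):
scalar loads feed insertelements, and the resulting vector is zero-extended
before the multiply.
; Hypothetical reduced example, not part of the patch.
define i32 @dot_sketch(ptr %a, ptr %b) {
entry:
  %e0 = load i8, ptr %a, align 1
  %a1 = getelementptr inbounds i8, ptr %a, i64 1
  %e1 = load i8, ptr %a1, align 1
  %a2 = getelementptr inbounds i8, ptr %a, i64 2
  %e2 = load i8, ptr %a2, align 1
  %a3 = getelementptr inbounds i8, ptr %a, i64 3
  %e3 = load i8, ptr %a3, align 1
  ; SLP output: a vector built from the scalar loads, then zero-extended.
  %v0 = insertelement <4 x i8> poison, i8 %e0, i64 0
  %v1 = insertelement <4 x i8> %v0, i8 %e1, i64 1
  %v2 = insertelement <4 x i8> %v1, i8 %e2, i64 2
  %v3 = insertelement <4 x i8> %v2, i8 %e3, i64 3
  %ext = zext <4 x i8> %v3 to <4 x i32>
  %bvec = load <4 x i32>, ptr %b, align 4
  %mul = mul <4 x i32> %ext, %bvec
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
  ret i32 %sum
}
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)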
Optimization:
Place the values in the correct locations while inserting so that the zext can
be avoided.
While lowering, we can write a custom lowerOperation for the
zero_extend_vector_inreg opcode and override the current default handling with
the custom one in the legalization step.
The proposed changes are shown below:
vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
vpinsrb $a, (%rdi,%rax), %xmm0, %xmm0
vpinsrb $c, (%rdi,%rcx,2), %xmm0, %xmm0
vpinsrb $e, (%rdi,%r8), %xmm0, %xmm0 # xmm0 =
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
vpmaddwd (%rdx), %xmm0, %xmm0
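Conceptually (sketched here in IR for readability; the actual fold in this
patch operates on SelectionDAG nodes during combineZext), zero-extending v8i8
to v8i16 on a little-endian target such as x86 is equivalent to interleaving
the source bytes with zeros in a widened v16i8 vector and bitcasting the
result. That is why placing the inserted bytes at even offsets makes the
vpmovzxbw redundant. A minimal sketch of the equivalence (hypothetical function
name @widen_equiv):
; Hypothetical illustration, not part of the patch; assumes little-endian.
define <8 x i16> @widen_equiv(<8 x i8> %v) {
  ; Interleave with zeros: byte 2*i holds element i, byte 2*i+1 holds the
  ; zero that becomes the high byte of the extended i16.
  %w = shufflevector <8 x i8> %v, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ; Reinterpreting the 16 bytes as 8 x i16 gives the same value as
  ; zext <8 x i8> %v to <8 x i16> on a little-endian target.
  %r = bitcast <16 x i8> %w to <8 x i16>
  ret <8 x i16> %r
}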
More details are in the Discourse topic:
https://discourse.llvm.org/t/improve-the-gathering-of-the-elements-so-that-unwanted-ext-operations-can-be-avoided/85443
---------
Co-authored-by: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3d9c76f3d05f5..cd1bbb8fbb7b7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55934,6 +55934,79 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
NegN2);
}
+// Try to widen the build vector and bitcast it to the type of the zext.
+// This is a special case for the 128-bit vector types. The intention is to
+// remove the zext and replace it with a bitcast to the wider type. While
+// lowering, the bitcast is removed and the extra computation due to the zext
+// is avoided.
+// For example:
+// zext v4i16 (v4i8 build_vector (x, y, z, w)) ->
+//   bitcast v4i16 (v8i8 build_vector (x, 0, y, 0, z, 0, w, 0))
+static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
+
+ if (Extend->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ EVT ExtendVT = Extend->getValueType(0);
+
+ SDValue BV = Extend->getOperand(0);
+ if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
+ return SDValue();
+
+ if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
+ // If the build vector has undef elements, we cannot widen it.
+ // The widening would create a vector with more undef elements, which
+ // is not valid.
+ return SDValue();
+ }
+
+ if (!all_of(BV->op_values(),
+ [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
+    // If the build vector has any element other than an ISD::LOAD, we cannot
+    // widen it.
+ return SDValue();
+ }
+
+ SDLoc dl(BV);
+ EVT VT = BV.getValueType();
+ EVT EltVT = BV.getOperand(0).getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+ TargetLowering::TypeWidenVector)
+ return SDValue();
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
+ assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+ // Fill the new elements with Zero.
+ NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
+ // Compute the step to place the elements in the right place and control the
+ // iteration.
+ unsigned step = WidenNumElts / NumElts;
+ if (WidenVT.is128BitVector()) {
+ if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
+ for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
+ i--, j -= step) {
+ SDValue temp = NewOps[i];
+ NewOps[i] = NewOps[j];
+ NewOps[j] = temp;
+ }
+ // Create new build vector with WidenVT and NewOps
+ SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
+ // Replace the old build vector with the new one. Bitcast the
+ // new build vector to the type of the zext.
+ SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
+ return NewBV;
+ }
+ }
+ return SDValue();
+}
+
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -55993,6 +56066,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = widenBuildVec(N, DAG))
+ return V;
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
index 8c85dfa09fd2d..345014edd0e9d 100644
--- a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
+++ b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
@@ -31,88 +31,62 @@ define i32 @dot_ext_v8i8_v8i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-NEXT: pinsrw $6, %r9d, %xmm0
; SSE2-NEXT: pinsrw $7, %esi, %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
-; SSE2-NEXT: pmaddwd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v8i8_v8i32:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,4), %rax
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
-; SSE4-NEXT: leaq (,%rsi,8), %r8
-; SSE4-NEXT: movzbl (%rdi), %r9d
-; SSE4-NEXT: movd %r9d, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rcx), %xmm0
-; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,4), %xmm0
-; SSE4-NEXT: pinsrb $5, (%rdi,%rax), %xmm0
-; SSE4-NEXT: pinsrb $6, (%rdi,%rcx,2), %xmm0
-; SSE4-NEXT: subq %rsi, %r8
-; SSE4-NEXT: pinsrb $7, (%rdi,%r8), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,4), %rcx
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %r8
+; SSE4-NEXT: leaq (,%rsi,8), %r9
+; SSE4-NEXT: subq %rsi, %r9
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $6, (%rdi,%r8), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,4), %xmm0
+; SSE4-NEXT: pinsrb $10, (%rdi,%rcx), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%r8,2), %xmm0
+; SSE4-NEXT: pinsrb $14, (%rdi,%r9), %xmm0
; SSE4-NEXT: movdqu (%rdx), %xmm1
-; SSE4-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE4-NEXT: pmaddwd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pmaddwd %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
-; SSE4-NEXT: movd %xmm0, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: paddd %xmm0, %xmm1
+; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
-; AVX2-LABEL: dot_ext_v8i8_v8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX2-NEXT: leaq (,%rsi,8), %r8
-; AVX2-NEXT: subq %rsi, %r8
-; AVX2-NEXT: movzbl (%rdi), %r9d
-; AVX2-NEXT: vmovd %r9d, %xmm0
-; AVX2-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v8i8_v8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX512-NEXT: leaq (,%rsi,8), %r8
-; AVX512-NEXT: movzbl (%rdi), %r9d
-; AVX512-NEXT: vmovd %r9d, %xmm0
-; AVX512-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX512-NEXT: subq %rsi, %r8
-; AVX512-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v8i8_v8i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: leaq (%rsi,%rsi,4), %r8
+; AVX-NEXT: leaq (,%rsi,8), %r9
+; AVX-NEXT: subq %rsi, %r9
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -175,14 +149,13 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
;
; SSE4-LABEL: dot_ext_v4i8_v4i32:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzbl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: pmaddwd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -194,12 +167,11 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; AVX-LABEL: dot_ext_v4i8_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX-NEXT: vpinsrb $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -311,8 +283,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE4: # %bb.0:
; SSE4-NEXT: movzbl (%rdi), %eax
; SSE4-NEXT: movd %eax, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pmovsxbq (%rdx), %xmm1
; SSE4-NEXT: pmuldq %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
@@ -324,8 +295,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovsxbq (%rdx), %xmm1
; AVX-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -374,14 +344,13 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
;
; SSE4-LABEL: dot_ext_v4i16_v4i32:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzwl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrw $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrw $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrw $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzwl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrw $6, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pmovsxwd (%rdx), %xmm1
-; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -390,41 +359,22 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
-; AVX2-LABEL: dot_ext_v4i16_v4i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX2-NEXT: movzwl (%rdi), %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v4i16_v4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX512-NEXT: movzwl (%rdi), %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX512-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v4i16_v4i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxwd (%rdx), %xmm1
+; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%var0 = load i16, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -509,16 +459,15 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: psllq $32, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
@@ -560,8 +509,8 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; AVX512-LABEL: dot_ext_v2i32_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrd $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpmovsxdq (%rdx), %xmm1
; AVX512-NEXT: vpmullq %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]