[llvm] 071671e - [X86] Allow pre-SSE41 targets to extract multiple v16i8 elements coming from the same DWORD/WORD super-element
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 31 09:08:49 PDT 2023
Author: Simon Pilgrim
Date: 2023-07-31T17:08:34+01:00
New Revision: 071671e15c3eda59165db435b102fbeb1c673a29
URL: https://github.com/llvm/llvm-project/commit/071671e15c3eda59165db435b102fbeb1c673a29
DIFF: https://github.com/llvm/llvm-project/commit/071671e15c3eda59165db435b102fbeb1c673a29.diff
LOG: [X86] Allow pre-SSE41 targets to extract multiple v16i8 elements coming from the same DWORD/WORD super-element
Pre-SSE41 targets tended to have weak (serial) GPR<->VEC moves, meaning we only allowed a single v16i8 extraction before spilling the vector to stack and loading the i8 elements instead. But this didn't make use of the fact that the DWORD/WORD extraction we had to use could extract multiple i8 elements at the same time.
This patch attempts to determine if all uses of a vector are element extractions, and works out whether all the extractions share the same WORD or (lowest) DWORD, in which case we can perform a single extraction and just shift/truncate the individual elements.
Differential Revision: https://reviews.llvm.org/D156350
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bitcast-vector-bool.ll
llvm/test/CodeGen/X86/pr63108.ll
llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a38cb5d4bf3120..1ca182f073c30f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17648,6 +17648,40 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
+// Helper to find all the extracted elements from a vector.
+static APInt getExtractedDemandedElts(SDNode *N) {
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt DemandedElts = APInt::getZero(NumElts);
+ for (SDNode *User : N->uses()) {
+ switch (User->getOpcode()) {
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW:
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (!isa<ConstantSDNode>(User->getOperand(1))) {
+ DemandedElts.setAllBits();
+ return DemandedElts;
+ }
+ DemandedElts.setBit(User->getConstantOperandVal(1));
+ break;
+ case ISD::BITCAST: {
+ if (!User->getValueType(0).isSimple() ||
+ !User->getValueType(0).isVector()) {
+ DemandedElts.setAllBits();
+ return DemandedElts;
+ }
+ APInt DemandedSrcElts = getExtractedDemandedElts(User);
+ DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
+ break;
+ }
+ default:
+ DemandedElts.setAllBits();
+ return DemandedElts;
+ }
+ }
+ return DemandedElts;
+}
+
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
@@ -17739,13 +17773,16 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
- // TODO: We only extract a single element from v16i8, we can probably afford
- // to be more aggressive here before using the default approach of spilling to
- // stack.
- if (VT == MVT::i8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Only extract a single element from a v16i8 source - determine the common
+ // DWORD/WORD that all extractions share, and extract the sub-byte.
+ // TODO: Add QWORD MOVQ extraction?
+ if (VT == MVT::i8) {
+ APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
+ assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
+
// Extract either the lowest i32 or any i16, and extract the sub-byte.
int DWordIdx = IdxVal / 4;
- if (DWordIdx == 0) {
+ if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
@@ -17757,14 +17794,16 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
int WordIdx = IdxVal / 2;
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
- DAG.getBitcast(MVT::v8i16, Vec),
- DAG.getIntPtrConstant(WordIdx, dl));
- int ShiftVal = (IdxVal % 2) * 8;
- if (ShiftVal != 0)
- Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
- DAG.getConstant(ShiftVal, dl, MVT::i8));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i8));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
}
if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 90fe8276171d19..1b4f965a26454e 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -214,23 +214,14 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind {
}
define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
-; SSE2-SSSE3-LABEL: bitcast_v16i8_to_v2i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: bitcast_v16i8_to_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovmskb %xmm0, %ecx
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: shrl $8, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: bitcast_v16i8_to_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: shrl $8, %eax
+; SSE-NEXT: addb %cl, %al
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
;
; AVX12-LABEL: bitcast_v16i8_to_v2i8:
; AVX12: # %bb.0:
@@ -447,25 +438,15 @@ define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind {
}
define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
-; SSE2-SSSE3-LABEL: bitcast_v16i16_to_v2i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: bitcast_v16i16_to_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: packsswb %xmm1, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %ecx
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: shrl $8, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: bitcast_v16i16_to_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: shrl $8, %eax
+; SSE-NEXT: addb %cl, %al
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: bitcast_v16i16_to_v2i8:
; AVX1: # %bb.0:
@@ -776,29 +757,17 @@ define i1 @trunc_v8i64_cmp(<8 x i64> %a0) nounwind {
}
define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
-; SSE2-SSSE3-LABEL: bitcast_v16i32_to_v2i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: bitcast_v16i32_to_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: packssdw %xmm3, %xmm2
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: packsswb %xmm2, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %ecx
-; SSE41-NEXT: movl %ecx, %eax
-; SSE41-NEXT: shrl $8, %eax
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: bitcast_v16i32_to_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: shrl $8, %eax
+; SSE-NEXT: addb %cl, %al
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: bitcast_v16i32_to_v2i8:
; AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 229b4b136bf06b..67785ce532966b 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -34,9 +34,10 @@ define i32 @PR63108() {
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: .LBB0_5: # %for.cond.cleanup
-; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: movsbl %al, %ecx
+; SSE-NEXT: shrl $8, %eax
+; SSE-NEXT: movsbl %al, %eax
; SSE-NEXT: addl %ecx, %eax
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 741f27de31e65c..1f9153d6620197 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2434,134 +2434,128 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
;
; SSE2-ONLY-LABEL: vec384_v3i8:
; SSE2-ONLY: # %bb.0:
-; SSE2-ONLY-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1
-; SSE2-ONLY-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-ONLY-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-ONLY-NEXT: movb %al, 2(%rsi)
-; SSE2-ONLY-NEXT: movd %xmm1, %ecx
-; SSE2-ONLY-NEXT: movw %cx, (%rsi)
-; SSE2-ONLY-NEXT: movb %al, 2(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, (%rdx)
-; SSE2-ONLY-NEXT: movb %al, 6(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 4(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 10(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 8(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 14(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 12(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 18(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 16(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 22(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 20(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 26(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 24(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 30(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 28(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 34(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 32(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 38(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 36(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 42(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 40(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 46(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 44(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 50(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 48(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 54(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 52(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 58(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 56(%rdx)
-; SSE2-ONLY-NEXT: movb %al, 62(%rdx)
-; SSE2-ONLY-NEXT: movw %cx, 60(%rdx)
+; SSE2-ONLY-NEXT: movl (%rdi), %eax
+; SSE2-ONLY-NEXT: notl %eax
+; SSE2-ONLY-NEXT: movw %ax, (%rsi)
+; SSE2-ONLY-NEXT: movl %eax, %ecx
+; SSE2-ONLY-NEXT: shrl $16, %ecx
+; SSE2-ONLY-NEXT: movb %cl, 2(%rsi)
+; SSE2-ONLY-NEXT: movb %cl, 2(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, (%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 6(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 10(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 8(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 14(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 18(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 16(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 22(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 26(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 24(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 30(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 34(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 32(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 38(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 42(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 40(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 46(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 50(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 48(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 54(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 58(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 56(%rdx)
+; SSE2-ONLY-NEXT: movb %cl, 62(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
; SSE2-ONLY-NEXT: retq
;
; SSE3-LABEL: vec384_v3i8:
; SSE3: # %bb.0:
-; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movb %al, 2(%rsi)
-; SSE3-NEXT: movd %xmm1, %ecx
-; SSE3-NEXT: movw %cx, (%rsi)
-; SSE3-NEXT: movb %al, 2(%rdx)
-; SSE3-NEXT: movw %cx, (%rdx)
-; SSE3-NEXT: movb %al, 6(%rdx)
-; SSE3-NEXT: movw %cx, 4(%rdx)
-; SSE3-NEXT: movb %al, 10(%rdx)
-; SSE3-NEXT: movw %cx, 8(%rdx)
-; SSE3-NEXT: movb %al, 14(%rdx)
-; SSE3-NEXT: movw %cx, 12(%rdx)
-; SSE3-NEXT: movb %al, 18(%rdx)
-; SSE3-NEXT: movw %cx, 16(%rdx)
-; SSE3-NEXT: movb %al, 22(%rdx)
-; SSE3-NEXT: movw %cx, 20(%rdx)
-; SSE3-NEXT: movb %al, 26(%rdx)
-; SSE3-NEXT: movw %cx, 24(%rdx)
-; SSE3-NEXT: movb %al, 30(%rdx)
-; SSE3-NEXT: movw %cx, 28(%rdx)
-; SSE3-NEXT: movb %al, 34(%rdx)
-; SSE3-NEXT: movw %cx, 32(%rdx)
-; SSE3-NEXT: movb %al, 38(%rdx)
-; SSE3-NEXT: movw %cx, 36(%rdx)
-; SSE3-NEXT: movb %al, 42(%rdx)
-; SSE3-NEXT: movw %cx, 40(%rdx)
-; SSE3-NEXT: movb %al, 46(%rdx)
-; SSE3-NEXT: movw %cx, 44(%rdx)
-; SSE3-NEXT: movb %al, 50(%rdx)
-; SSE3-NEXT: movw %cx, 48(%rdx)
-; SSE3-NEXT: movb %al, 54(%rdx)
-; SSE3-NEXT: movw %cx, 52(%rdx)
-; SSE3-NEXT: movb %al, 58(%rdx)
-; SSE3-NEXT: movw %cx, 56(%rdx)
-; SSE3-NEXT: movb %al, 62(%rdx)
-; SSE3-NEXT: movw %cx, 60(%rdx)
+; SSE3-NEXT: movl (%rdi), %eax
+; SSE3-NEXT: notl %eax
+; SSE3-NEXT: movw %ax, (%rsi)
+; SSE3-NEXT: movl %eax, %ecx
+; SSE3-NEXT: shrl $16, %ecx
+; SSE3-NEXT: movb %cl, 2(%rsi)
+; SSE3-NEXT: movb %cl, 2(%rdx)
+; SSE3-NEXT: movw %ax, (%rdx)
+; SSE3-NEXT: movb %cl, 6(%rdx)
+; SSE3-NEXT: movw %ax, 4(%rdx)
+; SSE3-NEXT: movb %cl, 10(%rdx)
+; SSE3-NEXT: movw %ax, 8(%rdx)
+; SSE3-NEXT: movb %cl, 14(%rdx)
+; SSE3-NEXT: movw %ax, 12(%rdx)
+; SSE3-NEXT: movb %cl, 18(%rdx)
+; SSE3-NEXT: movw %ax, 16(%rdx)
+; SSE3-NEXT: movb %cl, 22(%rdx)
+; SSE3-NEXT: movw %ax, 20(%rdx)
+; SSE3-NEXT: movb %cl, 26(%rdx)
+; SSE3-NEXT: movw %ax, 24(%rdx)
+; SSE3-NEXT: movb %cl, 30(%rdx)
+; SSE3-NEXT: movw %ax, 28(%rdx)
+; SSE3-NEXT: movb %cl, 34(%rdx)
+; SSE3-NEXT: movw %ax, 32(%rdx)
+; SSE3-NEXT: movb %cl, 38(%rdx)
+; SSE3-NEXT: movw %ax, 36(%rdx)
+; SSE3-NEXT: movb %cl, 42(%rdx)
+; SSE3-NEXT: movw %ax, 40(%rdx)
+; SSE3-NEXT: movb %cl, 46(%rdx)
+; SSE3-NEXT: movw %ax, 44(%rdx)
+; SSE3-NEXT: movb %cl, 50(%rdx)
+; SSE3-NEXT: movw %ax, 48(%rdx)
+; SSE3-NEXT: movb %cl, 54(%rdx)
+; SSE3-NEXT: movw %ax, 52(%rdx)
+; SSE3-NEXT: movb %cl, 58(%rdx)
+; SSE3-NEXT: movw %ax, 56(%rdx)
+; SSE3-NEXT: movb %cl, 62(%rdx)
+; SSE3-NEXT: movw %ax, 60(%rdx)
; SSE3-NEXT: retq
;
; SSSE3-ONLY-LABEL: vec384_v3i8:
; SSSE3-ONLY: # %bb.0:
-; SSSE3-ONLY-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1
-; SSSE3-ONLY-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-ONLY-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-ONLY-NEXT: movb %al, 2(%rsi)
-; SSSE3-ONLY-NEXT: movd %xmm1, %ecx
-; SSSE3-ONLY-NEXT: movw %cx, (%rsi)
-; SSSE3-ONLY-NEXT: movb %al, 2(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, (%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 6(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 10(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 14(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 18(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 22(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 26(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 30(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 34(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 38(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 42(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 46(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 50(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 54(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 58(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx)
-; SSSE3-ONLY-NEXT: movb %al, 62(%rdx)
-; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx)
+; SSSE3-ONLY-NEXT: movl (%rdi), %eax
+; SSSE3-ONLY-NEXT: notl %eax
+; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
+; SSSE3-ONLY-NEXT: movl %eax, %ecx
+; SSSE3-ONLY-NEXT: shrl $16, %ecx
+; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi)
+; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, (%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx)
+; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
; SSSE3-ONLY-NEXT: retq
;
; SSE41-LABEL: vec384_v3i8:
More information about the llvm-commits
mailing list