[llvm] 071671e - [X86] Allow pre-SSE41 targets to extract multiple v16i8 elements coming from the same DWORD/WORD super-element

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 31 09:08:49 PDT 2023


Author: Simon Pilgrim
Date: 2023-07-31T17:08:34+01:00
New Revision: 071671e15c3eda59165db435b102fbeb1c673a29

URL: https://github.com/llvm/llvm-project/commit/071671e15c3eda59165db435b102fbeb1c673a29
DIFF: https://github.com/llvm/llvm-project/commit/071671e15c3eda59165db435b102fbeb1c673a29.diff

LOG: [X86] Allow pre-SSE41 targets to extract multiple v16i8 elements coming from the same DWORD/WORD super-element

Pre-SSE41 targets tended to have weak (serial) GPR<->VEC moves, meaning we only allowed a single v16i8 extraction before spilling the vector to stack and loading the i8 elements instead. But this didn't make use of the fact that the DWORD/WORD extraction we had to use could extract multiple i8 elements at the same time.

This patch attempts to determine if all uses of a vector are element extractions, and works out whether all the extractions share the same WORD or (lowest) DWORD, in which case we can perform a single extraction and just shift/truncate the individual elements.

Differential Revision: https://reviews.llvm.org/D156350

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/bitcast-vector-bool.ll
    llvm/test/CodeGen/X86/pr63108.ll
    llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a38cb5d4bf3120..1ca182f073c30f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17648,6 +17648,40 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(0, dl));
 }
 
+// Helper to find all the extracted elements from a vector.
+static APInt getExtractedDemandedElts(SDNode *N) {
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  APInt DemandedElts = APInt::getZero(NumElts);
+  for (SDNode *User : N->uses()) {
+    switch (User->getOpcode()) {
+    case X86ISD::PEXTRB:
+    case X86ISD::PEXTRW:
+    case ISD::EXTRACT_VECTOR_ELT:
+      if (!isa<ConstantSDNode>(User->getOperand(1))) {
+        DemandedElts.setAllBits();
+        return DemandedElts;
+      }
+      DemandedElts.setBit(User->getConstantOperandVal(1));
+      break;
+    case ISD::BITCAST: {
+      if (!User->getValueType(0).isSimple() ||
+          !User->getValueType(0).isVector()) {
+        DemandedElts.setAllBits();
+        return DemandedElts;
+      }
+      APInt DemandedSrcElts = getExtractedDemandedElts(User);
+      DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
+      break;
+    }
+    default:
+      DemandedElts.setAllBits();
+      return DemandedElts;
+    }
+  }
+  return DemandedElts;
+}
+
 SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
@@ -17739,13 +17773,16 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
       return Res;
 
-  // TODO: We only extract a single element from v16i8, we can probably afford
-  // to be more aggressive here before using the default approach of spilling to
-  // stack.
-  if (VT == MVT::i8 && Op->isOnlyUserOf(Vec.getNode())) {
+  // Only extract a single element from a v16i8 source - determine the common
+  // DWORD/WORD that all extractions share, and extract the sub-byte.
+  // TODO: Add QWORD MOVQ extraction?
+  if (VT == MVT::i8) {
+    APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
+    assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
+
     // Extract either the lowest i32 or any i16, and extract the sub-byte.
     int DWordIdx = IdxVal / 4;
-    if (DWordIdx == 0) {
+    if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                 DAG.getBitcast(MVT::v4i32, Vec),
                                 DAG.getIntPtrConstant(DWordIdx, dl));
@@ -17757,14 +17794,16 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     }
 
     int WordIdx = IdxVal / 2;
-    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
-                              DAG.getBitcast(MVT::v8i16, Vec),
-                              DAG.getIntPtrConstant(WordIdx, dl));
-    int ShiftVal = (IdxVal % 2) * 8;
-    if (ShiftVal != 0)
-      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
-                        DAG.getConstant(ShiftVal, dl, MVT::i8));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+    if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
+      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                DAG.getBitcast(MVT::v8i16, Vec),
+                                DAG.getIntPtrConstant(WordIdx, dl));
+      int ShiftVal = (IdxVal % 2) * 8;
+      if (ShiftVal != 0)
+        Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+                          DAG.getConstant(ShiftVal, dl, MVT::i8));
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+    }
   }
 
   if (VT == MVT::f16 || VT.getSizeInBits() == 32) {

diff  --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 90fe8276171d19..1b4f965a26454e 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -214,23 +214,14 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind {
 }
 
 define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
-; SSE2-SSSE3-LABEL: bitcast_v16i8_to_v2i8:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT:    movd %eax, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT:    addb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: bitcast_v16i8_to_v2i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovmskb %xmm0, %ecx
-; SSE41-NEXT:    movl %ecx, %eax
-; SSE41-NEXT:    shrl $8, %eax
-; SSE41-NEXT:    addb %cl, %al
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: bitcast_v16i8_to_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    shrl $8, %eax
+; SSE-NEXT:    addb %cl, %al
+; SSE-NEXT:    # kill: def $al killed $al killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX12-LABEL: bitcast_v16i8_to_v2i8:
 ; AVX12:       # %bb.0:
@@ -447,25 +438,15 @@ define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind {
 }
 
 define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
-; SSE2-SSSE3-LABEL: bitcast_v16i16_to_v2i8:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    packsswb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT:    movd %eax, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT:    addb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: bitcast_v16i16_to_v2i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    packsswb %xmm1, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %ecx
-; SSE41-NEXT:    movl %ecx, %eax
-; SSE41-NEXT:    shrl $8, %eax
-; SSE41-NEXT:    addb %cl, %al
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: bitcast_v16i16_to_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packsswb %xmm1, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    shrl $8, %eax
+; SSE-NEXT:    addb %cl, %al
+; SSE-NEXT:    # kill: def $al killed $al killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: bitcast_v16i16_to_v2i8:
 ; AVX1:       # %bb.0:
@@ -776,29 +757,17 @@ define i1 @trunc_v8i64_cmp(<8 x i64> %a0) nounwind {
 }
 
 define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
-; SSE2-SSSE3-LABEL: bitcast_v16i32_to_v2i8:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    packssdw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    packsswb %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT:    movd %eax, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT:    addb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: bitcast_v16i32_to_v2i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    packssdw %xmm3, %xmm2
-; SSE41-NEXT:    packssdw %xmm1, %xmm0
-; SSE41-NEXT:    packsswb %xmm2, %xmm0
-; SSE41-NEXT:    pmovmskb %xmm0, %ecx
-; SSE41-NEXT:    movl %ecx, %eax
-; SSE41-NEXT:    shrl $8, %eax
-; SSE41-NEXT:    addb %cl, %al
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: bitcast_v16i32_to_v2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm3, %xmm2
+; SSE-NEXT:    packssdw %xmm1, %xmm0
+; SSE-NEXT:    packsswb %xmm2, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %ecx
+; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    shrl $8, %eax
+; SSE-NEXT:    addb %cl, %al
+; SSE-NEXT:    # kill: def $al killed $al killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: bitcast_v16i32_to_v2i8:
 ; AVX1:       # %bb.0:

diff  --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 229b4b136bf06b..67785ce532966b 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -34,9 +34,10 @@ define i32 @PR63108() {
 ; SSE-NEXT:    psrld $16, %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:  .LBB0_5: # %for.cond.cleanup
-; SSE-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movsbl %al, %ecx
+; SSE-NEXT:    shrl $8, %eax
+; SSE-NEXT:    movsbl %al, %eax
 ; SSE-NEXT:    addl %ecx, %eax
 ; SSE-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 741f27de31e65c..1f9153d6620197 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2434,134 +2434,128 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ;
 ; SSE2-ONLY-LABEL: vec384_v3i8:
 ; SSE2-ONLY:       # %bb.0:
-; SSE2-ONLY-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-ONLY-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-ONLY-NEXT:    pxor %xmm0, %xmm1
-; SSE2-ONLY-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-ONLY-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-ONLY-NEXT:    movb %al, 2(%rsi)
-; SSE2-ONLY-NEXT:    movd %xmm1, %ecx
-; SSE2-ONLY-NEXT:    movw %cx, (%rsi)
-; SSE2-ONLY-NEXT:    movb %al, 2(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, (%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 6(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 4(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 10(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 8(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 14(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 12(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 18(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 16(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 22(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 20(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 26(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 24(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 30(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 28(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 34(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 32(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 38(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 36(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 42(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 40(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 46(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 44(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 50(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 48(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 54(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 52(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 58(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 56(%rdx)
-; SSE2-ONLY-NEXT:    movb %al, 62(%rdx)
-; SSE2-ONLY-NEXT:    movw %cx, 60(%rdx)
+; SSE2-ONLY-NEXT:    movl (%rdi), %eax
+; SSE2-ONLY-NEXT:    notl %eax
+; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
+; SSE2-ONLY-NEXT:    movl %eax, %ecx
+; SSE2-ONLY-NEXT:    shrl $16, %ecx
+; SSE2-ONLY-NEXT:    movb %cl, 2(%rsi)
+; SSE2-ONLY-NEXT:    movb %cl, 2(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, (%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 6(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 4(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 10(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 8(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 14(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 12(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 18(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 16(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 22(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 20(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 26(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 24(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 30(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 28(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 34(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 32(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 38(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 36(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 42(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 40(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 46(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 44(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 50(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 48(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 54(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 52(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 58(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 56(%rdx)
+; SSE2-ONLY-NEXT:    movb %cl, 62(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, 60(%rdx)
 ; SSE2-ONLY-NEXT:    retq
 ;
 ; SSE3-LABEL: vec384_v3i8:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm0, %xmm1
-; SSE3-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT:    movb %al, 2(%rsi)
-; SSE3-NEXT:    movd %xmm1, %ecx
-; SSE3-NEXT:    movw %cx, (%rsi)
-; SSE3-NEXT:    movb %al, 2(%rdx)
-; SSE3-NEXT:    movw %cx, (%rdx)
-; SSE3-NEXT:    movb %al, 6(%rdx)
-; SSE3-NEXT:    movw %cx, 4(%rdx)
-; SSE3-NEXT:    movb %al, 10(%rdx)
-; SSE3-NEXT:    movw %cx, 8(%rdx)
-; SSE3-NEXT:    movb %al, 14(%rdx)
-; SSE3-NEXT:    movw %cx, 12(%rdx)
-; SSE3-NEXT:    movb %al, 18(%rdx)
-; SSE3-NEXT:    movw %cx, 16(%rdx)
-; SSE3-NEXT:    movb %al, 22(%rdx)
-; SSE3-NEXT:    movw %cx, 20(%rdx)
-; SSE3-NEXT:    movb %al, 26(%rdx)
-; SSE3-NEXT:    movw %cx, 24(%rdx)
-; SSE3-NEXT:    movb %al, 30(%rdx)
-; SSE3-NEXT:    movw %cx, 28(%rdx)
-; SSE3-NEXT:    movb %al, 34(%rdx)
-; SSE3-NEXT:    movw %cx, 32(%rdx)
-; SSE3-NEXT:    movb %al, 38(%rdx)
-; SSE3-NEXT:    movw %cx, 36(%rdx)
-; SSE3-NEXT:    movb %al, 42(%rdx)
-; SSE3-NEXT:    movw %cx, 40(%rdx)
-; SSE3-NEXT:    movb %al, 46(%rdx)
-; SSE3-NEXT:    movw %cx, 44(%rdx)
-; SSE3-NEXT:    movb %al, 50(%rdx)
-; SSE3-NEXT:    movw %cx, 48(%rdx)
-; SSE3-NEXT:    movb %al, 54(%rdx)
-; SSE3-NEXT:    movw %cx, 52(%rdx)
-; SSE3-NEXT:    movb %al, 58(%rdx)
-; SSE3-NEXT:    movw %cx, 56(%rdx)
-; SSE3-NEXT:    movb %al, 62(%rdx)
-; SSE3-NEXT:    movw %cx, 60(%rdx)
+; SSE3-NEXT:    movl (%rdi), %eax
+; SSE3-NEXT:    notl %eax
+; SSE3-NEXT:    movw %ax, (%rsi)
+; SSE3-NEXT:    movl %eax, %ecx
+; SSE3-NEXT:    shrl $16, %ecx
+; SSE3-NEXT:    movb %cl, 2(%rsi)
+; SSE3-NEXT:    movb %cl, 2(%rdx)
+; SSE3-NEXT:    movw %ax, (%rdx)
+; SSE3-NEXT:    movb %cl, 6(%rdx)
+; SSE3-NEXT:    movw %ax, 4(%rdx)
+; SSE3-NEXT:    movb %cl, 10(%rdx)
+; SSE3-NEXT:    movw %ax, 8(%rdx)
+; SSE3-NEXT:    movb %cl, 14(%rdx)
+; SSE3-NEXT:    movw %ax, 12(%rdx)
+; SSE3-NEXT:    movb %cl, 18(%rdx)
+; SSE3-NEXT:    movw %ax, 16(%rdx)
+; SSE3-NEXT:    movb %cl, 22(%rdx)
+; SSE3-NEXT:    movw %ax, 20(%rdx)
+; SSE3-NEXT:    movb %cl, 26(%rdx)
+; SSE3-NEXT:    movw %ax, 24(%rdx)
+; SSE3-NEXT:    movb %cl, 30(%rdx)
+; SSE3-NEXT:    movw %ax, 28(%rdx)
+; SSE3-NEXT:    movb %cl, 34(%rdx)
+; SSE3-NEXT:    movw %ax, 32(%rdx)
+; SSE3-NEXT:    movb %cl, 38(%rdx)
+; SSE3-NEXT:    movw %ax, 36(%rdx)
+; SSE3-NEXT:    movb %cl, 42(%rdx)
+; SSE3-NEXT:    movw %ax, 40(%rdx)
+; SSE3-NEXT:    movb %cl, 46(%rdx)
+; SSE3-NEXT:    movw %ax, 44(%rdx)
+; SSE3-NEXT:    movb %cl, 50(%rdx)
+; SSE3-NEXT:    movw %ax, 48(%rdx)
+; SSE3-NEXT:    movb %cl, 54(%rdx)
+; SSE3-NEXT:    movw %ax, 52(%rdx)
+; SSE3-NEXT:    movb %cl, 58(%rdx)
+; SSE3-NEXT:    movw %ax, 56(%rdx)
+; SSE3-NEXT:    movb %cl, 62(%rdx)
+; SSE3-NEXT:    movw %ax, 60(%rdx)
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-ONLY-LABEL: vec384_v3i8:
 ; SSSE3-ONLY:       # %bb.0:
-; SSSE3-ONLY-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-ONLY-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSSE3-ONLY-NEXT:    pxor %xmm0, %xmm1
-; SSSE3-ONLY-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-ONLY-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-ONLY-NEXT:    movb %al, 2(%rsi)
-; SSSE3-ONLY-NEXT:    movd %xmm1, %ecx
-; SSSE3-ONLY-NEXT:    movw %cx, (%rsi)
-; SSSE3-ONLY-NEXT:    movb %al, 2(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, (%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 6(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 4(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 10(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 8(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 14(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 12(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 18(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 16(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 22(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 20(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 26(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 24(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 30(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 28(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 34(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 32(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 38(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 36(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 42(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 40(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 46(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 44(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 50(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 48(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 54(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 52(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 58(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 56(%rdx)
-; SSSE3-ONLY-NEXT:    movb %al, 62(%rdx)
-; SSSE3-ONLY-NEXT:    movw %cx, 60(%rdx)
+; SSSE3-ONLY-NEXT:    movl (%rdi), %eax
+; SSSE3-ONLY-NEXT:    notl %eax
+; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
+; SSSE3-ONLY-NEXT:    movl %eax, %ecx
+; SSSE3-ONLY-NEXT:    shrl $16, %ecx
+; SSSE3-ONLY-NEXT:    movb %cl, 2(%rsi)
+; SSSE3-ONLY-NEXT:    movb %cl, 2(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, (%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 6(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 4(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 10(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 8(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 14(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 12(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 18(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 16(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 22(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 20(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 26(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 24(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 30(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 28(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 34(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 32(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 38(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 36(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 42(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 40(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 46(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 44(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 50(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 48(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 54(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 52(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 58(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 56(%rdx)
+; SSSE3-ONLY-NEXT:    movb %cl, 62(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, 60(%rdx)
 ; SSSE3-ONLY-NEXT:    retq
 ;
 ; SSE41-LABEL: vec384_v3i8:


        


More information about the llvm-commits mailing list