[llvm] r297568 - [X86][SSE] Improve extraction of elements from v16i8 (pre-SSE41)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 11 12:42:31 PST 2017


Author: rksimon
Date: Sat Mar 11 14:42:31 2017
New Revision: 297568

URL: http://llvm.org/viewvc/llvm-project?rev=297568&view=rev
Log:
[X86][SSE] Improve extraction of elements from v16i8 (pre-SSE41)

Without SSE41 (pextrb) we currently extract byte elements from a vector by spilling the vector to the stack and reloading the byte.

This patch is an initial attempt at using MOVD/PEXTRW to extract the relevant DWORD/WORD from the vector and then shift+truncate to collect the correct byte.

Extraction of multiple bytes this way would result in code bloat, but as explained in the patch we could probably afford to be more aggressive with the supported extractions before again falling back on spilling - possibly by counting the number of extracts and which DWORD/WORD they originate from?

Differential Revision: https://reviews.llvm.org/D29841

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/extract-store.ll
    llvm/trunk/test/CodeGen/X86/extractelement-index.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=297568&r1=297567&r2=297568&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Mar 11 14:42:31 2017
@@ -13935,7 +13935,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_E
     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
       return Res;
 
-  // TODO: handle v16i8.
+  // TODO: We only extract a single element from v16i8, we can probably afford
+  // to be more aggressive here before using the default approach of spilling to
+  // stack.
+  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+    // Extract either the lowest i32 or any i16, and extract the sub-byte.
+    int DWordIdx = IdxVal / 4;
+    if (DWordIdx == 0) {
+      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                DAG.getBitcast(MVT::v4i32, Vec),
+                                DAG.getIntPtrConstant(DWordIdx, dl));
+      int ShiftVal = (IdxVal % 4) * 8;
+      if (ShiftVal != 0)
+        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+                          DAG.getConstant(ShiftVal, dl, MVT::i32));
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+    }
+
+    int WordIdx = IdxVal / 2;
+    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                              DAG.getBitcast(MVT::v8i16, Vec),
+                              DAG.getIntPtrConstant(WordIdx, dl));
+    int ShiftVal = (IdxVal % 2) * 8;
+    if (ShiftVal != 0)
+      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+                        DAG.getConstant(ShiftVal, dl, MVT::i16));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+  }
 
   if (VT.getSizeInBits() == 32) {
     if (IdxVal == 0)

Modified: llvm/trunk/test/CodeGen/X86/extract-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extract-store.ll?rev=297568&r1=297567&r2=297568&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extract-store.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extract-store.ll Sat Mar 11 14:42:31 2017
@@ -9,22 +9,14 @@
 define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind {
 ; SSE2-X32-LABEL: extract_i8_0:
 ; SSE2-X32:       # BB#0:
-; SSE2-X32-NEXT:    pushl %ebp
-; SSE2-X32-NEXT:    movl %esp, %ebp
-; SSE2-X32-NEXT:    andl $-16, %esp
-; SSE2-X32-NEXT:    subl $32, %esp
-; SSE2-X32-NEXT:    movl 8(%ebp), %eax
-; SSE2-X32-NEXT:    movaps %xmm0, (%esp)
-; SSE2-X32-NEXT:    movb (%esp), %cl
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    movd %xmm0, %ecx
 ; SSE2-X32-NEXT:    movb %cl, (%eax)
-; SSE2-X32-NEXT:    movl %ebp, %esp
-; SSE2-X32-NEXT:    popl %ebp
 ; SSE2-X32-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i8_0:
 ; SSE2-X64:       # BB#0:
-; SSE2-X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-X64-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-X64-NEXT:    movd %xmm0, %eax
 ; SSE2-X64-NEXT:    movb %al, (%rdi)
 ; SSE2-X64-NEXT:    retq
 ;
@@ -57,22 +49,16 @@ define void @extract_i8_0(i8* nocapture
 define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
 ; SSE2-X32-LABEL: extract_i8_3:
 ; SSE2-X32:       # BB#0:
-; SSE2-X32-NEXT:    pushl %ebp
-; SSE2-X32-NEXT:    movl %esp, %ebp
-; SSE2-X32-NEXT:    andl $-16, %esp
-; SSE2-X32-NEXT:    subl $32, %esp
-; SSE2-X32-NEXT:    movl 8(%ebp), %eax
-; SSE2-X32-NEXT:    movaps %xmm0, (%esp)
-; SSE2-X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    movd %xmm0, %ecx
+; SSE2-X32-NEXT:    shrl $24, %ecx
 ; SSE2-X32-NEXT:    movb %cl, (%eax)
-; SSE2-X32-NEXT:    movl %ebp, %esp
-; SSE2-X32-NEXT:    popl %ebp
 ; SSE2-X32-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i8_3:
 ; SSE2-X64:       # BB#0:
-; SSE2-X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-X64-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-X64-NEXT:    movd %xmm0, %eax
+; SSE2-X64-NEXT:    shrl $24, %eax
 ; SSE2-X64-NEXT:    movb %al, (%rdi)
 ; SSE2-X64-NEXT:    retq
 ;
@@ -105,23 +91,15 @@ define void @extract_i8_3(i8* nocapture
 define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) nounwind {
 ; SSE2-X32-LABEL: extract_i8_15:
 ; SSE2-X32:       # BB#0:
-; SSE2-X32-NEXT:    pushl %ebp
-; SSE2-X32-NEXT:    movl %esp, %ebp
-; SSE2-X32-NEXT:    andl $-16, %esp
-; SSE2-X32-NEXT:    subl $32, %esp
-; SSE2-X32-NEXT:    movl 8(%ebp), %eax
-; SSE2-X32-NEXT:    movaps %xmm0, (%esp)
-; SSE2-X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; SSE2-X32-NEXT:    movb %cl, (%eax)
-; SSE2-X32-NEXT:    movl %ebp, %esp
-; SSE2-X32-NEXT:    popl %ebp
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    pextrw $7, %xmm0, %ecx
+; SSE2-X32-NEXT:    movb %ch, (%eax)
 ; SSE2-X32-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i8_15:
 ; SSE2-X64:       # BB#0:
-; SSE2-X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-X64-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-X64-NEXT:    movb %al, (%rdi)
+; SSE2-X64-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-X64-NEXT:    movb %ah, (%rdi) # NOREX
 ; SSE2-X64-NEXT:    retq
 ;
 ; SSE41-X32-LABEL: extract_i8_15:

Modified: llvm/trunk/test/CodeGen/X86/extractelement-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-index.ll?rev=297568&r1=297567&r2=297568&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-index.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-index.ll Sat Mar 11 14:42:31 2017
@@ -11,8 +11,9 @@
 define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v16i8_1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v16i8_1:
@@ -33,8 +34,9 @@ define i8 @extractelement_v16i8_1(<16 x
 define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v16i8_11:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    pextrw $5, %xmm0, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v16i8_11:
@@ -55,8 +57,8 @@ define i8 @extractelement_v16i8_11(<16 x
 define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v16i8_14:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v16i8_14:
@@ -77,8 +79,9 @@ define i8 @extractelement_v16i8_14(<16 x
 define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v32i8_1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v32i8_1:
@@ -100,8 +103,9 @@ define i8 @extractelement_v32i8_1(<32 x
 define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v32i8_17:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v32i8_17:




More information about the llvm-commits mailing list