[llvm] 15b6aa7 - [X86] Enable the use of movlps for i64 atomic load on 32-bit targets with sse1.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 23 15:14:25 PST 2020


Author: Craig Topper
Date: 2020-02-23T15:11:38-08:00
New Revision: 15b6aa744881b6e77a3d6773afa3016fc2f9f123

URL: https://github.com/llvm/llvm-project/commit/15b6aa744881b6e77a3d6773afa3016fc2f9f123
DIFF: https://github.com/llvm/llvm-project/commit/15b6aa744881b6e77a3d6773afa3016fc2f9f123.diff

LOG: [X86] Enable the use of movlps for i64 atomic load on 32-bit targets with sse1.

There is still a little room for improvement: we could also use movlps
for the store to the stack temporary that is needed to move the data
out of the xmm register after the load.
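
For context, here is a minimal example of source code that exercises this
lowering (not from the commit; the file, function, and variable names are
illustrative). Built with something like "clang++ -m32 -march=pentium3 -O2",
i.e. for a 32-bit target with SSE1 but not SSE2, the 8-byte atomic load below
can now be selected as the xorps+movlps sequence shown in the updated tests,
instead of the old fildll/fistpll x87 expansion:

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> g_counter{0};

uint64_t read_counter() {
  // An 8-byte atomic load on a 32-bit target must be a single 64-bit-wide
  // memory access. SSE2 would use movq; with only SSE1 the load can use
  // movlps into a zeroed xmm register.
  return g_counter.load(std::memory_order_seq_cst);
}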

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/atomic-fp.ll
    llvm/test/CodeGen/X86/atomic-non-integer.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b7ff255f0881..575eeb68bd2f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27544,7 +27544,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE2() || Subtarget.hasX87()))
+      (Subtarget.hasSSE1() || Subtarget.hasX87()))
     return AtomicExpansionKind::None;
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
@@ -29667,15 +29667,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
             Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
       auto *Node = cast<AtomicSDNode>(N);
-      if (Subtarget.hasSSE2()) {
-        // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
-        // lower 64-bits.
-        SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+      if (Subtarget.hasSSE1()) {
+        // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
+        // Then extract the lower 64-bits.
+        MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+        SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                              MVT::i64, Node->getMemOperand());
-        SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+        if (Subtarget.hasSSE2()) {
+          SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                    DAG.getIntPtrConstant(0, dl));
+          Results.push_back(Res);
+          Results.push_back(Ld.getValue(1));
+          return;
+        }
+        // We use an alternative sequence for SSE1 that extracts as v2f32 and
+        // then casts to i64. This avoids a 128-bit stack temporary being
+        // created by type legalization if we were to cast v4f32->v2i64.
+        SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
                                   DAG.getIntPtrConstant(0, dl));
+        Res = DAG.getBitcast(MVT::i64, Res);
         Results.push_back(Res);
         Results.push_back(Ld.getValue(1));
         return;

diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll
index 01e0480a815b..17b58a581db1 100644
--- a/llvm/test/CodeGen/X86/atomic-fp.ll
+++ b/llvm/test/CodeGen/X86/atomic-fp.ll
@@ -107,18 +107,17 @@ define void @fadd_64r(double* %loc, double %val) nounwind {
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $24, %esp
+; X86-SSE1-NEXT:    subl $16, %esp
 ; X86-SSE1-NEXT:    movl 8(%ebp), %eax
-; X86-SSE1-NEXT:    fildll (%eax)
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %ecx, (%esp)
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT:    movss %xmm1, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fldl (%esp)
 ; X86-SSE1-NEXT:    faddl 12(%ebp)
 ; X86-SSE1-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE1-NEXT:    movl %ebp, %esp
@@ -274,17 +273,16 @@ define void @fadd_64g() nounwind {
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $24, %esp
-; X86-SSE1-NEXT:    fildll glob64
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    subl $16, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT:    movss %xmm1, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fld1
 ; X86-SSE1-NEXT:    faddl (%esp)
 ; X86-SSE1-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movlps %xmm0, glob64
 ; X86-SSE1-NEXT:    movl %ebp, %esp
@@ -438,17 +436,16 @@ define void @fadd_64imm() nounwind {
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $24, %esp
-; X86-SSE1-NEXT:    fildll -559038737
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    subl $16, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT:    movss %xmm1, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fld1
 ; X86-SSE1-NEXT:    faddl (%esp)
 ; X86-SSE1-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movlps %xmm0, -559038737
 ; X86-SSE1-NEXT:    movl %ebp, %esp
@@ -608,17 +605,16 @@ define void @fadd_64stack() nounwind {
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $32, %esp
-; X86-SSE1-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    subl $24, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT:    movss %xmm1, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fld1
 ; X86-SSE1-NEXT:    faddl (%esp)
 ; X86-SSE1-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    movl %ebp, %esp
@@ -712,25 +708,22 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) nounwind {
 ; X86-SSE1:       # %bb.0: # %bb
 ; X86-SSE1-NEXT:    pushl %ebp
 ; X86-SSE1-NEXT:    movl %esp, %ebp
-; X86-SSE1-NEXT:    pushl %esi
 ; X86-SSE1-NEXT:    andl $-8, %esp
-; X86-SSE1-NEXT:    subl $32, %esp
+; X86-SSE1-NEXT:    subl $16, %esp
 ; X86-SSE1-NEXT:    movl 20(%ebp), %eax
 ; X86-SSE1-NEXT:    movl 8(%ebp), %ecx
-; X86-SSE1-NEXT:    fildll (%ecx,%eax,8)
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %edx, (%esp)
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT:    movss %xmm1, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fldl (%esp)
 ; X86-SSE1-NEXT:    faddl 12(%ebp)
 ; X86-SSE1-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movlps %xmm0, (%ecx,%eax,8)
-; X86-SSE1-NEXT:    leal -4(%ebp), %esp
-; X86-SSE1-NEXT:    popl %esi
+; X86-SSE1-NEXT:    movl %ebp, %esp
 ; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 8b2ed638af2a..664f195e3884 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -272,17 +272,16 @@ define float @load_float(float* %fptr) {
 define double @load_double(double* %fptr) {
 ; X86-SSE1-LABEL: load_double:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    subl $20, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
+; X86-SSE1-NEXT:    subl $12, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    fildll (%eax)
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT:    movss %xmm0, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fldl (%esp)
-; X86-SSE1-NEXT:    addl $20, %esp
+; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl
 ;
@@ -660,17 +659,16 @@ define float @load_float_seq_cst(float* %fptr) {
 define double @load_double_seq_cst(double* %fptr) {
 ; X86-SSE1-LABEL: load_double_seq_cst:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    subl $20, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
+; X86-SSE1-NEXT:    subl $12, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    fildll (%eax)
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT:    movss %xmm0, (%esp)
+; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE1-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE1-NEXT:    fldl (%esp)
-; X86-SSE1-NEXT:    addl $20, %esp
+; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl
 ;
