[llvm] r265161 - [x86] avoid intermediate splat for non-zero memsets (PR27100)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 1 10:36:45 PDT 2016
Author: spatel
Date: Fri Apr 1 12:36:45 2016
New Revision: 265161
URL: http://llvm.org/viewvc/llvm-project?rev=265161&view=rev
Log:
[x86] avoid intermediate splat for non-zero memsets (PR27100)
Follow-up to http://reviews.llvm.org/D18566 and http://reviews.llvm.org/D18676,
where we noticed that an intermediate splat was being generated for memsets of
non-zero chars.
That was because we told getMemsetStores() to use a 32-bit vector element type,
and it happily obliged by producing that constant using an integer multiply.
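For a concrete sense of the cost: with a v4i32 element type, the fill byte 42
must first be widened to the 32-bit splat constant 42 * 0x01010101 = 0x2A2A2A2A
(707406378), which is exactly the imull $16843009 and the
[707406378,707406378,707406378,707406378] constant visible in the old test
checks below.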
The 16-byte test that was added in D18566 is now equivalent for AVX1 and AVX2
(no splats, just a vector load), but we have PR27141 to track the remaining
splat difference.
Note that the SSE1 path is not changed in this patch. That can be a follow-up.
This patch should resolve PR27100.
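For context, here is a minimal C++ sketch (illustrative only, not part of the
commit) of the kind of source that reaches this memset lowering:

  #include <cstring>

  // A 16-byte memset with a non-zero constant byte. With this patch, the
  // backend can materialize a v16i8 constant directly instead of first
  // building a 32-bit splat with an integer multiply.
  void fill16(char *p) {
    std::memset(p, 42, 16);
  }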
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/memset-nonzero.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=265161&r1=265160&r2=265161&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Apr 1 12:36:45 2016
@@ -2039,7 +2039,8 @@ X86TargetLowering::getOptimalMemOpType(u
return MVT::v32i8;
}
if (Subtarget.hasSSE2())
- return MVT::v4i32;
+ return MVT::v16i8;
+ // TODO: Can SSE1 handle a byte vector?
if (Subtarget.hasSSE1())
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
Modified: llvm/trunk/test/CodeGen/X86/memset-nonzero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/memset-nonzero.ll?rev=265161&r1=265160&r2=265161&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/memset-nonzero.ll (original)
+++ llvm/trunk/test/CodeGen/X86/memset-nonzero.ll Fri Apr 1 12:36:45 2016
@@ -12,15 +12,10 @@ define void @memset_16_nonzero_bytes(i8*
; SSE2-NEXT: movq %rax, (%rdi)
; SSE2-NEXT: retq
;
-; AVX1-LABEL: memset_16_nonzero_bytes:
-; AVX1: vmovaps {{.*#+}} xmm0 = [707406378,707406378,707406378,707406378]
-; AVX1-NEXT: vmovups %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: memset_16_nonzero_bytes:
-; AVX2: vbroadcastss {{.*}}(%rip), %xmm0
-; AVX2-NEXT: vmovups %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: memset_16_nonzero_bytes:
+; AVX: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: retq
;
%call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
ret void
@@ -145,19 +140,16 @@ define void @memset_16_nonconst_bytes(i8
; SSE2-NEXT: retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
-; AVX1: movzbl %sil, %eax
-; AVX1-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
-; AVX2: movzbl %sil, %eax
-; AVX2-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
-; AVX2-NEXT: vmovups %xmm0, (%rdi)
+; AVX2: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
;
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
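For readers following along, a hedged C++ intrinsics sketch (again
illustrative, not from the patch) of the byte-splat idioms the updated
non-constant checks expect; compile with -mssse3 and -mavx2 respectively:

  #include <immintrin.h>

  // AVX1/SSSE3 idiom from the new checks: vmovd plus vpshufb with an
  // all-zero shuffle mask broadcasts the low byte to all 16 lanes.
  __m128i splat_byte_pshufb(unsigned char c) {
    __m128i v = _mm_cvtsi32_si128(c);                 // vmovd %esi, %xmm0
    return _mm_shuffle_epi8(v, _mm_setzero_si128());  // vpshufb %xmm1, %xmm0, %xmm0
  }

  // AVX2 idiom from the new checks: a single vpbroadcastb does the splat.
  __m128i splat_byte_broadcast(unsigned char c) {
    __m128i v = _mm_cvtsi32_si128(c);  // vmovd %esi, %xmm0
    return _mm_broadcastb_epi8(v);     // vpbroadcastb %xmm0, %xmm0
  }

The AVX2 form needs one less shuffle because vpbroadcastb can broadcast a byte
directly, which is why the new AVX2 checks are one instruction shorter than the
AVX1 checks.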