[llvm] r368544 - [X86] Add some reduction add test cases that show sub-optimal code on avx2 and later.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 11 23:55:58 PDT 2019
Author: ctopper
Date: Sun Aug 11 23:55:58 2019
New Revision: 368544
URL: http://llvm.org/viewvc/llvm-project?rev=368544&view=rev
Log:
[X86] Add some reduction add test cases that show sub-optimal code on avx2 and later.
For v4i8 and v8i8, when the reduction starts with a load, we end up
shifting the data in the scalar domain and copying it to the vector
domain a second time using a broadcast. We already copied it to the
vector domain once; it's better to just shuffle it there.
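
For reference, the AVX1 path in the diff below already keeps the whole
reduction in the vector domain. A sketch of the sequence the AVX2 and
AVX512 targets could emit instead for the v4i8 case (hypothetical
codegen, mirroring the AVX1 check lines in the diff):

    vmovd   (%rdi), %xmm0        # load all four bytes into the vector domain once
    vpsrld  $16, %xmm0, %xmm1    # shift within the vector domain instead of in %eax
    vpaddb  %xmm1, %xmm0, %xmm0  # add the high byte pair to the low byte pair
    vpsrlw  $8, %xmm0, %xmm1     # shift the remaining pair
    vpaddb  %xmm1, %xmm0, %xmm0  # final byte sum ends up in the low element
    vpextrb $0, %xmm0, %eax      # extract the i8 result

The v8i8 case is analogous: starting from a vmovq load, a vpshufd of the
upper half would replace the movq/shrq/vpbroadcastd round trip through
%rax shown in the AVX2 and AVX512 check lines.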
Modified:
llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll?rev=368544&r1=368543&r2=368544&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll Sun Aug 11 23:55:58 2019
@@ -979,6 +979,53 @@ define i8 @test_v2i8(<2 x i8> %a0) {
ret i8 %1
}
+define i8 @test_v2i8_load(<2 x i8>* %p) {
+; SSE2-LABEL: test_v2i8_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movzwl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2i8_load:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: # kill: def $al killed $al killed $eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2i8_load:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2i8_load:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
+ %a0 = load <2 x i8>, <2 x i8>* %p
+ %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
+ ret i8 %1
+}
+
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
@@ -1027,6 +1074,89 @@ define i8 @test_v4i8(<4 x i8> %a0) {
ret i8 %1
}
+define i8 @test_v4i8_load(<4 x i8>* %p) {
+; SSE2-LABEL: test_v4i8_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i8_load:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: pextrb $0, %xmm0, %eax
+; SSE41-NEXT: # kill: def $al killed $al killed $eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v4i8_load:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i8_load:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v4i8_load:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl (%rdi), %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm0
+; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v4i8_load:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movl (%rdi), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
+ %a0 = load <4 x i8>, <4 x i8>* %p
+ %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
+ ret i8 %1
+}
+
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
@@ -1082,6 +1212,101 @@ define i8 @test_v8i8(<8 x i8> %a0) {
%1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
ret i8 %1
}
+
+define i8 @test_v8i8_load(<8 x i8>* %p) {
+; SSE2-LABEL: test_v8i8_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i8_load:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: # kill: def $al killed $al killed $eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8i8_load:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i8_load:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: test_v8i8_load:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movq (%rdi), %rax
+; AVX512BW-NEXT: vmovq %rax, %xmm0
+; AVX512BW-NEXT: shrq $32, %rax
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8i8_load:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: vmovq %rax, %xmm0
+; AVX512VL-NEXT: shrq $32, %rax
+; AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
+ %a0 = load <8 x i8>, <8 x i8>* %p
+ %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
+ ret i8 %1
+}
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8: