[llvm] r326097 - [X86][AVX] Add AVX1 PSAD tests
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 26 07:55:25 PST 2018
Author: rksimon
Date: Mon Feb 26 07:55:25 2018
New Revision: 326097
URL: http://llvm.org/viewvc/llvm-project?rev=326097&view=rev
Log:
[X86][AVX] Add AVX1 PSAD tests
Clean up check-prefixes to share more AVX/AVX512 codegen checks
Modified:
llvm/trunk/test/CodeGen/X86/sad.ll
llvm/trunk/test/CodeGen/X86/sad_variations.ll
Modified: llvm/trunk/test/CodeGen/X86/sad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sad.ll?rev=326097&r1=326096&r2=326097&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sad.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sad.ll Mon Feb 26 07:55:25 2018
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16
@@ -33,6 +34,34 @@ define i32 @sad_16i8() nounwind {
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
+; AVX1-LABEL: sad_16i8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB0_1: # %vector.body
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vmovdqu a+1024(%rax), %xmm2
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: addq $4, %rax
+; AVX1-NEXT: jne .LBB0_1
+; AVX1-NEXT: # %bb.2: # %middle.block
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: sad_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -57,55 +86,30 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: sad_16i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512F-NEXT: .p2align 4, 0x90
-; AVX512F-NEXT: .LBB0_1: # %vector.body
-; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
-; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
-; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: addq $4, %rax
-; AVX512F-NEXT: jne .LBB0_1
-; AVX512F-NEXT: # %bb.2: # %middle.block
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_16i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512BW-NEXT: .p2align 4, 0x90
-; AVX512BW-NEXT: .LBB0_1: # %vector.body
-; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
-; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: addq $4, %rax
-; AVX512BW-NEXT: jne .LBB0_1
-; AVX512BW-NEXT: # %bb.2: # %middle.block
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: sad_16i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB0_1: # %vector.body
+; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
+; AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: addq $4, %rax
+; AVX512-NEXT: jne .LBB0_1
+; AVX512-NEXT: # %bb.2: # %middle.block
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
br label %vector.body
@@ -280,6 +284,86 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
+; AVX1-LABEL: sad_32i8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
+; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB1_1: # %vector.body
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm7
+; AVX1-NEXT: vpabsd %xmm3, %xmm11
+; AVX1-NEXT: vpabsd %xmm4, %xmm4
+; AVX1-NEXT: vpabsd %xmm5, %xmm5
+; AVX1-NEXT: vpabsd %xmm6, %xmm6
+; AVX1-NEXT: vpabsd %xmm0, %xmm0
+; AVX1-NEXT: vpabsd %xmm1, %xmm1
+; AVX1-NEXT: vpabsd %xmm2, %xmm2
+; AVX1-NEXT: vpabsd %xmm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpaddd %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm10, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpaddd %xmm8, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddd %xmm12, %xmm11, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
+; AVX1-NEXT: addq $4, %rax
+; AVX1-NEXT: jne .LBB1_1
+; AVX1-NEXT: # %bb.2: # %middle.block
+; AVX1-NEXT: vpaddd %xmm12, %xmm10, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: sad_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -306,59 +390,32 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: sad_32i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: .p2align 4, 0x90
-; AVX512F-NEXT: .LBB1_1: # %vector.body
-; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
-; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: addq $4, %rax
-; AVX512F-NEXT: jne .LBB1_1
-; AVX512F-NEXT: # %bb.2: # %middle.block
-; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_32i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: .p2align 4, 0x90
-; AVX512BW-NEXT: .LBB1_1: # %vector.body
-; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
-; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: addq $4, %rax
-; AVX512BW-NEXT: jne .LBB1_1
-; AVX512BW-NEXT: # %bb.2: # %middle.block
-; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: sad_32i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB1_1: # %vector.body
+; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT: addq $4, %rax
+; AVX512-NEXT: jne .LBB1_1
+; AVX512-NEXT: # %bb.2: # %middle.block
+; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
br label %vector.body
@@ -694,6 +751,162 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: addq $200, %rsp
; SSE2-NEXT: retq
;
+; AVX1-LABEL: sad_avx64i8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: subq $24, %rsp
+; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14
+; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15
+; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13
+; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
+; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB2_1: # %vector.body
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vmovdqa %ymm7, %ymm11
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpabsd %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpabsd %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm11, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7
+; AVX1-NEXT: vpabsd %xmm6, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpabsd %xmm5, %xmm2
+; AVX1-NEXT: vpaddd %xmm15, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15
+; AVX1-NEXT: vpabsd %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpabsd %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm14, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14
+; AVX1-NEXT: vpabsd %xmm4, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddd %xmm12, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
+; AVX1-NEXT: addq $4, %rax
+; AVX1-NEXT: jne .LBB2_1
+; AVX1-NEXT: # %bb.2: # %middle.block
+; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1
+; AVX1-NEXT: vpaddd %xmm10, %xmm7, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: sad_avx64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -911,71 +1124,27 @@ define i32 @sad_2i8() nounwind {
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad_2i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: .p2align 4, 0x90
-; AVX2-NEXT: .LBB3_1: # %vector.body
-; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: addq $4, %rax
-; AVX2-NEXT: jne .LBB3_1
-; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad_2i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: .p2align 4, 0x90
-; AVX512F-NEXT: .LBB3_1: # %vector.body
-; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: addq $4, %rax
-; AVX512F-NEXT: jne .LBB3_1
-; AVX512F-NEXT: # %bb.2: # %middle.block
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_2i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: .p2align 4, 0x90
-; AVX512BW-NEXT: .LBB3_1: # %vector.body
-; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: addq $4, %rax
-; AVX512BW-NEXT: jne .LBB3_1
-; AVX512BW-NEXT: # %bb.2: # %middle.block
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: retq
+; AVX-LABEL: sad_2i8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB3_1: # %vector.body
+; AVX-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX-NEXT: addq $4, %rax
+; AVX-NEXT: jne .LBB3_1
+; AVX-NEXT: # %bb.2: # %middle.block
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
br label %vector.body
@@ -1016,29 +1185,13 @@ define i32 @sad_nonloop_4i8(<4 x i8>* no
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad_nonloop_4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad_nonloop_4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_nonloop_4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: retq
+; AVX-LABEL: sad_nonloop_4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
%v1 = load <4 x i8>, <4 x i8>* %p, align 1
%z1 = zext <4 x i8> %v1 to <4 x i32>
%v2 = load <4 x i8>, <4 x i8>* %q, align 1
@@ -1064,29 +1217,13 @@ define i32 @sad_nonloop_8i8(<8 x i8>* no
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad_nonloop_8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad_nonloop_8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_nonloop_8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: retq
+; AVX-LABEL: sad_nonloop_8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
%v1 = load <8 x i8>, <8 x i8>* %p, align 1
%z1 = zext <8 x i8> %v1 to <8 x i32>
%v2 = load <8 x i8>, <8 x i8>* %q, align 1
@@ -1116,32 +1253,14 @@ define i32 @sad_nonloop_16i8(<16 x i8>*
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad_nonloop_16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad_nonloop_16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_nonloop_16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: retq
+; AVX-LABEL: sad_nonloop_16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
%v1 = load <16 x i8>, <16 x i8>* %p, align 1
%z1 = zext <16 x i8> %v1 to <16 x i32>
%v2 = load <16 x i8>, <16 x i8>* %q, align 1
@@ -1260,6 +1379,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>*
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
+; AVX1-LABEL: sad_nonloop_32i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpabsd %xmm0, %xmm0
+; AVX1-NEXT: vpabsd %xmm1, %xmm1
+; AVX1-NEXT: vpabsd %xmm2, %xmm2
+; AVX1-NEXT: vpabsd %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpabsd %xmm4, %xmm1
+; AVX1-NEXT: vpabsd %xmm5, %xmm2
+; AVX1-NEXT: vpabsd %xmm6, %xmm3
+; AVX1-NEXT: vpabsd %xmm7, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: sad_nonloop_32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
@@ -1272,29 +1439,17 @@ define i32 @sad_nonloop_32i8(<32 x i8>*
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: sad_nonloop_32i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sad_nonloop_32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BW-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: sad_nonloop_32i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%v1 = load <32 x i8>, <32 x i8>* %p, align 1
%z1 = zext <32 x i8> %v1 to <32 x i32>
%v2 = load <32 x i8>, <32 x i8>* %q, align 1
Modified: llvm/trunk/test/CodeGen/X86/sad_variations.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sad_variations.ll?rev=326097&r1=326096&r2=326097&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sad_variations.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sad_variations.ll Mon Feb 26 07:55:25 2018
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
; SSE2-LABEL: sad8_32bit_icmp_sge:
@@ -12,21 +14,13 @@ define i32 @sad8_32bit_icmp_sge(i8* noca
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_32bit_icmp_sge:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_32bit_icmp_sge:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_32bit_icmp_sge:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
@@ -62,21 +56,13 @@ define i32 @sad8_32bit_icmp_sgt(i8* noca
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_32bit_icmp_sgt:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_32bit_icmp_sgt:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_32bit_icmp_sgt:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
br label %for.body
@@ -111,21 +97,13 @@ define i32 @sad8_32bit_icmp_sle(i8* noca
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_32bit_icmp_sle:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_32bit_icmp_sle:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_32bit_icmp_sle:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
br label %for.body
@@ -160,21 +138,13 @@ define i32 @sad8_32bit_icmp_slt(i8* noca
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_32bit_icmp_slt:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_32bit_icmp_slt:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_32bit_icmp_slt:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
br label %for.body
@@ -209,21 +179,13 @@ define i64 @sad8_64bit_icmp_sext_slt(i8*
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
entry:
br label %for.body
@@ -258,21 +220,13 @@ define i64 @sad8_64bit_icmp_zext_slt(i8*
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
entry:
br label %for.body
@@ -307,21 +261,13 @@ define i64 @sad8_early_64bit_icmp_zext_s
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
-; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: retq
+; AVX-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
entry:
br label %for.body
More information about the llvm-commits
mailing list