[llvm] ce98cb9 - [X86] Precommit test case for D148980.
Luo, Yuanke via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 23 02:28:14 PDT 2023
Author: Luo, Yuanke
Date: 2023-04-23T17:27:15+08:00
New Revision: ce98cb9c64f78faffba1cd9499e7482a942821cf
URL: https://github.com/llvm/llvm-project/commit/ce98cb9c64f78faffba1cd9499e7482a942821cf
DIFF: https://github.com/llvm/llvm-project/commit/ce98cb9c64f78faffba1cd9499e7482a942821cf.diff
LOG: [X86] Precommit test case for D148980.
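
The tests exercise chains of vpdpwssd (signed word to dword dot-product
accumulate) at 128/256/512-bit widths, both register-only and loop-carried
from memory. As the existing CHECK lines show, the backend already folds a
vpmaddwd feeding a vpaddd into a single vpdpwssd; roughly, in C (a hedged
sketch using the same intrinsics as the reference comments embedded in the
tests):

    __m512i dot_step(__m512i c, __m512i b, __m512i a) {
      __m512i m = _mm512_madd_epi16(b, a); // vpmaddwd: 16-bit pairs -> 32-bit
      return _mm512_add_epi32(m, c);       // vpaddd; folded into vpdpwssd
    }

Being a precommit, the CHECK lines capture the codegen before D148980 itself
lands.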
Added:
llvm/test/CodeGen/X86/avx512vnni-combine.ll
Modified:
llvm/test/CodeGen/X86/avxvnni-combine.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
new file mode 100644
index 000000000000..7a8d3af368f0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s
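+;
+; Precommit coverage for D148980: chains of 512-bit vpdpwssd accumulation,
+; register-only and loop-carried from memory.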
+
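+; Roughly equivalent C (a hedged sketch mirroring the reference comments used
+; elsewhere in this file; c accumulates four dot products of b with a0..a3):
+; __m512i foo_reg(__m512i c, __m512i b, __m512i a0, __m512i a1,
+;                 __m512i a2, __m512i a3) {
+;   c = _mm512_dpwssd_epi32(c, b, a0);
+;   c = _mm512_dpwssd_epi32(c, b, a1);
+;   c = _mm512_dpwssd_epi32(c, b, a2);
+;   c = _mm512_dpwssd_epi32(c, b, a3);
+;   return c;
+; }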
+define <8 x i64> @foo_reg_512(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2, <8 x i64> %3, <8 x i64> %4, <8 x i64> %5) {
+; CHECK-LABEL: foo_reg_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vpdpwssd %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: vpdpwssd %zmm4, %zmm1, %zmm0
+; CHECK-NEXT: vpdpwssd %zmm5, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %7 = bitcast <8 x i64> %0 to <16 x i32>
+ %8 = bitcast <8 x i64> %1 to <16 x i32>
+ %9 = bitcast <8 x i64> %2 to <16 x i32>
+ %10 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %7, <16 x i32> %8, <16 x i32> %9)
+ %11 = bitcast <8 x i64> %3 to <16 x i32>
+ %12 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %10, <16 x i32> %8, <16 x i32> %11)
+ %13 = bitcast <8 x i64> %4 to <16 x i32>
+ %14 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %12, <16 x i32> %8, <16 x i32> %13)
+ %15 = bitcast <8 x i64> %5 to <16 x i32>
+ %16 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %14, <16 x i32> %8, <16 x i32> %15)
+ %17 = bitcast <16 x i32> %16 to <8 x i64>
+ ret <8 x i64> %17
+}
+
+; __m512i foo(int cnt, __m512i c, __m512i b, __m512i *p) {
+;
+; for (int i = 0; i < cnt; ++i) {
+; __m512i a = p[i];
+; __m512i m = _mm512_madd_epi16(b, a);
+; c = _mm512_add_epi32(m, c);
+; }
+;
+; return c;
+; }
+define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) {
+; CHECK-LABEL: foo_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB1_6
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: andl $3, %eax
+; CHECK-NEXT: cmpl $4, %edi
+; CHECK-NEXT: jae .LBB1_7
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: jmp .LBB1_3
+; CHECK-NEXT: .LBB1_7:
+; CHECK-NEXT: andl $-4, %edx
+; CHECK-NEXT: leaq 192(%rsi), %rdi
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vpdpwssd -192(%rdi), %zmm1, %zmm0
+; CHECK-NEXT: vpdpwssd -128(%rdi), %zmm1, %zmm0
+; CHECK-NEXT: vpdpwssd -64(%rdi), %zmm1, %zmm0
+; CHECK-NEXT: vpdpwssd (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: addq $4, %rcx
+; CHECK-NEXT: addq $256, %rdi # imm = 0x100
+; CHECK-NEXT: cmpq %rcx, %rdx
+; CHECK-NEXT: jne .LBB1_8
+; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: je .LBB1_6
+; CHECK-NEXT: # %bb.4: # %.preheader
+; CHECK-NEXT: shlq $6, %rcx
+; CHECK-NEXT: addq %rcx, %rsi
+; CHECK-NEXT: shlq $6, %rax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vpdpwssd (%rsi,%rcx), %zmm1, %zmm0
+; CHECK-NEXT: addq $64, %rcx
+; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: jne .LBB1_5
+; CHECK-NEXT: .LBB1_6:
+; CHECK-NEXT: retq
+ %5 = icmp sgt i32 %0, 0
+ br i1 %5, label %6, label %33
+
+6: ; preds = %4
+ %7 = bitcast <8 x i64> %2 to <32 x i16>
+ %8 = bitcast <8 x i64> %1 to <16 x i32>
+ %9 = zext i32 %0 to i64
+ %10 = and i64 %9, 3
+ %11 = icmp ult i32 %0, 4
+ br i1 %11, label %14, label %12
+
+12: ; preds = %6
+ %13 = and i64 %9, 4294967292
+ br label %35
+
+14: ; preds = %35, %6
+ %15 = phi <16 x i32> [ undef, %6 ], [ %57, %35 ]
+ %16 = phi i64 [ 0, %6 ], [ %58, %35 ]
+ %17 = phi <16 x i32> [ %8, %6 ], [ %57, %35 ]
+ %18 = icmp eq i64 %10, 0
+ br i1 %18, label %30, label %19
+
+19: ; preds = %14, %19
+ %20 = phi i64 [ %27, %19 ], [ %16, %14 ]
+ %21 = phi <16 x i32> [ %26, %19 ], [ %17, %14 ]
+ %22 = phi i64 [ %28, %19 ], [ 0, %14 ]
+ %23 = getelementptr inbounds <8 x i64>, ptr %3, i64 %20
+ %24 = load <32 x i16>, ptr %23, align 64
+ %25 = tail call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %7, <32 x i16> %24)
+ %26 = add <16 x i32> %25, %21
+ %27 = add nuw nsw i64 %20, 1
+ %28 = add i64 %22, 1
+ %29 = icmp eq i64 %28, %10
+ br i1 %29, label %30, label %19
+
+30: ; preds = %19, %14
+ %31 = phi <16 x i32> [ %15, %14 ], [ %26, %19 ]
+ %32 = bitcast <16 x i32> %31 to <8 x i64>
+ br label %33
+
+33: ; preds = %30, %4
+ %34 = phi <8 x i64> [ %32, %30 ], [ %1, %4 ]
+ ret <8 x i64> %34
+
+35: ; preds = %35, %12
+ %36 = phi i64 [ 0, %12 ], [ %58, %35 ]
+ %37 = phi <16 x i32> [ %8, %12 ], [ %57, %35 ]
+ %38 = phi i64 [ 0, %12 ], [ %59, %35 ]
+ %39 = getelementptr inbounds <8 x i64>, ptr %3, i64 %36
+ %40 = load <32 x i16>, ptr %39, align 64
+ %41 = tail call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %7, <32 x i16> %40)
+ %42 = add <16 x i32> %41, %37
+ %43 = or i64 %36, 1
+ %44 = getelementptr inbounds <8 x i64>, ptr %3, i64 %43
+ %45 = load <32 x i16>, ptr %44, align 64
+ %46 = tail call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %7, <32 x i16> %45)
+ %47 = add <16 x i32> %46, %42
+ %48 = or i64 %36, 2
+ %49 = getelementptr inbounds <8 x i64>, ptr %3, i64 %48
+ %50 = load <32 x i16>, ptr %49, align 64
+ %51 = tail call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %7, <32 x i16> %50)
+ %52 = add <16 x i32> %51, %47
+ %53 = or i64 %36, 3
+ %54 = getelementptr inbounds <8 x i64>, ptr %3, i64 %53
+ %55 = load <32 x i16>, ptr %54, align 64
+ %56 = tail call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %7, <32 x i16> %55)
+ %57 = add <16 x i32> %56, %52
+ %58 = add nuw nsw i64 %36, 4
+ %59 = add i64 %38, 4
+ %60 = icmp eq i64 %59, %13
+ br i1 %60, label %14, label %35
+}
+
+; void bar(int cnt, __m512i *c, __m512i b, __m512i *p) {
+; for (int i = 0; i < cnt; ++i) {
+; __m512i a = p[i];
+; c[i] = _mm512_dpwssd_epi32(c[i], b, a);
+; }
+; }
+define void @bar_512(i32 %0, ptr %1, <8 x i64> %2, ptr %3) {
+; CHECK-LABEL: bar_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB2_5
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: cmpl $1, %edi
+; CHECK-NEXT: jne .LBB2_6
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: jmp .LBB2_3
+; CHECK-NEXT: .LBB2_6:
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: andl $-2, %edi
+; CHECK-NEXT: movl $64, %r8d
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovdqa64 -64(%rsi,%r8), %zmm1
+; CHECK-NEXT: vmovdqa64 (%rsi,%r8), %zmm2
+; CHECK-NEXT: vpdpwssd -64(%rdx,%r8), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqa64 %zmm1, -64(%rsi,%r8)
+; CHECK-NEXT: vpdpwssd (%rdx,%r8), %zmm0, %zmm2
+; CHECK-NEXT: vmovdqa64 %zmm2, (%rsi,%r8)
+; CHECK-NEXT: addq $2, %rcx
+; CHECK-NEXT: subq $-128, %r8
+; CHECK-NEXT: cmpq %rcx, %rdi
+; CHECK-NEXT: jne .LBB2_7
+; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB2_5
+; CHECK-NEXT: # %bb.4:
+; CHECK-NEXT: shlq $6, %rcx
+; CHECK-NEXT: vmovdqa64 (%rsi,%rcx), %zmm1
+; CHECK-NEXT: vpdpwssd (%rdx,%rcx), %zmm0, %zmm1
+; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%rcx)
+; CHECK-NEXT: .LBB2_5:
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %5 = icmp sgt i32 %0, 0
+ br i1 %5, label %6, label %22
+
+6: ; preds = %4
+ %7 = bitcast <8 x i64> %2 to <16 x i32>
+ %8 = zext i32 %0 to i64
+ %9 = and i64 %8, 1
+ %10 = icmp eq i32 %0, 1
+ br i1 %10, label %13, label %11
+
+11: ; preds = %6
+ %12 = and i64 %8, 4294967294
+ br label %23
+
+13: ; preds = %23, %6
+ %14 = phi i64 [ 0, %6 ], [ %37, %23 ]
+ %15 = icmp eq i64 %9, 0
+ br i1 %15, label %22, label %16
+
+16: ; preds = %13
+ %17 = getelementptr inbounds <8 x i64>, ptr %3, i64 %14
+ %18 = load <16 x i32>, ptr %17, align 64
+ %19 = getelementptr inbounds <8 x i64>, ptr %1, i64 %14
+ %20 = load <16 x i32>, ptr %19, align 64
+ %21 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %20, <16 x i32> %7, <16 x i32> %18)
+ store <16 x i32> %21, ptr %19, align 64
+ br label %22
+
+22: ; preds = %16, %13, %4
+ ret void
+
+23: ; preds = %23, %11
+ %24 = phi i64 [ 0, %11 ], [ %37, %23 ]
+ %25 = phi i64 [ 0, %11 ], [ %38, %23 ]
+ %26 = getelementptr inbounds <8 x i64>, ptr %3, i64 %24
+ %27 = load <16 x i32>, ptr %26, align 64
+ %28 = getelementptr inbounds <8 x i64>, ptr %1, i64 %24
+ %29 = load <16 x i32>, ptr %28, align 64
+ %30 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %29, <16 x i32> %7, <16 x i32> %27)
+ store <16 x i32> %30, ptr %28, align 64
+ %31 = or i64 %24, 1
+ %32 = getelementptr inbounds <8 x i64>, ptr %3, i64 %31
+ %33 = load <16 x i32>, ptr %32, align 64
+ %34 = getelementptr inbounds <8 x i64>, ptr %1, i64 %31
+ %35 = load <16 x i32>, ptr %34, align 64
+ %36 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %35, <16 x i32> %7, <16 x i32> %33)
+ store <16 x i32> %36, ptr %34, align 64
+ %37 = add nuw nsw i64 %24, 2
+ %38 = add i64 %25, 2
+ %39 = icmp eq i64 %38, %12
+ br i1 %39, label %13, label %23
+}
+
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) #3
+declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) #3
diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll
index 3ee06a21040d..fc86c80e2441 100644
--- a/llvm/test/CodeGen/X86/avxvnni-combine.ll
+++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll
@@ -1,5 +1,363 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=AVX512
+
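+; Roughly equivalent C (a hedged sketch; with AVX-VNNI the spelling may be
+; _mm_dpwssd_avx_epi32):
+; __m128i foo_reg(__m128i c, __m128i b, __m128i a0, __m128i a1,
+;                 __m128i a2, __m128i a3) {
+;   c = _mm_dpwssd_epi32(c, b, a0);
+;   c = _mm_dpwssd_epi32(c, b, a1);
+;   c = _mm_dpwssd_epi32(c, b, a2);
+;   c = _mm_dpwssd_epi32(c, b, a3);
+;   return c;
+; }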
+define <2 x i64> @foo_reg_128(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, <2 x i64> %5) {
+; AVX-LABEL: foo_reg_128:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0
+; AVX-NEXT: {vex} vpdpwssd %xmm3, %xmm1, %xmm0
+; AVX-NEXT: {vex} vpdpwssd %xmm4, %xmm1, %xmm0
+; AVX-NEXT: {vex} vpdpwssd %xmm5, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: foo_reg_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vpdpwssd %xmm3, %xmm1, %xmm0
+; AVX512-NEXT: vpdpwssd %xmm4, %xmm1, %xmm0
+; AVX512-NEXT: vpdpwssd %xmm5, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %7 = bitcast <2 x i64> %0 to <4 x i32>
+ %8 = bitcast <2 x i64> %1 to <4 x i32>
+ %9 = bitcast <2 x i64> %2 to <4 x i32>
+ %10 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %7, <4 x i32> %8, <4 x i32> %9)
+ %11 = bitcast <2 x i64> %3 to <4 x i32>
+ %12 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %10, <4 x i32> %8, <4 x i32> %11)
+ %13 = bitcast <2 x i64> %4 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %12, <4 x i32> %8, <4 x i32> %13)
+ %15 = bitcast <2 x i64> %5 to <4 x i32>
+ %16 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %14, <4 x i32> %8, <4 x i32> %15)
+ %17 = bitcast <4 x i32> %16 to <2 x i64>
+ ret <2 x i64> %17
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
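+; By analogy with the foo() reference comment for the 256/512-bit variants
+; (a hedged sketch):
+; __m128i foo(int cnt, __m128i c, __m128i b, __m128i *p) {
+;   for (int i = 0; i < cnt; ++i) {
+;     __m128i a = p[i];
+;     __m128i m = _mm_madd_epi16(b, a);
+;     c = _mm_add_epi32(m, c);
+;   }
+;   return c;
+; }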
+define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
+; AVX-LABEL: foo_128:
+; AVX: # %bb.0:
+; AVX-NEXT: testl %edi, %edi
+; AVX-NEXT: jle .LBB1_6
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: movl %edx, %eax
+; AVX-NEXT: andl $3, %eax
+; AVX-NEXT: cmpl $4, %edi
+; AVX-NEXT: jae .LBB1_7
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: jmp .LBB1_3
+; AVX-NEXT: .LBB1_7:
+; AVX-NEXT: andl $-4, %edx
+; AVX-NEXT: leaq 48(%rsi), %rdi
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
+; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %xmm1, %xmm0
+; AVX-NEXT: {vex} vpdpwssd -16(%rdi), %xmm1, %xmm0
+; AVX-NEXT: {vex} vpdpwssd (%rdi), %xmm1, %xmm0
+; AVX-NEXT: addq $4, %rcx
+; AVX-NEXT: addq $64, %rdi
+; AVX-NEXT: cmpq %rcx, %rdx
+; AVX-NEXT: jne .LBB1_8
+; AVX-NEXT: .LBB1_3:
+; AVX-NEXT: testq %rax, %rax
+; AVX-NEXT: je .LBB1_6
+; AVX-NEXT: # %bb.4: # %.preheader
+; AVX-NEXT: shlq $4, %rcx
+; AVX-NEXT: addq %rcx, %rsi
+; AVX-NEXT: shlq $4, %rax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
+; AVX-NEXT: addq $16, %rcx
+; AVX-NEXT: cmpq %rcx, %rax
+; AVX-NEXT: jne .LBB1_5
+; AVX-NEXT: .LBB1_6:
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: foo_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: jle .LBB1_6
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: movl %edi, %edx
+; AVX512-NEXT: movl %edx, %eax
+; AVX512-NEXT: andl $3, %eax
+; AVX512-NEXT: cmpl $4, %edi
+; AVX512-NEXT: jae .LBB1_7
+; AVX512-NEXT: # %bb.2:
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: jmp .LBB1_3
+; AVX512-NEXT: .LBB1_7:
+; AVX512-NEXT: andl $-4, %edx
+; AVX512-NEXT: leaq 48(%rsi), %rdi
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0
+; AVX512-NEXT: vpdpwssd -32(%rdi), %xmm1, %xmm0
+; AVX512-NEXT: vpdpwssd -16(%rdi), %xmm1, %xmm0
+; AVX512-NEXT: vpdpwssd (%rdi), %xmm1, %xmm0
+; AVX512-NEXT: addq $4, %rcx
+; AVX512-NEXT: addq $64, %rdi
+; AVX512-NEXT: cmpq %rcx, %rdx
+; AVX512-NEXT: jne .LBB1_8
+; AVX512-NEXT: .LBB1_3:
+; AVX512-NEXT: testq %rax, %rax
+; AVX512-NEXT: je .LBB1_6
+; AVX512-NEXT: # %bb.4: # %.preheader
+; AVX512-NEXT: shlq $4, %rcx
+; AVX512-NEXT: addq %rcx, %rsi
+; AVX512-NEXT: shlq $4, %rax
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
+; AVX512-NEXT: addq $16, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
+; AVX512-NEXT: jne .LBB1_5
+; AVX512-NEXT: .LBB1_6:
+; AVX512-NEXT: retq
+ %5 = icmp sgt i32 %0, 0
+ br i1 %5, label %6, label %33
+
+6: ; preds = %4
+ %7 = bitcast <2 x i64> %2 to <8 x i16>
+ %8 = bitcast <2 x i64> %1 to <4 x i32>
+ %9 = zext i32 %0 to i64
+ %10 = and i64 %9, 3
+ %11 = icmp ult i32 %0, 4
+ br i1 %11, label %14, label %12
+
+12: ; preds = %6
+ %13 = and i64 %9, 4294967292
+ br label %35
+
+14: ; preds = %35, %6
+ %15 = phi <4 x i32> [ undef, %6 ], [ %57, %35 ]
+ %16 = phi i64 [ 0, %6 ], [ %58, %35 ]
+ %17 = phi <4 x i32> [ %8, %6 ], [ %57, %35 ]
+ %18 = icmp eq i64 %10, 0
+ br i1 %18, label %30, label %19
+
+19: ; preds = %14, %19
+ %20 = phi i64 [ %27, %19 ], [ %16, %14 ]
+ %21 = phi <4 x i32> [ %26, %19 ], [ %17, %14 ]
+ %22 = phi i64 [ %28, %19 ], [ 0, %14 ]
+ %23 = getelementptr inbounds <2 x i64>, ptr %3, i64 %20
+ %24 = load <8 x i16>, ptr %23, align 16
+ %25 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %7, <8 x i16> %24)
+ %26 = add <4 x i32> %25, %21
+ %27 = add nuw nsw i64 %20, 1
+ %28 = add i64 %22, 1
+ %29 = icmp eq i64 %28, %10
+ br i1 %29, label %30, label %19
+
+30: ; preds = %19, %14
+ %31 = phi <4 x i32> [ %15, %14 ], [ %26, %19 ]
+ %32 = bitcast <4 x i32> %31 to <2 x i64>
+ br label %33
+
+33: ; preds = %30, %4
+ %34 = phi <2 x i64> [ %32, %30 ], [ %1, %4 ]
+ ret <2 x i64> %34
+
+35: ; preds = %35, %12
+ %36 = phi i64 [ 0, %12 ], [ %58, %35 ]
+ %37 = phi <4 x i32> [ %8, %12 ], [ %57, %35 ]
+ %38 = phi i64 [ 0, %12 ], [ %59, %35 ]
+ %39 = getelementptr inbounds <2 x i64>, ptr %3, i64 %36
+ %40 = load <8 x i16>, ptr %39, align 16
+ %41 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %7, <8 x i16> %40)
+ %42 = add <4 x i32> %41, %37
+ %43 = or i64 %36, 1
+ %44 = getelementptr inbounds <2 x i64>, ptr %3, i64 %43
+ %45 = load <8 x i16>, ptr %44, align 16
+ %46 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %7, <8 x i16> %45)
+ %47 = add <4 x i32> %46, %42
+ %48 = or i64 %36, 2
+ %49 = getelementptr inbounds <2 x i64>, ptr %3, i64 %48
+ %50 = load <8 x i16>, ptr %49, align 16
+ %51 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %7, <8 x i16> %50)
+ %52 = add <4 x i32> %51, %47
+ %53 = or i64 %36, 3
+ %54 = getelementptr inbounds <2 x i64>, ptr %3, i64 %53
+ %55 = load <8 x i16>, ptr %54, align 16
+ %56 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %7, <8 x i16> %55)
+ %57 = add <4 x i32> %56, %52
+ %58 = add nuw nsw i64 %36, 4
+ %59 = add i64 %38, 4
+ %60 = icmp eq i64 %59, %13
+ br i1 %60, label %14, label %35
+}
+
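+; By analogy with the bar() reference comment for the 256/512-bit variants
+; (a hedged sketch; with AVX-VNNI the spelling may be _mm_dpwssd_avx_epi32):
+; void bar(int cnt, __m128i *c, __m128i b, __m128i *p) {
+;   for (int i = 0; i < cnt; ++i) {
+;     __m128i a = p[i];
+;     c[i] = _mm_dpwssd_epi32(c[i], b, a);
+;   }
+; }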
+define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) {
+; AVX-LABEL: bar_128:
+; AVX: # %bb.0:
+; AVX-NEXT: testl %edi, %edi
+; AVX-NEXT: jle .LBB2_5
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: cmpl $1, %edi
+; AVX-NEXT: jne .LBB2_6
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: jmp .LBB2_3
+; AVX-NEXT: .LBB2_6:
+; AVX-NEXT: movl %eax, %edi
+; AVX-NEXT: andl $-2, %edi
+; AVX-NEXT: movl $16, %r8d
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: vmovdqa -16(%rsi,%r8), %xmm1
+; AVX-NEXT: vmovdqa (%rsi,%r8), %xmm2
+; AVX-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
+; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rsi,%r8)
+; AVX-NEXT: addq $2, %rcx
+; AVX-NEXT: addq $32, %r8
+; AVX-NEXT: cmpq %rcx, %rdi
+; AVX-NEXT: jne .LBB2_7
+; AVX-NEXT: .LBB2_3:
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je .LBB2_5
+; AVX-NEXT: # %bb.4:
+; AVX-NEXT: shlq $4, %rcx
+; AVX-NEXT: vmovdqa (%rsi,%rcx), %xmm1
+; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
+; AVX-NEXT: .LBB2_5:
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: bar_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: jle .LBB2_5
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: cmpl $1, %edi
+; AVX512-NEXT: jne .LBB2_6
+; AVX512-NEXT: # %bb.2:
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: jmp .LBB2_3
+; AVX512-NEXT: .LBB2_6:
+; AVX512-NEXT: movl %eax, %edi
+; AVX512-NEXT: andl $-2, %edi
+; AVX512-NEXT: movl $16, %r8d
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vmovdqa -16(%rsi,%r8), %xmm1
+; AVX512-NEXT: vmovdqa (%rsi,%r8), %xmm2
+; AVX512-NEXT: vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
+; AVX512-NEXT: vpdpwssd (%rdx,%r8), %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa %xmm2, (%rsi,%r8)
+; AVX512-NEXT: addq $2, %rcx
+; AVX512-NEXT: addq $32, %r8
+; AVX512-NEXT: cmpq %rcx, %rdi
+; AVX512-NEXT: jne .LBB2_7
+; AVX512-NEXT: .LBB2_3:
+; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: je .LBB2_5
+; AVX512-NEXT: # %bb.4:
+; AVX512-NEXT: shlq $4, %rcx
+; AVX512-NEXT: vmovdqa (%rsi,%rcx), %xmm1
+; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
+; AVX512-NEXT: .LBB2_5:
+; AVX512-NEXT: retq
+ %5 = icmp sgt i32 %0, 0
+ br i1 %5, label %6, label %22
+
+6: ; preds = %4
+ %7 = bitcast <2 x i64> %2 to <4 x i32>
+ %8 = zext i32 %0 to i64
+ %9 = and i64 %8, 1
+ %10 = icmp eq i32 %0, 1
+ br i1 %10, label %13, label %11
+
+11: ; preds = %6
+ %12 = and i64 %8, 4294967294
+ br label %23
+
+13: ; preds = %23, %6
+ %14 = phi i64 [ 0, %6 ], [ %37, %23 ]
+ %15 = icmp eq i64 %9, 0
+ br i1 %15, label %22, label %16
+
+16: ; preds = %13
+ %17 = getelementptr inbounds <2 x i64>, ptr %3, i64 %14
+ %18 = load <4 x i32>, ptr %17, align 16
+ %19 = getelementptr inbounds <2 x i64>, ptr %1, i64 %14
+ %20 = load <4 x i32>, ptr %19, align 16
+ %21 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %20, <4 x i32> %7, <4 x i32> %18)
+ store <4 x i32> %21, ptr %19, align 16
+ br label %22
+
+22: ; preds = %16, %13, %4
+ ret void
+
+23: ; preds = %23, %11
+ %24 = phi i64 [ 0, %11 ], [ %37, %23 ]
+ %25 = phi i64 [ 0, %11 ], [ %38, %23 ]
+ %26 = getelementptr inbounds <2 x i64>, ptr %3, i64 %24
+ %27 = load <4 x i32>, ptr %26, align 16
+ %28 = getelementptr inbounds <2 x i64>, ptr %1, i64 %24
+ %29 = load <4 x i32>, ptr %28, align 16
+ %30 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %29, <4 x i32> %7, <4 x i32> %27)
+ store <4 x i32> %30, ptr %28, align 16
+ %31 = or i64 %24, 1
+ %32 = getelementptr inbounds <2 x i64>, ptr %3, i64 %31
+ %33 = load <4 x i32>, ptr %32, align 16
+ %34 = getelementptr inbounds <2 x i64>, ptr %1, i64 %31
+ %35 = load <4 x i32>, ptr %34, align 16
+ %36 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %35, <4 x i32> %7, <4 x i32> %33)
+ store <4 x i32> %36, ptr %34, align 16
+ %37 = add nuw nsw i64 %24, 2
+ %38 = add i64 %25, 2
+ %39 = icmp eq i64 %38, %12
+ br i1 %39, label %13, label %23
+}
+
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) #1
+
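+; Roughly equivalent C (a hedged sketch; with AVX-VNNI the spelling may be
+; _mm256_dpwssd_avx_epi32):
+; __m256i foo_reg(__m256i c, __m256i b, __m256i a0, __m256i a1,
+;                 __m256i a2, __m256i a3) {
+;   c = _mm256_dpwssd_epi32(c, b, a0);
+;   c = _mm256_dpwssd_epi32(c, b, a1);
+;   c = _mm256_dpwssd_epi32(c, b, a2);
+;   c = _mm256_dpwssd_epi32(c, b, a3);
+;   return c;
+; }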
+define <4 x i64> @foo_reg_256(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2, <4 x i64> %3, <4 x i64> %4, <4 x i64> %5) {
+; AVX-LABEL: foo_reg_256:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0
+; AVX-NEXT: {vex} vpdpwssd %ymm3, %ymm1, %ymm0
+; AVX-NEXT: {vex} vpdpwssd %ymm4, %ymm1, %ymm0
+; AVX-NEXT: {vex} vpdpwssd %ymm5, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: foo_reg_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0
+; AVX512-NEXT: vpdpwssd %ymm3, %ymm1, %ymm0
+; AVX512-NEXT: vpdpwssd %ymm4, %ymm1, %ymm0
+; AVX512-NEXT: vpdpwssd %ymm5, %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %7 = bitcast <4 x i64> %0 to <8 x i32>
+ %8 = bitcast <4 x i64> %1 to <8 x i32>
+ %9 = bitcast <4 x i64> %2 to <8 x i32>
+ %10 = tail call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %7, <8 x i32> %8, <8 x i32> %9)
+ %11 = bitcast <4 x i64> %3 to <8 x i32>
+ %12 = tail call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %10, <8 x i32> %8, <8 x i32> %11)
+ %13 = bitcast <4 x i64> %4 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %12, <8 x i32> %8, <8 x i32> %13)
+ %15 = bitcast <4 x i64> %5 to <8 x i32>
+ %16 = tail call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %14, <8 x i32> %8, <8 x i32> %15)
+ %17 = bitcast <8 x i32> %16 to <4 x i64>
+ ret <4 x i64> %17
+}
; __m256i foo(int cnt, __m256i c, __m256i b, __m256i *p) {
; for (int i = 0; i < cnt; ++i) {
@@ -10,50 +368,94 @@
; return c;
; }
-define dso_local <4 x i64> @foo(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
-; CHECK-LABEL: foo:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB0_6
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: andl $3, %eax
-; CHECK-NEXT: cmpl $4, %edi
-; CHECK-NEXT: jae .LBB0_7
-; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_7:
-; CHECK-NEXT: andl $-4, %edx
-; CHECK-NEXT: leaq 96(%rsi), %rdi
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB0_8: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: {vex} vpdpwssd -64(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: {vex} vpdpwssd -32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: {vex} vpdpwssd (%rdi), %ymm1, %ymm0
-; CHECK-NEXT: addq $4, %rcx
-; CHECK-NEXT: subq $-128, %rdi
-; CHECK-NEXT: cmpq %rcx, %rdx
-; CHECK-NEXT: jne .LBB0_8
-; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: testq %rax, %rax
-; CHECK-NEXT: je .LBB0_6
-; CHECK-NEXT: # %bb.4: # %.preheader
-; CHECK-NEXT: shlq $5, %rcx
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: shlq $5, %rax
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB0_5: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
-; CHECK-NEXT: addq $32, %rcx
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: jne .LBB0_5
-; CHECK-NEXT: .LBB0_6:
-; CHECK-NEXT: retq
+define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
+; AVX-LABEL: foo_256:
+; AVX: # %bb.0:
+; AVX-NEXT: testl %edi, %edi
+; AVX-NEXT: jle .LBB4_6
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: movl %edx, %eax
+; AVX-NEXT: andl $3, %eax
+; AVX-NEXT: cmpl $4, %edi
+; AVX-NEXT: jae .LBB4_7
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: jmp .LBB4_3
+; AVX-NEXT: .LBB4_7:
+; AVX-NEXT: andl $-4, %edx
+; AVX-NEXT: leaq 96(%rsi), %rdi
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
+; AVX-NEXT: {vex} vpdpwssd -64(%rdi), %ymm1, %ymm0
+; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %ymm1, %ymm0
+; AVX-NEXT: {vex} vpdpwssd (%rdi), %ymm1, %ymm0
+; AVX-NEXT: addq $4, %rcx
+; AVX-NEXT: subq $-128, %rdi
+; AVX-NEXT: cmpq %rcx, %rdx
+; AVX-NEXT: jne .LBB4_8
+; AVX-NEXT: .LBB4_3:
+; AVX-NEXT: testq %rax, %rax
+; AVX-NEXT: je .LBB4_6
+; AVX-NEXT: # %bb.4: # %.preheader
+; AVX-NEXT: shlq $5, %rcx
+; AVX-NEXT: addq %rcx, %rsi
+; AVX-NEXT: shlq $5, %rax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
+; AVX-NEXT: addq $32, %rcx
+; AVX-NEXT: cmpq %rcx, %rax
+; AVX-NEXT: jne .LBB4_5
+; AVX-NEXT: .LBB4_6:
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: foo_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: jle .LBB4_6
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: movl %edi, %edx
+; AVX512-NEXT: movl %edx, %eax
+; AVX512-NEXT: andl $3, %eax
+; AVX512-NEXT: cmpl $4, %edi
+; AVX512-NEXT: jae .LBB4_7
+; AVX512-NEXT: # %bb.2:
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: jmp .LBB4_3
+; AVX512-NEXT: .LBB4_7:
+; AVX512-NEXT: andl $-4, %edx
+; AVX512-NEXT: leaq 96(%rsi), %rdi
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0
+; AVX512-NEXT: vpdpwssd -64(%rdi), %ymm1, %ymm0
+; AVX512-NEXT: vpdpwssd -32(%rdi), %ymm1, %ymm0
+; AVX512-NEXT: vpdpwssd (%rdi), %ymm1, %ymm0
+; AVX512-NEXT: addq $4, %rcx
+; AVX512-NEXT: subq $-128, %rdi
+; AVX512-NEXT: cmpq %rcx, %rdx
+; AVX512-NEXT: jne .LBB4_8
+; AVX512-NEXT: .LBB4_3:
+; AVX512-NEXT: testq %rax, %rax
+; AVX512-NEXT: je .LBB4_6
+; AVX512-NEXT: # %bb.4: # %.preheader
+; AVX512-NEXT: shlq $5, %rcx
+; AVX512-NEXT: addq %rcx, %rsi
+; AVX512-NEXT: shlq $5, %rax
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
+; AVX512-NEXT: addq $32, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
+; AVX512-NEXT: jne .LBB4_5
+; AVX512-NEXT: .LBB4_6:
+; AVX512-NEXT: retq
%5 = icmp sgt i32 %0, 0
br i1 %5, label %6, label %33
@@ -134,46 +536,86 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
; c[i] = _mm256_dpwssd_epi32(c[i], b, a);
; }
; }
-define void @bar(i32 %0, ptr %1, <4 x i64> %2, ptr %3) {
-; CHECK-LABEL: bar:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB1_5
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: cmpl $1, %edi
-; CHECK-NEXT: jne .LBB1_6
-; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: jmp .LBB1_3
-; CHECK-NEXT: .LBB1_6:
-; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: andl $-2, %edi
-; CHECK-NEXT: movl $32, %r8d
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB1_7: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
-; CHECK-NEXT: vmovdqa (%rsi,%r8), %ymm2
-; CHECK-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
-; CHECK-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm2
-; CHECK-NEXT: vmovdqa %ymm2, (%rsi,%r8)
-; CHECK-NEXT: addq $2, %rcx
-; CHECK-NEXT: addq $64, %r8
-; CHECK-NEXT: cmpq %rcx, %rdi
-; CHECK-NEXT: jne .LBB1_7
-; CHECK-NEXT: .LBB1_3:
-; CHECK-NEXT: testb $1, %al
-; CHECK-NEXT: je .LBB1_5
-; CHECK-NEXT: # %bb.4:
-; CHECK-NEXT: shlq $5, %rcx
-; CHECK-NEXT: vmovdqa (%rsi,%rcx), %ymm1
-; CHECK-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
-; CHECK-NEXT: .LBB1_5:
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) {
+; AVX-LABEL: bar_256:
+; AVX: # %bb.0:
+; AVX-NEXT: testl %edi, %edi
+; AVX-NEXT: jle .LBB5_5
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: cmpl $1, %edi
+; AVX-NEXT: jne .LBB5_6
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: jmp .LBB5_3
+; AVX-NEXT: .LBB5_6:
+; AVX-NEXT: movl %eax, %edi
+; AVX-NEXT: andl $-2, %edi
+; AVX-NEXT: movl $32, %r8d
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
+; AVX-NEXT: vmovdqa (%rsi,%r8), %ymm2
+; AVX-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
+; AVX-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
+; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm2
+; AVX-NEXT: vmovdqa %ymm2, (%rsi,%r8)
+; AVX-NEXT: addq $2, %rcx
+; AVX-NEXT: addq $64, %r8
+; AVX-NEXT: cmpq %rcx, %rdi
+; AVX-NEXT: jne .LBB5_7
+; AVX-NEXT: .LBB5_3:
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je .LBB5_5
+; AVX-NEXT: # %bb.4:
+; AVX-NEXT: shlq $5, %rcx
+; AVX-NEXT: vmovdqa (%rsi,%rcx), %ymm1
+; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
+; AVX-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
+; AVX-NEXT: .LBB5_5:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: bar_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: jle .LBB5_5
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: cmpl $1, %edi
+; AVX512-NEXT: jne .LBB5_6
+; AVX512-NEXT: # %bb.2:
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: jmp .LBB5_3
+; AVX512-NEXT: .LBB5_6:
+; AVX512-NEXT: movl %eax, %edi
+; AVX512-NEXT: andl $-2, %edi
+; AVX512-NEXT: movl $32, %r8d
+; AVX512-NEXT: xorl %ecx, %ecx
+; AVX512-NEXT: .p2align 4, 0x90
+; AVX512-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
+; AVX512-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
+; AVX512-NEXT: vmovdqa (%rsi,%r8), %ymm2
+; AVX512-NEXT: vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
+; AVX512-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
+; AVX512-NEXT: vpdpwssd (%rdx,%r8), %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa %ymm2, (%rsi,%r8)
+; AVX512-NEXT: addq $2, %rcx
+; AVX512-NEXT: addq $64, %r8
+; AVX512-NEXT: cmpq %rcx, %rdi
+; AVX512-NEXT: jne .LBB5_7
+; AVX512-NEXT: .LBB5_3:
+; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: je .LBB5_5
+; AVX512-NEXT: # %bb.4:
+; AVX512-NEXT: shlq $5, %rcx
+; AVX512-NEXT: vmovdqa (%rsi,%rcx), %ymm1
+; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
+; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
+; AVX512-NEXT: .LBB5_5:
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%5 = icmp sgt i32 %0, 0
br i1 %5, label %6, label %22