[llvm] 176c341 - [X86][BF16] Add 32-bit tests to show ABI problem, NFC
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 3 23:44:46 PST 2024
Author: Phoebe Wang
Date: 2024-01-04T15:43:34+08:00
New Revision: 176c341198cbfa05debc3554e958ea90e0ef3cc9
URL: https://github.com/llvm/llvm-project/commit/176c341198cbfa05debc3554e958ea90e0ef3cc9
DIFF: https://github.com/llvm/llvm-project/commit/176c341198cbfa05debc3554e958ea90e0ef3cc9.diff
LOG: [X86][BF16] Add 32-bit tests to show ABI problem, NFC
Added:
Modified:
llvm/test/CodeGen/X86/bfloat.ll
Removed:
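
For context on the "ABI problem" named in the subject: in the new X86 check lines, every scalar bf16 truncation becomes a call to __truncsfbf2, and the caller then reads the result off the x87 stack (the fstps/fstpt after each calll); likewise fold_ext_trunc2 returns a bfloat with a bare flds. That treats bf16 as if it were passed and returned like an x87 float, which appears to conflict with the i386 psABI, where __bf16 (like _Float16) is returned in %xmm0. As an illustrative sketch (not the verbatim test body), a bf16 add such as add2, written with the promote-to-float legalization made explicit, looks like:

; bf16 has no native i686 arithmetic: extend to float, add, truncate back.
; The final fptrunc is what lowers to "calll __truncsfbf2" in the checks
; below, with the caller expecting the result on the x87 stack.
define bfloat @add2_sketch(bfloat %a, bfloat %b) nounwind {
  %fa = fpext bfloat %a to float
  %fb = fpext bfloat %b to float
  %sum = fadd float %fa, %fb
  %r = fptrunc float %sum to bfloat
  ret bfloat %r
}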
################################################################################
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 9c65310f79d7ec..7ef362619d5fd0 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1,10 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
+; X86-LABEL: add:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: vmovd %ecx, %xmm0
+; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: movw %ax, (%esi)
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
; SSE2-LABEL: add:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
@@ -46,6 +71,21 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
}
define bfloat @add2(bfloat %a, bfloat %b) nounwind {
+; X86-LABEL: add2:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
; SSE2-LABEL: add2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
@@ -78,6 +118,47 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
}
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
+; X86-LABEL: add_double:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %edi
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: vmovd %edi, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%esi)
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
; SSE2-LABEL: add_double:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
@@ -146,6 +227,41 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
}
define double @add_double2(double %da, double %db) nounwind {
+; X86-LABEL: add_double2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $40, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %esi
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: vmovd %esi, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: addl $40, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
; SSE2-LABEL: add_double2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
@@ -203,6 +319,26 @@ define double @add_double2(double %da, double %db) nounwind {
}
define void @add_constant(ptr %pa, ptr %pc) nounwind {
+; X86-LABEL: add_constant:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: movw %ax, (%esi)
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
; SSE2-LABEL: add_constant:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbx
@@ -237,6 +373,18 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
}
define bfloat @add_constant2(bfloat %a) nounwind {
+; X86-LABEL: add_constant2:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
; SSE2-LABEL: add_constant2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
@@ -263,6 +411,12 @@ define bfloat @add_constant2(bfloat %a) nounwind {
}
define void @store_constant(ptr %pc) nounwind {
+; X86-LABEL: store_constant:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movw $16256, (%eax) # imm = 0x3F80
+; X86-NEXT: retl
+;
; CHECK-LABEL: store_constant:
; CHECK: # %bb.0:
; CHECK-NEXT: movw $16256, (%rdi) # imm = 0x3F80
@@ -272,6 +426,14 @@ define void @store_constant(ptr %pc) nounwind {
}
define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
+; X86-LABEL: fold_ext_trunc:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %ecx
+; X86-NEXT: movw %cx, (%eax)
+; X86-NEXT: retl
+;
; CHECK-LABEL: fold_ext_trunc:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl (%rdi), %eax
@@ -285,6 +447,11 @@ define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
}
define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
+; X86-LABEL: fold_ext_trunc2:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+;
; CHECK-LABEL: fold_ext_trunc2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
@@ -294,6 +461,165 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
}
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
+; X86-LABEL: addv:
+; X86: # %bb.0:
+; X86-NEXT: subl $172, %esp
+; X86-NEXT: vpextrw $1, %xmm1, %eax
+; X86-NEXT: vmovdqa %xmm1, %xmm4
+; X86-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm2
+; X86-NEXT: vpextrw $1, %xmm0, %eax
+; X86-NEXT: vmovdqa %xmm0, %xmm3
+; X86-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vpextrw $2, %xmm4, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vpextrw $2, %xmm3, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vpextrw $3, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vpextrw $3, %xmm1, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vpextrw $4, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vpextrw $4, %xmm1, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vpextrw $5, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vpextrw $5, %xmm1, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vpextrw $6, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vpextrw $6, %xmm1, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vpextrw $7, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vpextrw $7, %xmm1, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vmovw %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm0
+; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: vmovw %xmm1, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: vmovss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm1, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; X86-NEXT: addl $172, %esp
+; X86-NEXT: retl
+;
; SSE2-LABEL: addv:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
@@ -776,6 +1102,12 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
}
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
+; X86-LABEL: pr62997:
+; X86: # %bb.0:
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr62997:
; SSE2: # %bb.0:
; SSE2-NEXT: movd %xmm0, %eax
@@ -798,6 +1130,11 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
}
define <32 x bfloat> @pr63017() {
+; X86-LABEL: pr63017:
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr63017:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm0, %xmm0
@@ -820,6 +1157,12 @@ define <32 x bfloat> @pr63017() {
}
define <32 x bfloat> @pr63017_2() nounwind {
+; X86-LABEL: pr63017_2:
+; X86: # %bb.0:
+; X86-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr63017_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %r14
@@ -1644,6 +1987,12 @@ define <32 x bfloat> @pr63017_2() nounwind {
}
define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
+; X86-LABEL: pr62997_3:
+; X86: # %bb.0:
+; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X86-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr62997_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
@@ -1678,6 +2027,12 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
+; X86-LABEL: pr64460_1:
+; X86: # %bb.0:
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr64460_1:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $1, %xmm0, %eax
@@ -1709,6 +2064,12 @@ define <4 x float> @pr64460_1(<4 x bfloat> %a) {
}
define <8 x float> @pr64460_2(<8 x bfloat> %a) {
+; X86-LABEL: pr64460_2:
+; X86: # %bb.0:
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpslld $16, %ymm0, %ymm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr64460_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rdx
@@ -1758,6 +2119,12 @@ define <8 x float> @pr64460_2(<8 x bfloat> %a) {
}
define <16 x float> @pr64460_3(<16 x bfloat> %a) {
+; X86-LABEL: pr64460_3:
+; X86: # %bb.0:
+; X86-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X86-NEXT: vpslld $16, %zmm0, %zmm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr64460_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm1, %rdi
@@ -1852,6 +2219,13 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) {
}
define <8 x double> @pr64460_4(<8 x bfloat> %a) {
+; X86-LABEL: pr64460_4:
+; X86: # %bb.0:
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpslld $16, %ymm0, %ymm0
+; X86-NEXT: vcvtps2pd %ymm0, %zmm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: pr64460_4:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rsi
@@ -1951,6 +2325,13 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
}
define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
+; X86-LABEL: fptrunc_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
; SSE2-LABEL: fptrunc_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
@@ -2009,6 +2390,12 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
}
define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
+; X86-LABEL: fptrunc_v8f32:
+; X86: # %bb.0:
+; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
; SSE2-LABEL: fptrunc_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
@@ -2087,6 +2474,11 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
}
define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
+; X86-LABEL: fptrunc_v16f32:
+; X86: # %bb.0:
+; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: fptrunc_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
@@ -2222,6 +2614,91 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
}
define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
+; X86-LABEL: fptrunc_v8f64:
+; X86: # %bb.0:
+; X86-NEXT: subl $220, %esp
+; X86-NEXT: vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
+; X86-NEXT: vmovhps %xmm0, (%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vmovhps %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
+; X86-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vmovhps %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
+; X86-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: vmovhps %xmm0, (%esp)
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
+; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll __truncdfbf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm1, %eax
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; X86-NEXT: addl $220, %esp
+; X86-NEXT: retl
+;
; SSE2-LABEL: fptrunc_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
@@ -2456,6 +2933,12 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
}
define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
+; X86-LABEL: test_v8bf16_v32bf16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-NEXT: retl
+;
; SSE2-LABEL: test_v8bf16_v32bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
@@ -2480,6 +2963,12 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
}
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; X86-LABEL: concat_v8bf16:
+; X86: # %bb.0:
+; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: concat_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
@@ -2494,6 +2983,12 @@ define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
}
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
+; X86-LABEL: extract_v32bf16_v8bf16:
+; X86: # %bb.0:
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
; SSE2-LABEL: extract_v32bf16_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
@@ -2531,6 +3026,11 @@ define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
}
define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; X86-LABEL: concat_zero_v8bf16:
+; X86: # %bb.0:
+; X86-NEXT: vmovaps %xmm0, %xmm0
+; X86-NEXT: retl
+;
; SSE2-LABEL: concat_zero_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1