[llvm] [SelectionDAG] Make ARITH_FENCE support half and bfloat type (PR #90836)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Sat May 4 04:15:23 PDT 2024
https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/90836
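In short: ISD::ARITH_FENCE had no soft-promotion handler, so an arithmetic fence on a half or bfloat value would previously hit the "Do not know how to soft promote this operator's result!" fatal error on targets that soft promote those types. The handler added below bitcasts the value to i16 and re-emits the fence on the integer type. A minimal IR reproducer looks roughly like this (the function name is only illustrative; it mirrors the f7 test added further down):

  ; With this patch the half result is soft promoted, the fence is emitted on
  ; i16, and it survives as an #ARITH_FENCE pseudo in the generated code.
  define half @fence_half(half %a) {
    %r = call half @llvm.arithmetic.fence.f16(half %a)
    ret half %r
  }
  declare half @llvm.arithmetic.fence.f16(half)
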
>From 065e985f5f3ac4a12f1134fe7d3d6b6573c96003 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Thu, 2 May 2024 17:10:30 +0800
Subject: [PATCH 1/4] [SelectionDAG] Make ARITH_FENCE support half and bfloat
type
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 7 ++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
llvm/test/CodeGen/X86/arithmetic_fence2.ll | 85 +++++++++++++++++++
3 files changed, 93 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index abe5be76382556..00f94e48a3f9ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2825,6 +2825,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
report_fatal_error("Do not know how to soft promote this operator's "
"result!");
+ case ISD::ARITH_FENCE:
+ R = SoftPromoteHalfRes_ARITH_FENCE(N); break;
case ISD::BITCAST: R = SoftPromoteHalfRes_BITCAST(N); break;
case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break;
case ISD::EXTRACT_VECTOR_ELT:
@@ -2904,6 +2906,11 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
SetSoftPromotedHalf(SDValue(N, ResNo), R);
}
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ARITH_FENCE(SDNode *N) {
+ return DAG.getNode(ISD::ARITH_FENCE, SDLoc(N), MVT::i16,
+ BitConvertToInteger(N->getOperand(0)));
+}
+
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BITCAST(SDNode *N) {
return BitConvertToInteger(N->getOperand(0));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 49be824deb5134..e9714f6f72b6bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -726,6 +726,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SetSoftPromotedHalf(SDValue Op, SDValue Result);
void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
+ SDValue SoftPromoteHalfRes_ARITH_FENCE(SDNode *N);
SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
index 6a854b58fc02d0..bc80e1a70112d3 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -157,6 +157,91 @@ define <8 x float> @f6(<8 x float> %a) {
ret <8 x float> %3
}
+define half @f7(half %a) nounwind {
+; X86-LABEL: f7:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: f7:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+ %1 = fadd fast half %a, %a
+ %t = call half @llvm.arithmetic.fence.f16(half %1)
+ %2 = fadd fast half %a, %a
+ %3 = fadd fast half %1, %2
+ ret half %3
+}
+
+define bfloat @f8(bfloat %a) nounwind {
+; X86-LABEL: f8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: f8:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+ %1 = fadd fast bfloat %a, %a
+ %t = call bfloat @llvm.arithmetic.fence.bf16(bfloat %1)
+ %2 = fadd fast bfloat %a, %a
+ %3 = fadd fast bfloat %1, %2
+ ret bfloat %3
+}
+
+declare half @llvm.arithmetic.fence.f16(half)
+declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
>From 7c01fe3a90a3b825eb98ab806c74a7cad2516ec2 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Thu, 2 May 2024 17:32:17 +0800
Subject: [PATCH 2/4] Address review comments
---
llvm/test/CodeGen/X86/arithmetic_fence2.ll | 278 ++++++++++++++++++++-
1 file changed, 268 insertions(+), 10 deletions(-)
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
index bc80e1a70112d3..13b0e6194768a4 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -192,11 +192,11 @@ define half @f7(half %a) nounwind {
; X64-NEXT: callq __truncsfhf2 at PLT
; X64-NEXT: popq %rax
; X64-NEXT: retq
- %1 = fadd fast half %a, %a
- %t = call half @llvm.arithmetic.fence.f16(half %1)
- %2 = fadd fast half %a, %a
- %3 = fadd fast half %1, %2
- ret half %3
+ %b = fadd half %a, %a
+ %c = call half @llvm.arithmetic.fence.f16(half %b)
+ %d = fadd half %a, %a
+ %e = fadd half %b, %d
+ ret half %e
}
define bfloat @f8(bfloat %a) nounwind {
@@ -233,15 +233,273 @@ define bfloat @f8(bfloat %a) nounwind {
; X64-NEXT: callq __truncsfbf2 at PLT
; X64-NEXT: popq %rax
; X64-NEXT: retq
- %1 = fadd fast bfloat %a, %a
- %t = call bfloat @llvm.arithmetic.fence.bf16(bfloat %1)
- %2 = fadd fast bfloat %a, %a
- %3 = fadd fast bfloat %1, %2
- ret bfloat %3
+ %b = fadd bfloat %a, %a
+ %c = call bfloat @llvm.arithmetic.fence.bf16(bfloat %b)
+ %d = fadd bfloat %a, %a
+ %e = fadd bfloat %b, %d
+ ret bfloat %e
+}
+
+define <2 x half> @f9(<2 x half> %a) nounwind {
+; X86-LABEL: f9:
+; X86: # %bb.0:
+; X86-NEXT: subl $36, %esp
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $16, %xmm1
+; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: addl $36, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: f9:
+; X64: # %bb.0:
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: psrld $16, %xmm0
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movss (%rsp), %xmm0 # 4-byte Reload
+; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+ %b = fadd <2 x half> %a, %a
+ %c = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %b)
+ %d = fadd <2 x half> %a, %a
+ %e = fadd <2 x half> %b, %d
+ ret <2 x half> %e
+}
+
+define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
+; X86-LABEL: f10:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $56, %esp
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: psrld $16, %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %esi
+; X86-NEXT: psrlq $48, %xmm1
+; X86-NEXT: pextrw $0, %xmm1, %edi
+; X86-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; X86-NEXT: pextrw $0, %xmm2, %ecx
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: movd %edi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
+; X86-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm1, (%esp)
+; X86-NEXT: pextrw $0, %xmm0, %esi
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %edi
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %ebx
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %ebx
+; X86-NEXT: movd %ebx, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: movd %edi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: addss %xmm0, %xmm0
+; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd %xmm0, (%esp)
+; X86-NEXT: calll __truncsfbf2
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: addl $56, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: f10:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrld $16, %xmm1
+; X64-NEXT: pextrw $0, %xmm1, %ebp
+; X64-NEXT: pextrw $0, %xmm0, %r15d
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; X64-NEXT: pextrw $0, %xmm1, %r14d
+; X64-NEXT: psrlq $48, %xmm0
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %ebx
+; X64-NEXT: shll $16, %r14d
+; X64-NEXT: movd %r14d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %r14d
+; X64-NEXT: shll $16, %r15d
+; X64-NEXT: movd %r15d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %r15d
+; X64-NEXT: shll $16, %ebp
+; X64-NEXT: movd %ebp, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shll $16, %r15d
+; X64-NEXT: movd %r15d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shll $16, %r14d
+; X64-NEXT: movd %r14d, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; X64-NEXT: shll $16, %ebx
+; X64-NEXT: movd %ebx, %xmm0
+; X64-NEXT: addss %xmm0, %xmm0
+; X64-NEXT: callq __truncsfbf2 at PLT
+; X64-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %b = fadd <4 x bfloat> %a, %a
+ %c = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %b)
+ %d = fadd <4 x bfloat> %a, %a
+ %e = fadd <4 x bfloat> %b, %d
+ ret <4 x bfloat> %e
}
declare half @llvm.arithmetic.fence.f16(half)
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
+declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
>From 8d2947dcca564d75fdb88f03215fdd8bec3d9272 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Thu, 2 May 2024 17:44:29 +0800
Subject: [PATCH 3/4] Drop other instructions
---
llvm/test/CodeGen/X86/arithmetic_fence2.ll | 336 ++++-----------------
1 file changed, 53 insertions(+), 283 deletions(-)
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
index 13b0e6194768a4..77d128bb11c273 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -160,340 +160,110 @@ define <8 x float> @f6(<8 x float> %a) {
define half @f7(half %a) nounwind {
; X86-LABEL: f7:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: #ARITH_FENCE
; X86-NEXT: retl
;
; X64-LABEL: f7:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
-; X64-NEXT: popq %rax
+; X64-NEXT: #ARITH_FENCE
; X64-NEXT: retq
- %b = fadd half %a, %a
- %c = call half @llvm.arithmetic.fence.f16(half %b)
- %d = fadd half %a, %a
- %e = fadd half %b, %d
- ret half %e
+ %b = call half @llvm.arithmetic.fence.f16(half %a)
+ ret half %b
}
define bfloat @f8(bfloat %a) nounwind {
; X86-LABEL: f8:
; X86: # %bb.0:
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: popl %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: pinsrw $0, %eax, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: f8:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: shll $16, %eax
-; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: shll $16, %eax
-; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: popq %rax
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: pinsrw $0, %eax, %xmm0
; X64-NEXT: retq
- %b = fadd bfloat %a, %a
- %c = call bfloat @llvm.arithmetic.fence.bf16(bfloat %b)
- %d = fadd bfloat %a, %a
- %e = fadd bfloat %b, %d
- ret bfloat %e
+ %b = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
+ ret bfloat %b
}
define <2 x half> @f9(<2 x half> %a) nounwind {
; X86-LABEL: f9:
; X86: # %bb.0:
-; X86-NEXT: subl $36, %esp
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrld $16, %xmm1
-; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: #ARITH_FENCE
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT: addl $36, %esp
; X86-NEXT: retl
;
; X64-LABEL: f9:
; X64: # %bb.0:
-; X64-NEXT: subq $40, %rsp
-; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: psrld $16, %xmm0
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
-; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
-; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT: movss (%rsp), %xmm0 # 4-byte Reload
-; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
-; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
-; X64-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X64-NEXT: addq $40, %rsp
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrld $16, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
- %b = fadd <2 x half> %a, %a
- %c = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %b)
- %d = fadd <2 x half> %a, %a
- %e = fadd <2 x half> %b, %d
- ret <2 x half> %e
+ %b = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %a)
+ ret <2 x half> %b
}
define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
; X86-LABEL: f10:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $56, %esp
; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: psrld $16, %xmm0
-; X86-NEXT: pextrw $0, %xmm0, %esi
; X86-NEXT: psrlq $48, %xmm1
-; X86-NEXT: pextrw $0, %xmm1, %edi
-; X86-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X86-NEXT: pextrw $0, %xmm2, %ecx
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shll $16, %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: movd %edi, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: movd %esi, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
-; X86-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm1, (%esp)
+; X86-NEXT: pextrw $0, %xmm1, %eax
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; X86-NEXT: pextrw $0, %xmm1, %edx
+; X86-NEXT: pextrw $0, %xmm0, %ecx
+; X86-NEXT: psrld $16, %xmm0
; X86-NEXT: pextrw $0, %xmm0, %esi
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: pextrw $0, %xmm0, %edi
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: pextrw $0, %xmm0, %ebx
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shll $16, %ebx
-; X86-NEXT: movd %ebx, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: movd %edi, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: movd %esi, %xmm0
-; X86-NEXT: addss %xmm0, %xmm0
-; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: pinsrw $0, %eax, %xmm0
+; X86-NEXT: pinsrw $0, %edx, %xmm1
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: addl $56, %esp
+; X86-NEXT: pinsrw $0, %ecx, %xmm0
+; X86-NEXT: pinsrw $0, %esi, %xmm2
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: f10:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %r15
-; X64-NEXT: pushq %r14
-; X64-NEXT: pushq %rbx
-; X64-NEXT: subq $56, %rsp
; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psrld $16, %xmm1
-; X64-NEXT: pextrw $0, %xmm1, %ebp
-; X64-NEXT: pextrw $0, %xmm0, %r15d
+; X64-NEXT: psrlq $48, %xmm1
+; X64-NEXT: pextrw $0, %xmm1, %eax
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; X64-NEXT: pextrw $0, %xmm1, %r14d
-; X64-NEXT: psrlq $48, %xmm0
-; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: shll $16, %eax
-; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: pextrw $0, %xmm0, %ebx
-; X64-NEXT: shll $16, %r14d
-; X64-NEXT: movd %r14d, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: pextrw $0, %xmm0, %r14d
-; X64-NEXT: shll $16, %r15d
-; X64-NEXT: movd %r15d, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: pextrw $0, %xmm0, %r15d
-; X64-NEXT: shll $16, %ebp
-; X64-NEXT: movd %ebp, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: shll $16, %eax
-; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: shll $16, %r15d
-; X64-NEXT: movd %r15d, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: shll $16, %r14d
-; X64-NEXT: movd %r14d, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; X64-NEXT: shll $16, %ebx
-; X64-NEXT: movd %ebx, %xmm0
-; X64-NEXT: addss %xmm0, %xmm0
-; X64-NEXT: callq __truncsfbf2 at PLT
-; X64-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; X64-NEXT: pextrw $0, %xmm1, %ecx
+; X64-NEXT: pextrw $0, %xmm0, %edx
+; X64-NEXT: psrld $16, %xmm0
+; X64-NEXT: pextrw $0, %xmm0, %esi
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: pinsrw $0, %eax, %xmm0
+; X64-NEXT: pinsrw $0, %ecx, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT: pinsrw $0, %edx, %xmm0
+; X64-NEXT: pinsrw $0, %esi, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: addq $56, %rsp
-; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r14
-; X64-NEXT: popq %r15
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
- %b = fadd <4 x bfloat> %a, %a
- %c = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %b)
- %d = fadd <4 x bfloat> %a, %a
- %e = fadd <4 x bfloat> %b, %d
- ret <4 x bfloat> %e
+ %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %a)
+ ret <4 x bfloat> %b
}
declare half @llvm.arithmetic.fence.f16(half)
>From 1cf434579aa84faeb1eb2e501b2e2603b40fa4fc Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Sat, 4 May 2024 19:15:06 +0800
Subject: [PATCH 4/4] add 3 x test
---
llvm/test/CodeGen/X86/arithmetic_fence2.ll | 45 +++++++++++++++++++++-
1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
index 77d128bb11c273..3c2ef21527f501 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -212,9 +212,49 @@ define <2 x half> @f9(<2 x half> %a) nounwind {
ret <2 x half> %b
}
-define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
+define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
; X86-LABEL: f10:
; X86: # %bb.0:
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $16, %xmm1
+; X86-NEXT: pextrw $0, %xmm1, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: pextrw $0, %xmm0, %edx
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: pinsrw $0, %eax, %xmm0
+; X86-NEXT: pinsrw $0, %ecx, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: pinsrw $0, %edx, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: f10:
+; X64: # %bb.0:
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrld $16, %xmm1
+; X64-NEXT: pextrw $0, %xmm1, %ecx
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: pextrw $0, %xmm0, %edx
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: pinsrw $0, %eax, %xmm0
+; X64-NEXT: pinsrw $0, %ecx, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pinsrw $0, %edx, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %b = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> %a)
+ ret <3 x bfloat> %b
+}
+
+define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind {
+; X86-LABEL: f11:
+; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlq $48, %xmm1
@@ -239,7 +279,7 @@ define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
-; X64-LABEL: f10:
+; X64-LABEL: f11:
; X64: # %bb.0:
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlq $48, %xmm1
@@ -269,6 +309,7 @@ define <4 x bfloat> @f10(<4 x bfloat> %a) nounwind {
declare half @llvm.arithmetic.fence.f16(half)
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
+declare <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat>)
declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)