[libcxx-commits] [lldb] [compiler-rt] [libc] [flang] [clang-tools-extra] [libcxx] [llvm] [clang] [X86][BF16] Try to use `f16` for lowering (PR #76901)
via libcxx-commits
libcxx-commits at lists.llvm.org
Wed Jan 3 23:53:38 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Phoebe Wang (phoebewang)
<details>
<summary>Changes</summary>
---
Patch is 102.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/76901.diff
4 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+6-5)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (-10)
- (modified) llvm/lib/Target/X86/X86ISelLoweringCall.cpp (+3-34)
- (modified) llvm/test/CodeGen/X86/bfloat.ll (+600-724)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd56529bfa0fd8..a9ac6aa6558441 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7475,10 +7475,12 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
- MVT IVT = VT.changeVectorElementTypeToInteger();
+ MVT IVT =
+ VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
SmallVector<SDValue, 16> NewOps;
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
- NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+ NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
+ Op.getOperand(I)));
SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
return DAG.getBitcast(VT, Res);
}
@@ -21515,9 +21517,8 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
RTLIB::Libcall LC =
RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
SDValue Res =
- makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
- DAG.getBitcast(MVT::i32, Res));
+ makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
+ return DAG.getBitcast(MVT::i16, Res);
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 9bd1622cb0d3a6..32745400a38b7e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1714,16 +1714,6 @@ namespace llvm {
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
- bool splitValueIntoRegisterParts(
- SelectionDAG & DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC)
- const override;
-
- SDValue joinRegisterPartsIntoValue(
- SelectionDAG & DAG, const SDLoc &DL, const SDValue *Parts,
- unsigned NumParts, MVT PartVT, EVT ValueVT,
- std::optional<CallingConv::ID> CC) const override;
-
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b8b5421b900501..d75bd4171fde9d 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -127,6 +127,9 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return getRegisterTypeForCallingConv(Context, CC,
VT.changeVectorElementType(MVT::f16));
+ if (VT == MVT::bf16)
+ return MVT::f16;
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -421,40 +424,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
return TargetLowering::getJumpTableEncoding();
}
-bool X86TargetLowering::splitValueIntoRegisterParts(
- SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
- bool IsABIRegCopy = CC.has_value();
- EVT ValueVT = Val.getValueType();
- if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
- unsigned ValueBits = ValueVT.getSizeInBits();
- unsigned PartBits = PartVT.getSizeInBits();
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
- Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
- Parts[0] = Val;
- return true;
- }
- return false;
-}
-
-SDValue X86TargetLowering::joinRegisterPartsIntoValue(
- SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
- MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
- bool IsABIRegCopy = CC.has_value();
- if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
- unsigned ValueBits = ValueVT.getSizeInBits();
- unsigned PartBits = PartVT.getSizeInBits();
- SDValue Val = Parts[0];
-
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
- Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
- Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
- return Val;
- }
- return SDValue();
-}
-
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 7ef362619d5fd0..b309f47e4b7190 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add:
@@ -22,10 +22,7 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
+; X86-NEXT: vmovsh %xmm0, (%esi)
; X86-NEXT: addl $8, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
@@ -42,27 +39,42 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; AVX-LABEL: add:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: movzwl (%rsi), %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movw %ax, (%rbx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; BF16-LABEL: add:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: movq %rdx, %rbx
+; BF16-NEXT: movzwl (%rsi), %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: movzwl (%rdi), %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, (%rbx)
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: movq %rdx, %rbx
+; FP16-NEXT: movzwl (%rsi), %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: movzwl (%rdi), %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2 at PLT
+; FP16-NEXT: vmovsh %xmm0, (%rbx)
+; FP16-NEXT: popq %rbx
+; FP16-NEXT: retq
%a = load bfloat, ptr %pa
%b = load bfloat, ptr %pb
%add = fadd bfloat %a, %b
@@ -89,8 +101,8 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; SSE2-LABEL: add2:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shll $16, %eax
@@ -100,19 +112,33 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; AVX-LABEL: add2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: shll $16, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm0
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2 at PLT
-; AVX-NEXT: popq %rax
-; AVX-NEXT: retq
+; BF16-LABEL: add2:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rax
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: vpextrw $0, %xmm1, %ecx
+; BF16-NEXT: shll $16, %ecx
+; BF16-NEXT: vmovd %ecx, %xmm0
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2 at PLT
+; BF16-NEXT: popq %rax
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add2:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: vmovw %xmm1, %ecx
+; FP16-NEXT: shll $16, %ecx
+; FP16-NEXT: vmovd %ecx, %xmm0
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2 at PLT
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
%add = fadd bfloat %a, %b
ret bfloat %add
}
@@ -123,22 +149,18 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $32, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %edi
+; X86-NEXT: vmovw %xmm0, %edi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: shll $16, %edi
@@ -146,14 +168,12 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esi)
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -168,17 +188,17 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: movq %rsi, %r14
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebp
+; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
@@ -188,34 +208,63 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; AVX-LABEL: add_double:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: movq %rsi, %r14
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: callq __truncdfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %ebp
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: callq __truncdfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: shll $16, %ebp
-; AVX-NEXT: vmovd %ebp, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovsd %xmm0, (%rbx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %rbp
-; AVX-NEXT: retq
+; BF16-LABEL: add_double:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbp
+; BF16-NEXT: pushq %r14
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: movq %rdx, %rbx
+; BF16-NEXT: movq %rsi, %r14
+; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %ebp
+; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: shll $16, %ebp
+; BF16-NEXT: vmovd %ebp, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BF16-NEXT: vmovsd %xmm0, (%rbx)
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: popq %r14
+; BF16-NEXT: popq %rbp
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add_double:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbp
+; FP16-NEXT: pushq %r14
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: movq %rdx, %rbx
+; FP16-NEXT: movq %rsi, %r14
+; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovw %xmm0, %ebp
+; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: shll $16, %ebp
+; FP16-NEXT: vmovd %ebp, %xmm1
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT: callq __truncsfbf2 at PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; FP16-NEXT: vmovsd %xmm0, (%rbx)
+; FP16-NEXT: popq %rbx
+; FP16-NEXT: popq %r14
+; FP16-NEXT: popq %rbp
+; FP16-NEXT: retq
%la = load double, ptr %pa
%a = fptrunc double %la to bfloat
%lb = load double, ptr %pb
@@ -230,19 +279,15 @@ define double @add_double2(double %da, double %db) nounwind {
; X86-LABEL: add_double2:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $40, %esp
+; X86-NEXT: subl $24, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %esi
+; X86-NEXT: vmovw %xmm0, %esi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: shll $16, %esi
@@ -250,15 +295,13 @@ define double @add_double2(double %da, double %db) nounwind {
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll __truncsfbf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
+; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fldl {{[0-9]+}}(%esp)
-; X86-NEXT: addl $40, %esp
+; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -268,18 +311,18 @@ define double @add_double2(double %da, double %db) nounwind {
; SSE2-NEXT: subq $16, %rsp
; SSE2-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
@@ -287,30 +330,55 @@ define double @add_double2(double %da, double %db) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; AVX-LABEL: add_double2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: subq $16, %rsp
-; AVX-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX-NEXT: callq __truncdfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %ebx
-; AVX-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0],zero
-; AVX-NEXT: callq __truncdfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: shll $16, %ebx
-; AVX-NEXT: vmovd %ebx, %xmm1
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: callq __truncsfbf2 at PLT
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: shll $16, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: addq $16, %rsp
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; BF16-LABEL: add_double2:
+; BF16: # %bb.0:
+; BF16-NEXT: pushq %rbx
+; BF16-NEXT: subq $16, %rsp
+; BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %ebx
+; BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; BF16-NEXT: # xmm0 = mem[0],zero
+; BF16-NEXT: callq __truncdfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: shll $16, %ebx
+; BF16-NEXT: vmovd %ebx, %xmm1
+; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; BF16-NEXT: callq __truncsfbf2 at PLT
+; BF16-NEXT: vpextrw $0, %xmm0, %eax
+; BF16-NEXT: shll $16, %eax
+; BF16-NEXT: vmovd %eax, %xmm0
+; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BF16-NEXT: addq $16, %rsp
+; BF16-NEXT: popq %rbx
+; BF16-NEXT: retq
+;
+; FP16-LABEL: add_double2:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rbx
+; FP16-NEXT: subq $16, %rsp
+; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovw %xmm0, %ebx
+; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; FP16-NEXT: # xmm0 = mem[0],zero
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: shll $16, %eax
+; FP16-NEXT: vmovd %eax, %xmm0
+; FP16-NEXT: shll ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/76901
More information about the libcxx-commits
mailing list