[llvm] Enable custom lowering of fabs_v16f16 with AVX and fabs_v32f16 with A… (PR #73565)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 27 12:45:31 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: David Li (david-xl)
Changes:
This is the last patch in the fabs lowering series. With it, v32f16 is custom-lowered on AVX as well, relying on type legalization to split the wide vector.
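For context, a minimal LLVM IR sketch (not part of the patch; the function name is illustrative) of what the custom lowering amounts to: fabs on a half vector is just clearing each element's sign bit with an integer AND, which maps to a single vandps against a 0x7FFF-per-lane mask instead of per-element __extendhfsf2/__truncsfhf2 libcalls.

```llvm
; fabs == clear the sign bit: bitcast to i16 lanes, AND with 0x7FFF, cast back.
define <16 x half> @fabs_sketch(<16 x half> %v) {
  %bits = bitcast <16 x half> %v to <16 x i16>
  %abs  = and <16 x i16> %bits, <i16 32767, i16 32767, i16 32767, i16 32767,
                                 i16 32767, i16 32767, i16 32767, i16 32767,
                                 i16 32767, i16 32767, i16 32767, i16 32767,
                                 i16 32767, i16 32767, i16 32767, i16 32767>
  %res  = bitcast <16 x i16> %abs to <16 x half>
  ret <16 x half> %res
}
```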
---
Patch is 138.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73565.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-2)
- (modified) llvm/test/CodeGen/X86/vec_fabs.ll (+24-2281)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d0e51301945ecb5..7cfd48e283b47aa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1596,8 +1596,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STORE, VT, Custom);
}
setF16Action(MVT::v16f16, Expand);
- if (Subtarget.hasAVX2())
- setOperationAction(ISD::FABS, MVT::v16f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v16f16, Custom);
setOperationAction(ISD::FADD, MVT::v16f16, Expand);
setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
@@ -2054,6 +2053,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
+ setOperationAction(ISD::FABS, MVT::v32f16, Custom);
}
// This block control legalization of v32i1/v64i1 which are available with
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index 7e915c9ee040764..ec02dfda30c8502 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -282,235 +282,9 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
define <16 x half> @fabs_v16f16(ptr %p) {
; X86-AVX1-LABEL: fabs_v16f16:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: subl $308, %esp # imm = 0x134
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 316
-; X86-AVX1-NEXT: .cfi_offset %esi, -8
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-AVX1-NEXT: vmovdqa (%esi), %xmm0
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovaps 16(%esi), %xmm1
-; X86-AVX1-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 4(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 8(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 12(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 20(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 24(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vbroadcastss 28(%esi), %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpextrw $0, %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __extendhfsf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
-; X86-AVX1-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vmovss %xmm1, (%esp)
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX1-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX1-NEXT: calll __truncsfhf2
-; X86-AVX1-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X86-AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
-; X86-AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX1-NEXT: addl $308, %esp # imm = 0x134
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: vmovaps (%eax), %ymm0
+; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: fabs_v16f16:
@@ -529,135 +303,8 @@ define <16 x half> @fabs_v16f16(ptr %p) {
;
; X64-AVX1-LABEL: fabs_v16f16:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbx
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX1-NEXT: subq $80, %rsp
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 96
-; X64-AVX1-NEXT: .cfi_offset %rbx, -16
-; X64-AVX1-NEXT: movq %rdi, %rbx
-; X64-AVX1-NEXT: vbroadcastss 28(%rdi), %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vmovaps (%rbx), %xmm0
-; X64-AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa 16(%rbx), %xmm0
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vbroadcastss 24(%rbx), %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vbroadcastss 20(%rbx), %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; X64-AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __extendhfsf2 at PLT
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX1-NEXT: callq __truncsfhf2 at PLT
-; X64-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
-; X64-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64...
[truncated]
``````````
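For reference, a hedged sketch of the v32f16 case this patch enables (assuming the fabs_v32f16 test in vec_fabs.ll takes the same shape as fabs_v16f16 above): on plain AVX, type legalization splits the 512-bit vector into two 256-bit halves, each of which now lowers to a single vandps rather than a chain of libcalls.

```llvm
; Hypothetical reduction of the fabs_v32f16 test: with this patch, AVX
; targets lower the intrinsic via two 256-bit ANDs after type legalization.
define <32 x half> @fabs_v32f16(ptr %p) {
  %v = load <32 x half>, ptr %p
  %r = call <32 x half> @llvm.fabs.v32f16(<32 x half> %v)
  ret <32 x half> %r
}
declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
```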
https://github.com/llvm/llvm-project/pull/73565
More information about the llvm-commits mailing list