[llvm] [X86][SelectionDAG] - Add support for llvm.canonicalize intrinsic (PR #106370)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 11 22:57:32 PDT 2024
================
@@ -0,0 +1,299 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+
+; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE,SSE2
+; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX512F
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX512BW
+
+define void @v_test_canonicalize__half(half addrspace(1)* %out) {
+; SSE-LABEL: v_test_canonicalize__half:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: subq $16, %rsp
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: .cfi_offset %rbx, -16
+; SSE-NEXT: movq %rdi, %rbx
+; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: pextrw $0, %xmm0, %eax
+; SSE-NEXT: movw %ax, (%rbx)
+; SSE-NEXT: addq $16, %rsp
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v_test_canonicalize__half:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: subq $16, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: .cfi_offset %rbx, -16
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX1-NEXT: addq $16, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: .cfi_def_cfa_offset 8
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v_test_canonicalize__half:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: subq $16, %rsp
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: .cfi_offset %rbx, -16
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX2-NEXT: addq $16, %rsp
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: .cfi_def_cfa_offset 8
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v_test_canonicalize__half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v_test_canonicalize__half:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: movzwl (%rdi), %eax
+; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: movw %ax, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ %val = load half, half addrspace(1)* %out
+ %canonicalized = call half @llvm.canonicalize.f16(half %val)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+
+define half @complex_canonicalize_fmul_half(half %a, half %b) {
+; SSE-LABEL: complex_canonicalize_fmul_half:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: subss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: popq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: complex_canonicalize_fmul_half:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX1-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload
+; AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX1-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: popq %rax
+; AVX1-NEXT: .cfi_def_cfa_offset 8
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: complex_canonicalize_fmul_half:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload
+; AVX2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: popq %rax
+; AVX2-NEXT: .cfi_def_cfa_offset 8
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: complex_canonicalize_fmul_half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512F-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: complex_canonicalize_fmul_half:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512BW-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+entry:
+
+ %mul1 = fsub half %a, %b
+ %add = fadd half %mul1, %b
+ %mul2 = fsub half %add, %mul1
+ %canonicalized = call half @llvm.canonicalize.f16(half %mul2)
+ %result = fsub half %canonicalized, %b
+ ret half %result
+}
+
+declare half @llvm.canonicalize.f16(half)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; SSE2: {{.*}}
----------------
phoebewang wrote:
Remove these unused prefixes.
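For reference, a minimal way to address this (a sketch, assuming the per-subtarget check prefixes are kept and no common SSE2/AVX check lines are wanted) is to drop the unused prefixes from the RUN lines and rerun utils/update_llc_test_checks.py:

; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE
; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1
; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX2
; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512F
; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512BW

Regenerating after that change should also drop the trailing "AVX: {{.*}}" / "SSE2: {{.*}}" placeholder lines, which the script only emits for prefixes that end up with no checks.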
https://github.com/llvm/llvm-project/pull/106370