[llvm] e499ae5 - [X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS (#76485)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 27 21:29:05 PST 2023


Author: Phoebe Wang
Date: 2023-12-28T13:29:01+08:00
New Revision: e499ae53b3e2e5cb2116b875bd6fe8b63209ef99

URL: https://github.com/llvm/llvm-project/commit/e499ae53b3e2e5cb2116b875bd6fe8b63209ef99
DIFF: https://github.com/llvm/llvm-project/commit/e499ae53b3e2e5cb2116b875bd6fe8b63209ef99.diff

LOG: [X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS (#76485)
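
The patch marks ISD::INSERT_SUBVECTOR as Legal and ISD::CONCAT_VECTORS as
Custom for the bf16 vector types, and hooks v8bf16/v16bf16 into the existing
VINSERTI128/VEXTRACTI128 lowering patterns, so concatenating or splitting
bf16 vectors no longer round-trips through the stack (visible in the
bfloat.ll diff below). A minimal IR sketch of the headline case, mirroring
the concat_v8bf16 test added by this commit (function name illustrative):

    define <16 x bfloat> @concat(<8 x bfloat> %lo, <8 x bfloat> %hi) {
      ; On AVX targets this should now select a single vinsertf128
      ; instead of spilling both halves and reloading the pair.
      %v = shufflevector <8 x bfloat> %lo, <8 x bfloat> %hi,
             <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
                         i32 7, i32 8, i32 9, i32 10, i32 11, i32 12,
                         i32 13, i32 14, i32 15>
      ret <16 x bfloat> %v
    }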

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
    llvm/test/CodeGen/X86/bfloat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63bdf24d6b4f5e..35e54ebd5129f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FDIV, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
     setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
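
Legal here means the existing AVX512 patterns can match bf16 subvector
inserts directly, while CONCAT_VECTORS goes through the Custom hook that the
other 256/512-bit types already use, which rewrites a concat as a chain of
subvector inserts; that division of labor is my reading of the shared X86
lowering, not something spelled out in the commit message. A hedged sketch
of IR that reaches ISD::INSERT_SUBVECTOR through the generic subvector
intrinsic (function name illustrative):

    declare <32 x bfloat> @llvm.vector.insert.v32bf16.v16bf16(<32 x bfloat>, <16 x bfloat>, i64)

    define <32 x bfloat> @insert_upper(<32 x bfloat> %acc, <16 x bfloat> %sub) {
      ; Inserts %sub at element 16, i.e. the upper 256-bit half;
      ; with AVX512BF16 this should now be a single subvector insert.
      %r = call <32 x bfloat> @llvm.vector.insert.v32bf16.v16bf16(<32 x bfloat> %acc, <16 x bfloat> %sub, i64 16)
      ret <32 x bfloat> %r
    }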

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0e4e6937bf44cd..b61a694630d194 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7909,6 +7909,9 @@ let Predicates = [HasAVX2, NoVLX] in {
   defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
 }
 
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;
+
 //===----------------------------------------------------------------------===//
 // VEXTRACTI128 - Extract packed integer values
 //
@@ -7931,6 +7934,9 @@ let Predicates = [HasAVX2, NoVLX] in {
   defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
 }
 
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
+
 //===----------------------------------------------------------------------===//
 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
 //
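
The two .td additions plug v8bf16/v16bf16 into the same vinsert_lowering and
vextract_lowering multiclasses used for the integer types above. The
predicate pair HasAVXNECONVERT, NoVLX covers targets that have the bf16
conversions but no EVEX subvector instructions; presumably AVX-NE-CONVERT
targets always provide the VEX-encoded VINSERTI128/VEXTRACTI128 (my
inference, the commit does not state it). The extract side, sketched with
the generic subvector-extract intrinsic (function name illustrative):

    declare <8 x bfloat> @llvm.vector.extract.v8bf16.v16bf16(<16 x bfloat>, i64)

    define <8 x bfloat> @extract_upper(<16 x bfloat> %v) {
      ; Extracts elements 8..15, i.e. the upper 128-bit half; with these
      ; patterns this should select a single vextracti128/vextractf128.
      %r = call <8 x bfloat> @llvm.vector.extract.v8bf16.v16bf16(<16 x bfloat> %v, i64 8)
      ret <8 x bfloat> %r
    }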

diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 40b512d68be816..46fabb5efede68 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -372,31 +372,11 @@ entry:
 
 ;; FIXME: This should generate the same output as above, but let's fix the crash first.
 define <16 x bfloat> @test_no_vbroadcast2() nounwind {
-; X86-LABEL: test_no_vbroadcast2:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT:    subl $64, %esp # encoding: [0x83,0xec,0x40]
-; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X86-NEXT:    vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X86-NEXT:    vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_no_vbroadcast2:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbp # encoding: [0x55]
-; X64-NEXT:    movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
-; X64-NEXT:    andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
-; X64-NEXT:    subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
-; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X64-NEXT:    vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X64-NEXT:    vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X64-NEXT:    movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
-; X64-NEXT:    popq %rbp # encoding: [0x5d]
-; X64-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: test_no_vbroadcast2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 entry:
   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer

diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 72f3eacf87594c..674a0eacb0ca98 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
 ;
 ; AVXNC-LABEL: fptrunc_v16f32:
 ; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    pushq %rbp
-; AVXNC-NEXT:    movq %rsp, %rbp
-; AVXNC-NEXT:    andq $-32, %rsp
-; AVXNC-NEXT:    subq $64, %rsp
-; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
-; AVXNC-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
 ; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT:    vmovaps %xmm0, (%rsp)
-; AVXNC-NEXT:    vmovaps (%rsp), %ymm0
-; AVXNC-NEXT:    movq %rbp, %rsp
-; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    vinsertf128 $0, %xmm0, %ymm0, %ymm0
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVXNC-NEXT:    retq
   %b = fptrunc <16 x float> %a to <16 x bfloat>
   ret <16 x bfloat> %b
@@ -2485,3 +2478,54 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
   %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <32 x bfloat> %3
 }
+
+define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_v8bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: concat_v8bf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x bfloat> %a
+}
+
+define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
+; SSE2-LABEL: extract_v32bf16_v8bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $0, %xmm1, %eax
+; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    pextrw $2, %xmm1, %eax
+; SSE2-NEXT:    pextrw $3, %xmm1, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    orl %eax, %edx
+; SSE2-NEXT:    shlq $32, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    pextrw $4, %xmm1, %eax
+; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    pextrw $6, %xmm1, %eax
+; SSE2-NEXT:    pextrw $7, %xmm1, %esi
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    orl %eax, %esi
+; SSE2-NEXT:    shlq $32, %rsi
+; SSE2-NEXT:    orq %rcx, %rsi
+; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    movq %rdx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: extract_v32bf16_v8bf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x bfloat> %a
+}


        

