[llvm] [X86][BF16] Support INSERT_SUBVECTOR and CONCAT_VECTORS (PR #76485)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 27 19:50:45 PST 2023


llvmbot wrote:



@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes:



---
Full diff: https://github.com/llvm/llvm-project/pull/76485.diff


4 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4) 
- (modified) llvm/lib/Target/X86/X86InstrSSE.td (+8-4) 
- (modified) llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll (+5-25) 
- (modified) llvm/test/CodeGen/X86/bfloat.ll (+17-10) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63bdf24d6b4f5e..35e54ebd5129f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FDIV, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
     setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0e4e6937bf44cd..e6340e773ea5d5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7160,10 +7160,6 @@ def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
           (VBROADCASTF128rm addr:$src)>;
 }
 
-let Predicates = [HasAVXNECONVERT, NoVLX] in
-  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
-            (VBROADCASTF128rm addr:$src)>;
-
 //===----------------------------------------------------------------------===//
 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
 //
@@ -7931,6 +7927,14 @@ let Predicates = [HasAVX2, NoVLX] in {
   defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
 }
 
+let Predicates = [HasAVXNECONVERT, NoVLX] in {
+  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
+            (VBROADCASTF128rm addr:$src)>;
+  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16,
+                          v16bf16, loadv8bf16,  loadv16bf16>;
+  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
+}
+
 //===----------------------------------------------------------------------===//
 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
 //
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 40b512d68be816..46fabb5efede68 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -372,31 +372,11 @@ entry:
 
 ;; FIXME: This should generate the same output as above, but let's fix the crash first.
 define <16 x bfloat> @test_no_vbroadcast2() nounwind {
-; X86-LABEL: test_no_vbroadcast2:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT:    subl $64, %esp # encoding: [0x83,0xec,0x40]
-; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X86-NEXT:    vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X86-NEXT:    vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_no_vbroadcast2:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbp # encoding: [0x55]
-; X64-NEXT:    movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
-; X64-NEXT:    andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
-; X64-NEXT:    subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
-; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X64-NEXT:    vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X64-NEXT:    vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X64-NEXT:    movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
-; X64-NEXT:    popq %rbp # encoding: [0x5d]
-; X64-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: test_no_vbroadcast2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 entry:
   %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 72f3eacf87594c..0b823898c5c9ed 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
 ;
 ; AVXNC-LABEL: fptrunc_v16f32:
 ; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    pushq %rbp
-; AVXNC-NEXT:    movq %rsp, %rbp
-; AVXNC-NEXT:    andq $-32, %rsp
-; AVXNC-NEXT:    subq $64, %rsp
-; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
-; AVXNC-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
 ; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT:    vmovaps %xmm0, (%rsp)
-; AVXNC-NEXT:    vmovaps (%rsp), %ymm0
-; AVXNC-NEXT:    movq %rbp, %rsp
-; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    vinsertf128 $0, %xmm0, %ymm0, %ymm0
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVXNC-NEXT:    retq
   %b = fptrunc <16 x float> %a to <16 x bfloat>
   ret <16 x bfloat> %b
@@ -2485,3 +2478,17 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
   %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <32 x bfloat> %3
 }
+
+define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_v8bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: concat_v8bf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x bfloat> %a
+}

``````````
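For reference, a minimal sketch of the pattern the new CONCAT_VECTORS handling targets, lifted from the concat_v8bf16 test added to bfloat.ll in this diff: concatenating two <8 x bfloat> values into one <16 x bfloat>. With the vinsert_lowering/vextract_lowering patterns added in X86InstrSSE.td, the AVX check lines expect a single vinsertf128 rather than the stack round trip visible in the old fptrunc_v16f32 output.

```llvm
; Two <8 x bfloat> halves concatenated into a <16 x bfloat>.
; Per the AVX check lines in the diff, this now selects a single
; vinsertf128 instead of spilling and reloading through the stack.
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}
```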



https://github.com/llvm/llvm-project/pull/76485


More information about the llvm-commits mailing list