[llvm] 03b1454 - [AArch64] Add tablegen patterns for bf16 trn/zip/uzp.

Wed Oct 5 13:47:45 PDT 2022

Author: David Green
Date: 2022-10-05T21:47:36+01:00
New Revision: 03b145480d05575ebc0ce817fb07ae5f02dffb20

URL: https://github.com/llvm/llvm-project/commit/03b145480d05575ebc0ce817fb07ae5f02dffb20
DIFF: https://github.com/llvm/llvm-project/commit/03b145480d05575ebc0ce817fb07ae5f02dffb20.diff

LOG: [AArch64] Add tablegen patterns for bf16 trn/zip/uzp.

This adds some missing tablegen patterns to handle trn1/trn2/zip1/zip2/uzp1/uzp2,
similar to the Arm handling in 5e1a9d319d2ee5d59a151e1d82f, but via tablegen
patterns for the AArch64 backend.

Added: 
    llvm/test/CodeGen/AArch64/bf16-shuffle.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrFormats.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index d5d007ceea249..5487527ca5702 100644

--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6840,8 +6840,12 @@ multiclass SIMDZipVector<bits<3>opc, string asm,
 
   def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
         (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4bf16 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
   def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
         (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
+  def : Pat<(v8bf16 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
   def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
         (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
   def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),

diff  --git a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
new file mode 100644
index 0000000000000..4a2d2f3b2bc18
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
@@ -0,0 +1,354 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.6a,+neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.6a,+neon,+bf16 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-eabi -mattr=+v8.6a,+neon,+fullfp16,+bf16 < %s | FileCheck %s
+
+%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
+%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }
+
+define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vzip_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip1 v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    zip2 v1.4h, v0.4h, v1.4h
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
+entry:
+  %vzip.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vzip1.i, 0, 1
+  ret %struct.float16x4x2_t %.fca.0.1.insert
+}
+
+define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vzipq_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    zip2 v1.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vzip.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vzip1.i, 0, 1
+  ret %struct.float16x8x2_t %.fca.0.1.insert
+}
+
+define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vuzp_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
+entry:
+  %vuzp.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vuzp1.i, 0, 1
+  ret %struct.float16x4x2_t %.fca.0.1.insert
+}
+
+define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vuzpq_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp2 v1.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vuzp.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vuzp1.i, 0, 1
+  ret %struct.float16x8x2_t %.fca.0.1.insert
+}
+
+define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vtrn_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    trn1 v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    trn2 v1.4h, v0.4h, v1.4h
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
+entry:
+  %vtrn.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vtrn1.i, 0, 1
+  ret %struct.float16x4x2_t %.fca.0.1.insert
+}
+
+define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vtrnq_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    trn1 v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    trn2 v1.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %vtrn.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vtrn1.i, 0, 1
+  ret %struct.float16x8x2_t %.fca.0.1.insert
+}
+
+define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) {
+; CHECK-LABEL: test_vmov_n_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
+  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %vecinit4
+}
+
+define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) {
+; CHECK-LABEL: test_vmovq_n_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
+  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit8
+}
+
+define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) {
+; CHECK-LABEL: test_vdup_n_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
+  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %vecinit4
+}
+
+define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) {
+; CHECK-LABEL: test_vdupq_n_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
+  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit8
+}
+
+define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: test_vdup_lane_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.4h, v0.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: test_vdupq_lane_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    dup v0.8h, v0.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <8 x bfloat> %shuffle
+}
+
+define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vext_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-NEXT:    ret
+entry:
+  %vext = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x bfloat> %vext
+}
+
+define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vextq_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #10
+; CHECK-NEXT:    ret
+entry:
+  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+  ret <8 x bfloat> %vext
+}
+
+define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: test_vext_aligned_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x bfloat> %vext
+}
+
+define dso_local <4 x bfloat> @test_vext_unaligned_bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: test_vext_unaligned_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #6
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x bfloat> %vext
+}
+
+define <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) {
+; CHECK-LABEL: shuffle3step0_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI16_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    mov v3.16b, v2.16b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    adrp x8, .LCPI16_1
+; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_1]
+; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-NEXT:    ret
+entry:
+  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  ret <8 x bfloat> %s1
+}
+
+define <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) {
+; CHECK-LABEL: shuffle3step1_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI17_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    mov v3.16b, v2.16b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    adrp x8, .LCPI17_1
+; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_1]
+; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-NEXT:    ret
+entry:
+  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  ret <8 x bfloat> %s1
+}
+
+define <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) {
+; CHECK-LABEL: shuffle3step2_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI18_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    mov v3.16b, v2.16b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    adrp x8, .LCPI18_1
+; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI18_1]
+; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-NEXT:    ret
+entry:
+  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  ret <8 x bfloat> %s1
+}
+
+
+define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: test_vrev64_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x bfloat> %shuffle.i
+}
+
+define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: test_vrev64q_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x bfloat> %shuffle.i
+}
+
+define dso_local <4 x bfloat> @test_vrev32_bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: test_vrev32_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x bfloat> %shuffle.i
+}
+
+define dso_local <8 x bfloat> @test_vrev32q_bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: test_vrev32q_bf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev32 v0.8h, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x bfloat> %shuffle.i
+}
+
+define <4 x bfloat> @test_vld_dup1_4xbfloat(bfloat* %b) {
+; CHECK-LABEL: test_vld_dup1_4xbfloat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ld1r { v0.4h }, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %b1 = load bfloat, bfloat* %b, align 2
+  %vecinit = insertelement <4 x bfloat> undef, bfloat %b1, i32 0
+  %vecinit2 = insertelement <4 x bfloat> %vecinit, bfloat %b1, i32 1
+  %vecinit3 = insertelement <4 x bfloat> %vecinit2, bfloat %b1, i32 2
+  %vecinit4 = insertelement <4 x bfloat> %vecinit3, bfloat %b1, i32 3
+  ret <4 x bfloat> %vecinit4
+}
+
+define <8 x bfloat> @test_vld_dup1_8xbfloat(bfloat* %b) local_unnamed_addr {
+; CHECK-LABEL: test_vld_dup1_8xbfloat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ld1r { v0.8h }, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %b1 = load bfloat, bfloat* %b, align 2
+  %vecinit = insertelement <8 x bfloat> undef, bfloat %b1, i32 0
+  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit8
+}
+
+define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) {
+; CHECK-LABEL: test_shufflevector8xbfloat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %r = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x bfloat> %r
+}
+