[llvm] 8d82f12 - [ARM][AArch64] Add tests for shuffles load patterns. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed May 31 10:42:07 PDT 2023


Author: David Green
Date: 2023-05-31T18:42:01+01:00
New Revision: 8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee

URL: https://github.com/llvm/llvm-project/commit/8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee
DIFF: https://github.com/llvm/llvm-project/commit/8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee.diff

LOG: [ARM][AArch64] Add tests for shuffles load patterns. NFC

See D151029

Added: 
    llvm/test/CodeGen/AArch64/insertshuffleload.ll
    llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
new file mode 100644
index 0000000000000..c9bdb9537157e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
@@ -0,0 +1,478 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+define <8 x i8> @inserti8_first(ptr %p) {
+; CHECK-LABEL: inserti8_first:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with p[0], so the lanes hold p[0]..p[7]
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @inserti8_last(ptr %p) {
+; CHECK-LABEL: inserti8_last:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #1
+; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; fill lane 7 with p[8], so the lanes hold p[1]..p[8]
+  ret <8 x i8> %ins
+}
+
+define <8 x i16> @inserti8_first_sext(ptr %p) {
+; CHECK-LABEL: inserti8_first_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ldrsb w8, [x0]
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT:    mov v0.h[0], w8
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %s1 = sext <8 x i8> %l1 to <8 x i16> ; sign-extend each vector lane to i16
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s2 = sext i8 %l2 to i16 ; same extension kind (sext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; lanes now hold sext(p[0])..sext(p[7])
+  ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_sext(ptr %p) {
+; CHECK-LABEL: inserti8_last_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldrsb w8, [x0, #8]
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NEXT:    mov v0.h[7], w8
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %s1 = sext <8 x i8> %l1 to <8 x i16> ; sign-extend each vector lane to i16
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s2 = sext i8 %l2 to i16 ; same extension kind (sext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; lanes now hold sext(p[1])..sext(p[8])
+  ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_first_zext(ptr %p) {
+; CHECK-LABEL: inserti8_first_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT:    mov v0.h[0], w8
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %s1 = zext <8 x i8> %l1 to <8 x i16> ; zero-extend each vector lane to i16
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s2 = zext i8 %l2 to i16 ; same extension kind (zext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; lanes now hold zext(p[0])..zext(p[7])
+  ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_zext(ptr %p) {
+; CHECK-LABEL: inserti8_last_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldrb w8, [x0, #8]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NEXT:    mov v0.h[7], w8
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %s1 = zext <8 x i8> %l1 to <8 x i16> ; zero-extend each vector lane to i16
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s2 = zext i8 %l2 to i16 ; same extension kind (zext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; lanes now hold zext(p[1])..zext(p[8])
+  ret <8 x i16> %ins
+}
+
+define <8 x i32> @inserti32_first(ptr %p) {
+; CHECK-LABEL: inserti32_first:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur q1, [x0, #4]
+; CHECK-NEXT:    ldur q2, [x0, #20]
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 4 ; %q = %p + 4 bytes, i.e. one i32 past %p
+  %l1 = load <8 x i32>, ptr %q ; eight i32 values starting at %p + 4
+  %l2 = load i32, ptr %p ; the i32 at %p
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; lanes now hold the eight consecutive i32 values starting at %p
+  ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_last(ptr %p) {
+; CHECK-LABEL: inserti32_last:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    add x8, x0, #32
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #4
+; CHECK-NEXT:    ld1 { v1.s }[3], [x8]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 32 ; %q = %p + 32 bytes, i.e. just past the 8 x i32 vector at %p
+  %l1 = load <8 x i32>, ptr %p ; eight i32 values starting at %p
+  %l2 = load i32, ptr %q ; the i32 at %p + 32
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; lanes now hold the eight consecutive i32 values starting at %p + 4
+  ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_first_multiuse(ptr %p) {
+; CHECK-LABEL: inserti32_first_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur q0, [x0, #4]
+; CHECK-NEXT:    ldur q1, [x0, #20]
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT:    ld1 { v2.s }[0], [x0]
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 4 ; %q = %p + 4 bytes, i.e. one i32 past %p
+  %l1 = load <8 x i32>, ptr %q ; eight i32 values starting at %p + 4; used by both the shuffle and the add below
+  %l2 = load i32, ptr %p ; the i32 at %p
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; lanes now hold the eight consecutive i32 values starting at %p
+  %a = add <8 x i32> %l1, %ins ; second use of %l1, so the original vector load stays live
+  ret <8 x i32> %a
+}
+
+define <8 x i32> @inserti32_last_multiuse(ptr %p) {
+; CHECK-LABEL: inserti32_last_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    add x8, x0, #32
+; CHECK-NEXT:    ext v2.16b, v1.16b, v0.16b, #4
+; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #4
+; CHECK-NEXT:    ld1 { v2.s }[3], [x8]
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 32 ; %q = %p + 32 bytes, i.e. just past the 8 x i32 vector at %p
+  %l1 = load <8 x i32>, ptr %p ; eight i32 values starting at %p; used by both the shuffle and the add below
+  %l2 = load i32, ptr %q ; the i32 at %p + 32
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; lanes now hold the eight consecutive i32 values starting at %p + 4
+  %a = add <8 x i32> %l1, %ins ; second use of %l1, so the original vector load stays live
+  ret <8 x i32> %a
+}
+
+define <4 x float> @insertf32_first(ptr %p) {
+; CHECK-LABEL: insertf32_first:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur q0, [x0, #4]
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 4 ; %q = %p + 4 bytes, i.e. one float past %p
+  %l1 = load <4 x float>, ptr %q ; four floats starting at %p + 4
+  %l2 = load float, ptr %p ; the float at %p
+  %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 2> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <4 x float> %s, float %l2, i32 0 ; lanes now hold the four consecutive floats starting at %p
+  ret <4 x float> %ins
+}
+
+define <4 x float> @insertf32_last(ptr %p) {
+; CHECK-LABEL: insertf32_last:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    ld1 { v0.s }[3], [x8]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 16 ; %q = %p + 16 bytes, i.e. just past the 4 x float vector at %p
+  %l1 = load <4 x float>, ptr %p ; four floats starting at %p
+  %l2 = load float, ptr %q ; the float at %p + 16
+  %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef> ; move every lane down by one; lane 3 is left undef
+  %ins = insertelement <4 x float> %s, float %l2, i32 3 ; lanes now hold the four consecutive floats starting at %p + 4
+  ret <4 x float> %ins
+}
+
+define <2 x i64> @inserti64_first(ptr %p) {
+; CHECK-LABEL: inserti64_first:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    ld1r { v0.2d }, [x8]
+; CHECK-NEXT:    ld1 { v0.d }[0], [x0]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes, i.e. one i64 past %p
+  %l1 = load <2 x i64>, ptr %q ; two i64 values starting at %p + 8
+  %l2 = load i64, ptr %p ; the i64 at %p
+  %s = shufflevector <2 x i64> %l1, <2 x i64> undef, <2 x i32> <i32 undef, i32 0> ; lane 1 takes element 0; lane 0 is left undef
+  %ins = insertelement <2 x i64> %s, i64 %l2, i32 0 ; lanes now hold the two consecutive i64 values starting at %p
+  ret <2 x i64> %ins
+}
+
+define <2 x i64> @inserti64_last(ptr %p) {
+; CHECK-LABEL: inserti64_last:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    dup v0.2d, v0.d[1]
+; CHECK-NEXT:    ld1 { v0.d }[1], [x8]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 16 ; %q = %p + 16 bytes, i.e. just past the 2 x i64 vector at %p
+  %l1 = load <2 x i64>, ptr %p ; two i64 values starting at %p
+  %l2 = load i64, ptr %q ; the i64 at %p + 16
+  %s = shufflevector <2 x i64> %l1, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> ; lane 0 takes element 1; lane 1 is left undef
+  %ins = insertelement <2 x i64> %s, i64 %l2, i32 1 ; lanes now hold the two consecutive i64 values starting at %p + 8
+  ret <2 x i64> %ins
+}
+
+define <8 x i8> @inserti8_first_undef(ptr %p) {
+; CHECK-LABEL: inserti8_first_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 undef, i32 3, i32 4, i32 5, i32 6> ; shift-up-by-one mask, except lane 3 is undef instead of selecting element 2
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with p[0]; every defined lane i holds p[i]
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @inserti8_last_undef(ptr %p) {
+; CHECK-LABEL: inserti8_last_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    dup v0.8b, v0.b[1]
+; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; only lane 0 is defined (element 1); lanes 1..7 are undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; lane 0 = p[1], lane 7 = p[8], lanes 1..6 remain undef
+  ret <8 x i8> %ins
+}
+
+
+
+define <8 x i16> @wrong_zextandsext(ptr %p) {
+; CHECK-LABEL: wrong_zextandsext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ldrsb w8, [x0]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT:    mov v0.h[0], w8
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %s1 = zext <8 x i8> %l1 to <8 x i16> ; vector lanes are zero-extended ...
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s2 = sext i8 %l2 to i16 ; ... but the scalar is sign-extended, so the extension kinds do not match
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; lane 0 = sext(p[0]); lanes 1..7 = zext(p[1])..zext(p[7])
+  ret <8 x i16> %ins
+}
+
+define <8 x i8> @wrongidx_first(ptr %p) {
+; CHECK-LABEL: wrongidx_first:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[7], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; p[0] goes into lane 7 rather than the undef lane 0, so the lanes are not consecutive memory
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @wrong_last(ptr %p) {
+; CHECK-LABEL: wrong_last:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #1
+; CHECK-NEXT:    ld1 { v0.b }[0], [x8]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; p[8] goes into lane 0 rather than the undef lane 7, so the lanes are not consecutive memory
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @wrong_shuffle(ptr %p) {
+; CHECK-LABEL: wrong_shuffle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    adrp x8, .LCPI19_0
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6> ; lane 3 selects element 3 instead of 2, so this is not a plain shift-up-by-one mask
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with p[0]
+  ret <8 x i8> %ins
+}
+
+define <8 x i16> @wrong_exttype(ptr %p) {
+; CHECK-LABEL: wrong_exttype:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %s1 = sext <8 x i8> %l1 to <8 x i16> ; vector lanes are i8 values sign-extended to i16
+  %l2 = load i16, ptr %p ; scalar is a full i16 load from %p with no extension, unlike the vector lanes
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %l2, i32 0 ; lane 0 = the raw i16 at %p; lanes 1..7 = sext(p[1])..sext(p[7])
+  ret <8 x i16> %ins
+}
+
+define <4 x i32> @wrong_exttype2(ptr %p) {
+; CHECK-LABEL: wrong_exttype2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur s0, [x0, #1]
+; CHECK-NEXT:    ldrsh w8, [x0]
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT:    mov v0.s[0], w8
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <4 x i8>, ptr %q ; vector load of bytes p[1]..p[4]
+  %s1 = sext <4 x i8> %l1 to <4 x i32> ; vector lanes are sign-extended from i8 ...
+  %l2 = load i16, ptr %p ; scalar load of an i16 at %p
+  %s2 = sext i16 %l2 to i32 ; ... but the scalar is sign-extended from i16, so the source widths differ
+  %s = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 2> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <4 x i32> %s, i32 %s2, i32 0 ; lane 0 = sext of the i16 at %p; lanes 1..3 = sext(p[1])..sext(p[3])
+  ret <4 x i32> %ins
+}
+
+define <8 x i8> @wrong_offsetfirst(ptr %p) {
+; CHECK-LABEL: wrong_offsetfirst:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #-1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 -1 ; %q = %p - 1 byte (the matching "first" tests use %p + 1)
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[-1]..p[6]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; lane 0 = p[0] but lane 1 = p[-1], so the lanes are not consecutive memory
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @wrong_offsetlast(ptr %p) {
+; CHECK-LABEL: wrong_offsetlast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    add x8, x0, #7
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #1
+; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 7 ; %q = %p + 7 bytes (the matching "last" tests use %p + 8)
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %l2 = load i8, ptr %q ; scalar load of byte p[7], which overlaps the last vector byte
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; lanes 6 and 7 both hold p[7], so the lanes are not consecutive memory
+  ret <8 x i8> %ins
+}
+
+
+define <8 x i8> @storebetween(ptr %p, ptr %r) {
+; CHECK-LABEL: storebetween:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    strb wzr, [x1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  store i8 0, ptr %r ; store sits between the two loads; %r carries no noalias attribute
+  %l2 = load i8, ptr %p ; scalar load of byte p[0], after the store
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with the post-store value of p[0]
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @storebefore(ptr %p, ptr %r) {
+; CHECK-LABEL: storebefore:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    strb wzr, [x1]
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  store i8 0, ptr %r ; store precedes both loads
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with p[0], so the lanes hold p[0]..p[7]
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @storeafter(ptr %p, ptr %r) {
+; CHECK-LABEL: storeafter:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldur d0, [x0, #1]
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    strb wzr, [x1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  store i8 0, ptr %r ; store follows both loads
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with p[0], so the lanes hold p[0]..p[7]
+  ret <8 x i8> %ins
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
new file mode 100644
index 0000000000000..7714f8dd92d73
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabihf -mattr=+mve.fp | FileCheck %s --check-prefix=CHECKLE
+; RUN: llc < %s -mtriple=thumbebv8.1m.main-none-eabihf -mattr=+mve.fp | FileCheck %s --check-prefix=CHECKBE
+
+
+define <8 x i8> @inserti8_first(ptr %p) {
+; CHECKLE-LABEL: inserti8_first:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrb.u16 q1, [r0, #1]
+; CHECKLE-NEXT:    ldrb r1, [r0]
+; CHECKLE-NEXT:    vmovx.f16 s10, s5
+; CHECKLE-NEXT:    vmovx.f16 s8, s4
+; CHECKLE-NEXT:    vins.f16 s10, s6
+; CHECKLE-NEXT:    vmovx.f16 s6, s6
+; CHECKLE-NEXT:    vmov.16 q0[0], r1
+; CHECKLE-NEXT:    vins.f16 s8, s5
+; CHECKLE-NEXT:    vins.f16 s6, s7
+; CHECKLE-NEXT:    vmov.f32 s1, s8
+; CHECKLE-NEXT:    vmov.f32 s2, s10
+; CHECKLE-NEXT:    vins.f16 s0, s4
+; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti8_first:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrb.u16 q0, [r0, #1]
+; CHECKBE-NEXT:    ldrb r1, [r0]
+; CHECKBE-NEXT:    vmovx.f16 s6, s1
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vins.f16 s6, s2
+; CHECKBE-NEXT:    vmovx.f16 s2, s2
+; CHECKBE-NEXT:    vmov.16 q2[0], r1
+; CHECKBE-NEXT:    vins.f16 s4, s1
+; CHECKBE-NEXT:    vins.f16 s2, s3
+; CHECKBE-NEXT:    vins.f16 s8, s0
+; CHECKBE-NEXT:    vmov.f32 s9, s4
+; CHECKBE-NEXT:    vmov.f32 s10, s6
+; CHECKBE-NEXT:    vmov.f32 s11, s2
+; CHECKBE-NEXT:    vrev64.16 q0, q2
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; fill lane 0 with p[0], so the lanes hold p[0]..p[7]
+  ret <8 x i8> %ins
+}
+
+define <8 x i8> @inserti8_last(ptr %p) {
+; CHECKLE-LABEL: inserti8_last:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrb.u16 q1, [r0]
+; CHECKLE-NEXT:    ldrb r1, [r0, #8]
+; CHECKLE-NEXT:    vmovx.f16 s0, s4
+; CHECKLE-NEXT:    vmovx.f16 s1, s5
+; CHECKLE-NEXT:    vmovx.f16 s2, s6
+; CHECKLE-NEXT:    vins.f16 s0, s5
+; CHECKLE-NEXT:    vins.f16 s1, s6
+; CHECKLE-NEXT:    vins.f16 s2, s7
+; CHECKLE-NEXT:    vmov.u16 r0, q1[7]
+; CHECKLE-NEXT:    vmov.16 q0[6], r0
+; CHECKLE-NEXT:    vmov.16 q0[7], r1
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti8_last:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrb.u16 q0, [r0]
+; CHECKBE-NEXT:    ldrb r1, [r0, #8]
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vmovx.f16 s5, s1
+; CHECKBE-NEXT:    vmovx.f16 s6, s2
+; CHECKBE-NEXT:    vins.f16 s4, s1
+; CHECKBE-NEXT:    vins.f16 s5, s2
+; CHECKBE-NEXT:    vins.f16 s6, s3
+; CHECKBE-NEXT:    vmov.u16 r0, q0[7]
+; CHECKBE-NEXT:    vmov.16 q1[6], r0
+; CHECKBE-NEXT:    vmov.16 q1[7], r1
+; CHECKBE-NEXT:    vrev64.16 q0, q1
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; fill lane 7 with p[8], so the lanes hold p[1]..p[8]
+  ret <8 x i8> %ins
+}
+
+define <8 x i16> @inserti8_first_sext(ptr %p) {
+; CHECKLE-LABEL: inserti8_first_sext:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrb.s16 q1, [r0, #1]
+; CHECKLE-NEXT:    ldrsb.w r1, [r0]
+; CHECKLE-NEXT:    vmovx.f16 s10, s5
+; CHECKLE-NEXT:    vmovx.f16 s8, s4
+; CHECKLE-NEXT:    vins.f16 s10, s6
+; CHECKLE-NEXT:    vmovx.f16 s6, s6
+; CHECKLE-NEXT:    vmov.16 q0[0], r1
+; CHECKLE-NEXT:    vins.f16 s8, s5
+; CHECKLE-NEXT:    vins.f16 s6, s7
+; CHECKLE-NEXT:    vmov.f32 s1, s8
+; CHECKLE-NEXT:    vmov.f32 s2, s10
+; CHECKLE-NEXT:    vins.f16 s0, s4
+; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti8_first_sext:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrb.s16 q0, [r0, #1]
+; CHECKBE-NEXT:    ldrsb.w r1, [r0]
+; CHECKBE-NEXT:    vmovx.f16 s6, s1
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vins.f16 s6, s2
+; CHECKBE-NEXT:    vmovx.f16 s2, s2
+; CHECKBE-NEXT:    vmov.16 q2[0], r1
+; CHECKBE-NEXT:    vins.f16 s4, s1
+; CHECKBE-NEXT:    vins.f16 s2, s3
+; CHECKBE-NEXT:    vins.f16 s8, s0
+; CHECKBE-NEXT:    vmov.f32 s9, s4
+; CHECKBE-NEXT:    vmov.f32 s10, s6
+; CHECKBE-NEXT:    vmov.f32 s11, s2
+; CHECKBE-NEXT:    vrev64.16 q0, q2
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %s1 = sext <8 x i8> %l1 to <8 x i16> ; sign-extend each vector lane to i16
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s2 = sext i8 %l2 to i16 ; same extension kind (sext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; lanes now hold sext(p[0])..sext(p[7])
+  ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_sext(ptr %p) {
+; CHECKLE-LABEL: inserti8_last_sext:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrb.s16 q1, [r0]
+; CHECKLE-NEXT:    ldrsb.w r1, [r0, #8]
+; CHECKLE-NEXT:    vmovx.f16 s0, s4
+; CHECKLE-NEXT:    vmovx.f16 s1, s5
+; CHECKLE-NEXT:    vmovx.f16 s2, s6
+; CHECKLE-NEXT:    vins.f16 s0, s5
+; CHECKLE-NEXT:    vins.f16 s1, s6
+; CHECKLE-NEXT:    vins.f16 s2, s7
+; CHECKLE-NEXT:    vmov.u16 r0, q1[7]
+; CHECKLE-NEXT:    vmov.16 q0[6], r0
+; CHECKLE-NEXT:    vmov.16 q0[7], r1
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti8_last_sext:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrb.s16 q0, [r0]
+; CHECKBE-NEXT:    ldrsb.w r1, [r0, #8]
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vmovx.f16 s5, s1
+; CHECKBE-NEXT:    vmovx.f16 s6, s2
+; CHECKBE-NEXT:    vins.f16 s4, s1
+; CHECKBE-NEXT:    vins.f16 s5, s2
+; CHECKBE-NEXT:    vins.f16 s6, s3
+; CHECKBE-NEXT:    vmov.u16 r0, q0[7]
+; CHECKBE-NEXT:    vmov.16 q1[6], r0
+; CHECKBE-NEXT:    vmov.16 q1[7], r1
+; CHECKBE-NEXT:    vrev64.16 q0, q1
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %s1 = sext <8 x i8> %l1 to <8 x i16> ; sign-extend each vector lane to i16
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s2 = sext i8 %l2 to i16 ; same extension kind (sext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; lanes now hold sext(p[1])..sext(p[8])
+  ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_first_zext(ptr %p) {
+; CHECKLE-LABEL: inserti8_first_zext:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrb.u16 q1, [r0, #1]
+; CHECKLE-NEXT:    ldrb r1, [r0]
+; CHECKLE-NEXT:    vmovx.f16 s10, s5
+; CHECKLE-NEXT:    vmovx.f16 s8, s4
+; CHECKLE-NEXT:    vins.f16 s10, s6
+; CHECKLE-NEXT:    vmovx.f16 s6, s6
+; CHECKLE-NEXT:    vmov.16 q0[0], r1
+; CHECKLE-NEXT:    vins.f16 s8, s5
+; CHECKLE-NEXT:    vins.f16 s6, s7
+; CHECKLE-NEXT:    vmov.f32 s1, s8
+; CHECKLE-NEXT:    vmov.f32 s2, s10
+; CHECKLE-NEXT:    vins.f16 s0, s4
+; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti8_first_zext:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrb.u16 q0, [r0, #1]
+; CHECKBE-NEXT:    ldrb r1, [r0]
+; CHECKBE-NEXT:    vmovx.f16 s6, s1
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vins.f16 s6, s2
+; CHECKBE-NEXT:    vmovx.f16 s2, s2
+; CHECKBE-NEXT:    vmov.16 q2[0], r1
+; CHECKBE-NEXT:    vins.f16 s4, s1
+; CHECKBE-NEXT:    vins.f16 s2, s3
+; CHECKBE-NEXT:    vins.f16 s8, s0
+; CHECKBE-NEXT:    vmov.f32 s9, s4
+; CHECKBE-NEXT:    vmov.f32 s10, s6
+; CHECKBE-NEXT:    vmov.f32 s11, s2
+; CHECKBE-NEXT:    vrev64.16 q0, q2
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 1 ; %q = %p + 1 byte
+  %l1 = load <8 x i8>, ptr %q ; vector load of bytes p[1]..p[8]
+  %s1 = zext <8 x i8> %l1 to <8 x i16> ; zero-extend each vector lane to i16
+  %l2 = load i8, ptr %p ; scalar load of byte p[0]
+  %s2 = zext i8 %l2 to i16 ; same extension kind (zext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; lanes now hold zext(p[0])..zext(p[7])
+  ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_zext(ptr %p) {
+; CHECKLE-LABEL: inserti8_last_zext:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrb.u16 q1, [r0]
+; CHECKLE-NEXT:    ldrb r1, [r0, #8]
+; CHECKLE-NEXT:    vmovx.f16 s0, s4
+; CHECKLE-NEXT:    vmovx.f16 s1, s5
+; CHECKLE-NEXT:    vmovx.f16 s2, s6
+; CHECKLE-NEXT:    vins.f16 s0, s5
+; CHECKLE-NEXT:    vins.f16 s1, s6
+; CHECKLE-NEXT:    vins.f16 s2, s7
+; CHECKLE-NEXT:    vmov.u16 r0, q1[7]
+; CHECKLE-NEXT:    vmov.16 q0[6], r0
+; CHECKLE-NEXT:    vmov.16 q0[7], r1
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti8_last_zext:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrb.u16 q0, [r0]
+; CHECKBE-NEXT:    ldrb r1, [r0, #8]
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vmovx.f16 s5, s1
+; CHECKBE-NEXT:    vmovx.f16 s6, s2
+; CHECKBE-NEXT:    vins.f16 s4, s1
+; CHECKBE-NEXT:    vins.f16 s5, s2
+; CHECKBE-NEXT:    vins.f16 s6, s3
+; CHECKBE-NEXT:    vmov.u16 r0, q0[7]
+; CHECKBE-NEXT:    vmov.16 q1[6], r0
+; CHECKBE-NEXT:    vmov.16 q1[7], r1
+; CHECKBE-NEXT:    vrev64.16 q0, q1
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 8 ; %q = %p + 8 bytes
+  %l1 = load <8 x i8>, ptr %p ; vector load of bytes p[0]..p[7]
+  %s1 = zext <8 x i8> %l1 to <8 x i16> ; zero-extend each vector lane to i16
+  %l2 = load i8, ptr %q ; scalar load of byte p[8]
+  %s2 = zext i8 %l2 to i16 ; same extension kind (zext) as the vector
+  %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; lanes now hold zext(p[1])..zext(p[8])
+  ret <8 x i16> %ins
+}
+
+define <8 x i32> @inserti32_first(ptr %p) {
+; CHECKLE-LABEL: inserti32_first:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #4]
+; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #20]
+; CHECKLE-NEXT:    ldr r1, [r0]
+; CHECKLE-NEXT:    vmov.f32 s1, s4
+; CHECKLE-NEXT:    vmov.f32 s2, s5
+; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    vmov.f32 s4, s7
+; CHECKLE-NEXT:    vmov.32 q0[0], r1
+; CHECKLE-NEXT:    vmov.f32 s5, s8
+; CHECKLE-NEXT:    vmov.f32 s6, s9
+; CHECKLE-NEXT:    vmov.f32 s7, s10
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti32_first:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #20]
+; CHECKBE-NEXT:    vldrw.u32 q2, [r0, #4]
+; CHECKBE-NEXT:    ldr r1, [r0]
+; CHECKBE-NEXT:    vmov.f32 s12, s11
+; CHECKBE-NEXT:    vmov.f32 s13, s0
+; CHECKBE-NEXT:    vmov.f32 s14, s1
+; CHECKBE-NEXT:    vmov.f32 s15, s2
+; CHECKBE-NEXT:    vrev64.32 q1, q3
+; CHECKBE-NEXT:    vmov.f32 s13, s8
+; CHECKBE-NEXT:    vmov.f32 s14, s9
+; CHECKBE-NEXT:    vmov.f32 s15, s10
+; CHECKBE-NEXT:    vmov.32 q3[0], r1
+; CHECKBE-NEXT:    vrev64.32 q0, q3
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 4 ; %q = %p + 4 bytes, i.e. one i32 past %p
+  %l1 = load <8 x i32>, ptr %q ; eight i32 values starting at %p + 4
+  %l2 = load i32, ptr %p ; the i32 at %p
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; lanes now hold the eight consecutive i32 values starting at %p
+  ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_last(ptr %p) {
+; CHECKLE-LABEL: inserti32_last:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKLE-NEXT:    ldr r1, [r0, #32]
+; CHECKLE-NEXT:    vmov.f32 s0, s1
+; CHECKLE-NEXT:    vmov.f32 s1, s2
+; CHECKLE-NEXT:    vmov.f32 s2, s3
+; CHECKLE-NEXT:    vmov.f32 s3, s8
+; CHECKLE-NEXT:    vmov.f32 s4, s9
+; CHECKLE-NEXT:    vmov.f32 s5, s10
+; CHECKLE-NEXT:    vmov.f32 s6, s11
+; CHECKLE-NEXT:    vmov.32 q1[3], r1
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti32_last:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKBE-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECKBE-NEXT:    ldr r1, [r0, #32]
+; CHECKBE-NEXT:    vmov.f32 s8, s1
+; CHECKBE-NEXT:    vmov.f32 s9, s2
+; CHECKBE-NEXT:    vmov.f32 s10, s3
+; CHECKBE-NEXT:    vmov.f32 s11, s4
+; CHECKBE-NEXT:    vrev64.32 q0, q2
+; CHECKBE-NEXT:    vmov.f32 s8, s5
+; CHECKBE-NEXT:    vmov.f32 s9, s6
+; CHECKBE-NEXT:    vmov.f32 s10, s7
+; CHECKBE-NEXT:    vmov.32 q2[3], r1
+; CHECKBE-NEXT:    vrev64.32 q1, q2
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 32 ; %q = %p + 32 bytes, i.e. just past the 8 x i32 vector at %p
+  %l1 = load <8 x i32>, ptr %p ; eight i32 values starting at %p
+  %l2 = load i32, ptr %q ; the i32 at %p + 32
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; move every lane down by one; lane 7 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; lanes now hold the eight consecutive i32 values starting at %p + 4
+  ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_first_multiuse(ptr %p) {
+; CHECKLE-LABEL: inserti32_first_multiuse:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #20]
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECKLE-NEXT:    ldr r1, [r0]
+; CHECKLE-NEXT:    vmov.f32 s8, s3
+; CHECKLE-NEXT:    vmov.f32 s9, s4
+; CHECKLE-NEXT:    vmov.f32 s10, s5
+; CHECKLE-NEXT:    vmov.f32 s11, s6
+; CHECKLE-NEXT:    vadd.i32 q1, q1, q2
+; CHECKLE-NEXT:    vmov.f32 s9, s0
+; CHECKLE-NEXT:    vmov.f32 s10, s1
+; CHECKLE-NEXT:    vmov.f32 s11, s2
+; CHECKLE-NEXT:    vmov.32 q2[0], r1
+; CHECKLE-NEXT:    vadd.i32 q0, q0, q2
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti32_first_multiuse:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrw.u32 q1, [r0, #20]
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECKBE-NEXT:    ldr r1, [r0]
+; CHECKBE-NEXT:    vmov.f32 s8, s3
+; CHECKBE-NEXT:    vmov.f32 s9, s4
+; CHECKBE-NEXT:    vmov.f32 s10, s5
+; CHECKBE-NEXT:    vmov.f32 s11, s6
+; CHECKBE-NEXT:    vadd.i32 q2, q1, q2
+; CHECKBE-NEXT:    vrev64.32 q1, q2
+; CHECKBE-NEXT:    vmov.f32 s9, s0
+; CHECKBE-NEXT:    vmov.f32 s10, s1
+; CHECKBE-NEXT:    vmov.f32 s11, s2
+; CHECKBE-NEXT:    vmov.32 q2[0], r1
+; CHECKBE-NEXT:    vadd.i32 q2, q0, q2
+; CHECKBE-NEXT:    vrev64.32 q0, q2
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 4 ; %q = %p + 4 bytes, i.e. one i32 past %p
+  %l1 = load <8 x i32>, ptr %q ; eight i32 values starting at %p + 4; used by both the shuffle and the add below
+  %l2 = load i32, ptr %p ; the i32 at %p
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; move every lane up by one; lane 0 is left undef
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; lanes now hold the eight consecutive i32 values starting at %p
+  %a = add <8 x i32> %l1, %ins ; second use of %l1, so the original vector load stays live
+  ret <8 x i32> %a
+}
+
+; Loads an <8 x i32> from %p and a scalar i32 from %p+32 (the element just
+; past the vector), shifts the vector down by one lane (lane 7 of the shuffle
+; result is undef) and inserts the scalar into lane 7, so %ins holds the eight
+; consecutive i32 values starting at %p+4.
+; %l1 is additionally an operand of the final add, so the wide load has a
+; second use besides the shuffle.
+; Expected assembly is pinned for two check prefixes (presumably the little-
+; and big-endian runs; the RUN lines are outside this excerpt).
+define <8 x i32> @inserti32_last_multiuse(ptr %p) {
+; CHECKLE-LABEL: inserti32_last_multiuse:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECKLE-NEXT:    ldr r1, [r0, #32]
+; CHECKLE-NEXT:    vmov.f32 s8, s1
+; CHECKLE-NEXT:    vmov.f32 s9, s2
+; CHECKLE-NEXT:    vmov.f32 s10, s3
+; CHECKLE-NEXT:    vmov.f32 s11, s4
+; CHECKLE-NEXT:    vadd.i32 q0, q0, q2
+; CHECKLE-NEXT:    vmov.f32 s8, s5
+; CHECKLE-NEXT:    vmov.f32 s9, s6
+; CHECKLE-NEXT:    vmov.f32 s10, s7
+; CHECKLE-NEXT:    vmov.32 q2[3], r1
+; CHECKLE-NEXT:    vadd.i32 q1, q1, q2
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: inserti32_last_multiuse:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKBE-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECKBE-NEXT:    ldr r1, [r0, #32]
+; CHECKBE-NEXT:    vmov.f32 s8, s1
+; CHECKBE-NEXT:    vmov.f32 s9, s2
+; CHECKBE-NEXT:    vmov.f32 s10, s3
+; CHECKBE-NEXT:    vmov.f32 s11, s4
+; CHECKBE-NEXT:    vadd.i32 q2, q0, q2
+; CHECKBE-NEXT:    vrev64.32 q0, q2
+; CHECKBE-NEXT:    vmov.f32 s8, s5
+; CHECKBE-NEXT:    vmov.f32 s9, s6
+; CHECKBE-NEXT:    vmov.f32 s10, s7
+; CHECKBE-NEXT:    vmov.32 q2[3], r1
+; CHECKBE-NEXT:    vadd.i32 q2, q1, q2
+; CHECKBE-NEXT:    vrev64.32 q1, q2
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 32
+  %l1 = load <8 x i32>, ptr %p
+  %l2 = load i32, ptr %q
+  %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+  %ins = insertelement <8 x i32> %s, i32 %l2, i32 7
+  %a = add <8 x i32> %l1, %ins
+  ret <8 x i32> %a
+}
+
+; Floating-point variant of the "insert first" pattern: loads a <4 x float>
+; from %p+4 and a scalar float from %p, shifts the vector up by one lane
+; (lane 0 of the shuffle result is undef) and inserts the scalar into lane 0,
+; so the returned vector holds the four consecutive floats starting at %p.
+; Expected assembly is pinned for two check prefixes (presumably the little-
+; and big-endian runs; the RUN lines are outside this excerpt).
+define <4 x float> @insertf32_first(ptr %p) {
+; CHECKLE-LABEL: insertf32_first:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #4]
+; CHECKLE-NEXT:    vldr s0, [r0]
+; CHECKLE-NEXT:    vmov.f32 s1, s4
+; CHECKLE-NEXT:    vmov.f32 s2, s5
+; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: insertf32_first:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #4]
+; CHECKBE-NEXT:    vldr s4, [r0]
+; CHECKBE-NEXT:    vmov.f32 s5, s0
+; CHECKBE-NEXT:    vmov.f32 s6, s1
+; CHECKBE-NEXT:    vmov.f32 s7, s2
+; CHECKBE-NEXT:    vrev64.32 q0, q1
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 4
+  %l1 = load <4 x float>, ptr %q
+  %l2 = load float, ptr %p
+  %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 2>
+  %ins = insertelement <4 x float> %s, float %l2, i32 0
+  ret <4 x float> %ins
+}
+
+; Floating-point variant of the "insert last" pattern: loads a <4 x float>
+; from %p and a scalar float from %p+16 (the element just past the vector),
+; shifts the vector down by one lane (lane 3 of the shuffle result is undef)
+; and inserts the scalar into lane 3, so the returned vector holds the four
+; consecutive floats starting at %p+4.
+; Expected assembly is pinned for two check prefixes (presumably the little-
+; and big-endian runs; the RUN lines are outside this excerpt).
+define <4 x float> @insertf32_last(ptr %p) {
+; CHECKLE-LABEL: insertf32_last:
+; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    vldrw.u32 q1, [r0]
+; CHECKLE-NEXT:    vldr s3, [r0, #16]
+; CHECKLE-NEXT:    vmov.f32 s0, s5
+; CHECKLE-NEXT:    vmov.f32 s1, s6
+; CHECKLE-NEXT:    vmov.f32 s2, s7
+; CHECKLE-NEXT:    bx lr
+;
+; CHECKBE-LABEL: insertf32_last:
+; CHECKBE:       @ %bb.0:
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKBE-NEXT:    vldr s7, [r0, #16]
+; CHECKBE-NEXT:    vmov.f32 s4, s1
+; CHECKBE-NEXT:    vmov.f32 s5, s2
+; CHECKBE-NEXT:    vmov.f32 s6, s3
+; CHECKBE-NEXT:    vrev64.32 q0, q1
+; CHECKBE-NEXT:    bx lr
+  %q = getelementptr inbounds i8, ptr %p, i32 16
+  %l1 = load <4 x float>, ptr %p
+  %l2 = load float, ptr %q
+  %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
+  %ins = insertelement <4 x float> %s, float %l2, i32 3
+  ret <4 x float> %ins
+}


        


More information about the llvm-commits mailing list