[llvm] 8d82f12 - [ARM][AArch64] Add tests for shuffles load patterns. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed May 31 10:42:07 PDT 2023
Author: David Green
Date: 2023-05-31T18:42:01+01:00
New Revision: 8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee
URL: https://github.com/llvm/llvm-project/commit/8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee
DIFF: https://github.com/llvm/llvm-project/commit/8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee.diff
LOG: [ARM][AArch64] Add tests for shuffles load patterns. NFC
See D151029
Added:
llvm/test/CodeGen/AArch64/insertshuffleload.ll
llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
new file mode 100644
index 0000000000000..c9bdb9537157e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
@@ -0,0 +1,478 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+define <8 x i8> @inserti8_first(ptr %p) { ; 8 bytes loaded from p+1 are moved up one lane and byte p[0] is inserted into lane 0
+; CHECK-LABEL: inserti8_first:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; result lanes are p[0..7], the bytes one <8 x i8> load from p would read
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @inserti8_last(ptr %p) { ; 8 bytes loaded from p are moved down one lane and byte p[8] is inserted into lane 7
+; CHECK-LABEL: inserti8_last:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; result lanes are p[1..8], the bytes one <8 x i8> load from p+1 would read
+ ret <8 x i8> %ins
+}
+
+define <8 x i16> @inserti8_first_sext(ptr %p) { ; as the plain first-lane case, but vector and scalar are both sign-extended i8 -> i16 before the shuffle/insert
+; CHECK-LABEL: inserti8_first_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %s1 = sext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with sign extension
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s2 = sext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; result lanes are sext(p[0..7])
+ ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_sext(ptr %p) { ; as the plain last-lane case, but vector and scalar are both sign-extended i8 -> i16 before the shuffle/insert
+; CHECK-LABEL: inserti8_last_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldrsb w8, [x0, #8]
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NEXT: mov v0.h[7], w8
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %s1 = sext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with sign extension
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s2 = sext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes s1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; result lanes are sext(p[1..8])
+ ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_first_zext(ptr %p) { ; as the plain first-lane case, but vector and scalar are both zero-extended i8 -> i16 before the shuffle/insert
+; CHECK-LABEL: inserti8_first_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %s1 = zext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with zero extension
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s2 = zext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; result lanes are zext(p[0..7])
+ ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_zext(ptr %p) { ; as the plain last-lane case, but vector and scalar are both zero-extended i8 -> i16 before the shuffle/insert
+; CHECK-LABEL: inserti8_last_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldrb w8, [x0, #8]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NEXT: mov v0.h[7], w8
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %s1 = zext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with zero extension
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s2 = zext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes s1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; result lanes are zext(p[1..8])
+ ret <8 x i16> %ins
+}
+
+define <8 x i32> @inserti32_first(ptr %p) { ; first-lane case on a 256-bit <8 x i32> vector (the expected output above uses two q registers)
+; CHECK-LABEL: inserti32_first:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur q1, [x0, #4]
+; CHECK-NEXT: ldur q2, [x0, #20]
+; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT: ext v1.16b, v1.16b, v2.16b, #12
+; CHECK-NEXT: ld1 { v0.s }[0], [x0]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 4 ; q = p + 4 bytes, i.e. one i32 element
+ %l1 = load <8 x i32>, ptr %q ; l1 = i32 elements 1..8 counted from p
+ %l2 = load i32, ptr %p ; l2 = i32 element 0
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; result lanes are i32 elements 0..7 counted from p
+ ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_last(ptr %p) { ; last-lane case on a 256-bit <8 x i32> vector (the expected output above uses two q registers)
+; CHECK-LABEL: inserti32_last:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q2, q0, [x0]
+; CHECK-NEXT: add x8, x0, #32
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT: ext v0.16b, v2.16b, v0.16b, #4
+; CHECK-NEXT: ld1 { v1.s }[3], [x8]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 32 ; q = p + 32 bytes, the i32 just past the vector
+ %l1 = load <8 x i32>, ptr %p ; l1 = i32 elements 0..7 counted from p
+ %l2 = load i32, ptr %q ; l2 = i32 element 8
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; result lanes are i32 elements 1..8 counted from p
+ ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_first_multiuse(ptr %p) { ; first-lane <8 x i32> case where the vector load has a second user (the add below)
+; CHECK-LABEL: inserti32_first_multiuse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur q0, [x0, #4]
+; CHECK-NEXT: ldur q1, [x0, #20]
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT: ld1 { v2.s }[0], [x0]
+; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 4 ; q = p + 4 bytes, i.e. one i32 element
+ %l1 = load <8 x i32>, ptr %q ; l1 = i32 elements 1..8; used by both the shuffle and the add
+ %l2 = load i32, ptr %p ; l2 = i32 element 0
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; ins lanes are i32 elements 0..7 counted from p
+ %a = add <8 x i32> %l1, %ins ; second use of l1, so the original vector load must stay available
+ ret <8 x i32> %a
+}
+
+define <8 x i32> @inserti32_last_multiuse(ptr %p) { ; last-lane <8 x i32> case where the vector load has a second user (the add below)
+; CHECK-LABEL: inserti32_last_multiuse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: add x8, x0, #32
+; CHECK-NEXT: ext v2.16b, v1.16b, v0.16b, #4
+; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #4
+; CHECK-NEXT: ld1 { v2.s }[3], [x8]
+; CHECK-NEXT: add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 32 ; q = p + 32 bytes, the i32 just past the vector
+ %l1 = load <8 x i32>, ptr %p ; l1 = i32 elements 0..7; used by both the shuffle and the add
+ %l2 = load i32, ptr %q ; l2 = i32 element 8
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; ins lanes are i32 elements 1..8 counted from p
+ %a = add <8 x i32> %l1, %ins ; second use of l1, so the original vector load must stay available
+ ret <8 x i32> %a
+}
+
+define <4 x float> @insertf32_first(ptr %p) { ; first-lane case with floating-point elements (<4 x float>)
+; CHECK-LABEL: insertf32_first:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur q0, [x0, #4]
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT: ld1 { v0.s }[0], [x0]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 4 ; q = p + 4 bytes, i.e. one float element
+ %l1 = load <4 x float>, ptr %q ; l1 = float elements 1..4 counted from p
+ %l2 = load float, ptr %p ; l2 = float element 0
+ %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 2> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <4 x float> %s, float %l2, i32 0 ; result lanes are float elements 0..3 counted from p
+ ret <4 x float> %ins
+}
+
+define <4 x float> @insertf32_last(ptr %p) { ; last-lane case with floating-point elements (<4 x float>)
+; CHECK-LABEL: insertf32_last:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x0, #16
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT: ld1 { v0.s }[3], [x8]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 16 ; q = p + 16 bytes, the float just past the vector
+ %l1 = load <4 x float>, ptr %p ; l1 = float elements 0..3 counted from p
+ %l2 = load float, ptr %q ; l2 = float element 4
+ %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef> ; lane i takes l1[i+1]; lane 3 is undef
+ %ins = insertelement <4 x float> %s, float %l2, i32 3 ; result lanes are float elements 1..4 counted from p
+ ret <4 x float> %ins
+}
+
+define <2 x i64> @inserti64_first(ptr %p) { ; first-lane case on a two-element <2 x i64> vector
+; CHECK-LABEL: inserti64_first:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: ld1r { v0.2d }, [x8]
+; CHECK-NEXT: ld1 { v0.d }[0], [x0]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, i.e. one i64 element
+ %l1 = load <2 x i64>, ptr %q ; l1 = i64 elements 1..2 counted from p
+ %l2 = load i64, ptr %p ; l2 = i64 element 0
+ %s = shufflevector <2 x i64> %l1, <2 x i64> undef, <2 x i32> <i32 undef, i32 0> ; lane 1 takes l1[0]; lane 0 is undef
+ %ins = insertelement <2 x i64> %s, i64 %l2, i32 0 ; result lanes are i64 elements 0..1 counted from p
+ ret <2 x i64> %ins
+}
+
+define <2 x i64> @inserti64_last(ptr %p) { ; last-lane case on a two-element <2 x i64> vector
+; CHECK-LABEL: inserti64_last:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x0, #16
+; CHECK-NEXT: dup v0.2d, v0.d[1]
+; CHECK-NEXT: ld1 { v0.d }[1], [x8]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 16 ; q = p + 16 bytes, the i64 just past the vector
+ %l1 = load <2 x i64>, ptr %p ; l1 = i64 elements 0..1 counted from p
+ %l2 = load i64, ptr %q ; l2 = i64 element 2
+ %s = shufflevector <2 x i64> %l1, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> ; lane 0 takes l1[1]; lane 1 is undef
+ %ins = insertelement <2 x i64> %s, i64 %l2, i32 1 ; result lanes are i64 elements 1..2 counted from p
+ ret <2 x i64> %ins
+}
+
+define <8 x i8> @inserti8_first_undef(ptr %p) { ; first-lane i8 case whose shuffle mask has an additional undef lane (lane 3)
+; CHECK-LABEL: inserti8_first_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 undef, i32 3, i32 4, i32 5, i32 6> ; every defined lane i takes l1[i-1]; lanes 0 and 3 are undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; lane 0 = p[0]; lane 3 stays undef
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @inserti8_last_undef(ptr %p) { ; last-lane i8 case whose shuffle mask defines only lane 0
+; CHECK-LABEL: inserti8_last_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: dup v0.8b, v0.b[1]
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; lane 0 takes l1[1]; lanes 1..7 are undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; lane 7 = p[8]; lanes 1..6 stay undef
+ ret <8 x i8> %ins
+}
+
+
+
+define <8 x i16> @wrong_zextandsext(ptr %p) { ; vector is zero-extended but the inserted scalar is sign-extended; NOTE(review): presumably a negative test for the combine in D151029 - confirm
+; CHECK-LABEL: wrong_zextandsext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %s1 = zext <8 x i8> %l1 to <8 x i16> ; vector lanes: zero extension
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s2 = sext i8 %l2 to i16 ; scalar: sign extension, which differs from the vector's zext
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; lane 0 = sext(p[0]); lanes 1..7 = zext(p[1..7])
+ ret <8 x i16> %ins
+}
+
+define <8 x i8> @wrongidx_first(ptr %p) { ; shuffle leaves lane 0 undef but the scalar goes into lane 7; NOTE(review): presumably a negative test for the combine in D151029 - confirm
+; CHECK-LABEL: wrongidx_first:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[7], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; overwrites lane 7 (which held l1[6]); lane 0 stays undef
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @wrong_last(ptr %p) { ; shuffle leaves lane 7 undef but the scalar goes into lane 0; NOTE(review): presumably a negative test for the combine in D151029 - confirm
+; CHECK-LABEL: wrong_last:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1
+; CHECK-NEXT: ld1 { v0.b }[0], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; overwrites lane 0 (which held l1[1]); lane 7 stays undef
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @wrong_shuffle(ptr %p) { ; mask is not a uniform one-lane shift (lane 3 repeats index 3); NOTE(review): presumably a negative test for the combine in D151029 - confirm
+; CHECK-LABEL: wrong_shuffle:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: adrp x8, .LCPI19_0
+; CHECK-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6> ; lane 3 takes l1[3] where a one-lane shift would take l1[2]
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; lane 0 = p[0]; lanes 3 and 4 both hold l1[3], so the result is not a contiguous run of bytes
+ ret <8 x i8> %ins
+}
+
+define <8 x i16> @wrong_exttype(ptr %p) { ; vector lanes are sign-extended i8 but the scalar is a plain i16 load; NOTE(review): presumably a negative test for the combine in D151029 - confirm
+; CHECK-LABEL: wrong_exttype:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14
+; CHECK-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %s1 = sext <8 x i8> %l1 to <8 x i16> ; vector lanes: i8 widened to i16
+ %l2 = load i16, ptr %p ; scalar reads a full i16 at p with no extension, unlike the vector lanes
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i16> %s, i16 %l2, i32 0 ; lane 0 = the 16-bit value at p
+ ret <8 x i16> %ins
+}
+
+define <4 x i32> @wrong_exttype2(ptr %p) { ; vector is sext from i8 but the scalar is sext from i16; NOTE(review): presumably a negative test for the combine in D151029 - confirm
+; CHECK-LABEL: wrong_exttype2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur s0, [x0, #1]
+; CHECK-NEXT: ldrsh w8, [x0]
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT: mov v0.s[0], w8
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <4 x i8>, ptr %q ; l1 = bytes p[1..4]
+ %s1 = sext <4 x i8> %l1 to <4 x i32> ; vector lanes: i8 widened to i32
+ %l2 = load i16, ptr %p ; scalar source is 16 bits wide, not 8 like the vector elements
+ %s2 = sext i16 %l2 to i32 ; same extension kind as the vector, different source width
+ %s = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 2> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <4 x i32> %s, i32 %s2, i32 0 ; lane 0 = sext of the 16-bit value at p
+ ret <4 x i32> %ins
+}
+
+define <8 x i8> @wrong_offsetfirst(ptr %p) { ; vector is loaded from p-1 instead of p+1, so scalar and vector are not adjacent in the needed order; NOTE(review): presumably a negative test - confirm
+; CHECK-LABEL: wrong_offsetfirst:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #-1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 -1 ; q = p - 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[-1..6]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; lanes are p[0], p[-1], p[0], p[1], ..., p[5] - not a contiguous run
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @wrong_offsetlast(ptr %p) { ; scalar is loaded from p+7 instead of p+8, overlapping the vector's last byte; NOTE(review): presumably a negative test - confirm
+; CHECK-LABEL: wrong_offsetlast:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: add x8, x0, #7
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 7 ; q = p + 7 bytes, still inside the 8-byte vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %l2 = load i8, ptr %q ; l2 = byte p[7], the same byte as l1[7]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; lanes are p[1..7] followed by p[7] again - not a contiguous run
+ ret <8 x i8> %ins
+}
+
+
+define <8 x i8> @storebetween(ptr %p, ptr %r) { ; first-lane i8 case with a byte store through %r placed between the vector load and the scalar load
+; CHECK-LABEL: storebetween:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: strb wzr, [x1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8], read before the store
+ store i8 0, ptr %r ; neither argument is marked noalias, so this store may touch the bytes being loaded
+ %l2 = load i8, ptr %p ; l2 = byte p[0], read after the store
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; lane 0 = the post-store value of p[0]
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @storebefore(ptr %p, ptr %r) { ; first-lane i8 case with a byte store through %r placed before both loads
+; CHECK-LABEL: storebefore:
+; CHECK: // %bb.0:
+; CHECK-NEXT: strb wzr, [x1]
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ store i8 0, ptr %r ; precedes both loads, so both observe the same memory state
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; result lanes are p[0..7] as seen after the store
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @storeafter(ptr %p, ptr %r) { ; first-lane i8 case with a byte store through %r placed after both loads
+; CHECK-LABEL: storeafter:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldur d0, [x0, #1]
+; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
+; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: strb wzr, [x1]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ store i8 0, ptr %r ; follows both loads, so both observe the pre-store memory state
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; result lanes are p[0..7] as seen before the store
+ ret <8 x i8> %ins
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
new file mode 100644
index 0000000000000..7714f8dd92d73
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabihf -mattr=+mve.fp | FileCheck %s --check-prefix=CHECKLE
+; RUN: llc < %s -mtriple=thumbebv8.1m.main-none-eabihf -mattr=+mve.fp | FileCheck %s --check-prefix=CHECKBE
+
+
+define <8 x i8> @inserti8_first(ptr %p) { ; 8 bytes loaded from p+1 are moved up one lane and byte p[0] is inserted into lane 0; expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti8_first:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1]
+; CHECKLE-NEXT: ldrb r1, [r0]
+; CHECKLE-NEXT: vmovx.f16 s10, s5
+; CHECKLE-NEXT: vmovx.f16 s8, s4
+; CHECKLE-NEXT: vins.f16 s10, s6
+; CHECKLE-NEXT: vmovx.f16 s6, s6
+; CHECKLE-NEXT: vmov.16 q0[0], r1
+; CHECKLE-NEXT: vins.f16 s8, s5
+; CHECKLE-NEXT: vins.f16 s6, s7
+; CHECKLE-NEXT: vmov.f32 s1, s8
+; CHECKLE-NEXT: vmov.f32 s2, s10
+; CHECKLE-NEXT: vins.f16 s0, s4
+; CHECKLE-NEXT: vmov.f32 s3, s6
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti8_first:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1]
+; CHECKBE-NEXT: ldrb r1, [r0]
+; CHECKBE-NEXT: vmovx.f16 s6, s1
+; CHECKBE-NEXT: vmovx.f16 s4, s0
+; CHECKBE-NEXT: vins.f16 s6, s2
+; CHECKBE-NEXT: vmovx.f16 s2, s2
+; CHECKBE-NEXT: vmov.16 q2[0], r1
+; CHECKBE-NEXT: vins.f16 s4, s1
+; CHECKBE-NEXT: vins.f16 s2, s3
+; CHECKBE-NEXT: vins.f16 s8, s0
+; CHECKBE-NEXT: vmov.f32 s9, s4
+; CHECKBE-NEXT: vmov.f32 s10, s6
+; CHECKBE-NEXT: vmov.f32 s11, s2
+; CHECKBE-NEXT: vrev64.16 q0, q2
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 ; result lanes are p[0..7], the bytes one <8 x i8> load from p would read
+ ret <8 x i8> %ins
+}
+
+define <8 x i8> @inserti8_last(ptr %p) { ; 8 bytes loaded from p are moved down one lane and byte p[8] is inserted into lane 7; expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti8_last:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrb.u16 q1, [r0]
+; CHECKLE-NEXT: ldrb r1, [r0, #8]
+; CHECKLE-NEXT: vmovx.f16 s0, s4
+; CHECKLE-NEXT: vmovx.f16 s1, s5
+; CHECKLE-NEXT: vmovx.f16 s2, s6
+; CHECKLE-NEXT: vins.f16 s0, s5
+; CHECKLE-NEXT: vins.f16 s1, s6
+; CHECKLE-NEXT: vins.f16 s2, s7
+; CHECKLE-NEXT: vmov.u16 r0, q1[7]
+; CHECKLE-NEXT: vmov.16 q0[6], r0
+; CHECKLE-NEXT: vmov.16 q0[7], r1
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti8_last:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrb.u16 q0, [r0]
+; CHECKBE-NEXT: ldrb r1, [r0, #8]
+; CHECKBE-NEXT: vmovx.f16 s4, s0
+; CHECKBE-NEXT: vmovx.f16 s5, s1
+; CHECKBE-NEXT: vmovx.f16 s6, s2
+; CHECKBE-NEXT: vins.f16 s4, s1
+; CHECKBE-NEXT: vins.f16 s5, s2
+; CHECKBE-NEXT: vins.f16 s6, s3
+; CHECKBE-NEXT: vmov.u16 r0, q0[7]
+; CHECKBE-NEXT: vmov.16 q1[6], r0
+; CHECKBE-NEXT: vmov.16 q1[7], r1
+; CHECKBE-NEXT: vrev64.16 q0, q1
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 ; result lanes are p[1..8], the bytes one <8 x i8> load from p+1 would read
+ ret <8 x i8> %ins
+}
+
+define <8 x i16> @inserti8_first_sext(ptr %p) { ; first-lane case where vector and scalar are both sign-extended i8 -> i16; expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti8_first_sext:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrb.s16 q1, [r0, #1]
+; CHECKLE-NEXT: ldrsb.w r1, [r0]
+; CHECKLE-NEXT: vmovx.f16 s10, s5
+; CHECKLE-NEXT: vmovx.f16 s8, s4
+; CHECKLE-NEXT: vins.f16 s10, s6
+; CHECKLE-NEXT: vmovx.f16 s6, s6
+; CHECKLE-NEXT: vmov.16 q0[0], r1
+; CHECKLE-NEXT: vins.f16 s8, s5
+; CHECKLE-NEXT: vins.f16 s6, s7
+; CHECKLE-NEXT: vmov.f32 s1, s8
+; CHECKLE-NEXT: vmov.f32 s2, s10
+; CHECKLE-NEXT: vins.f16 s0, s4
+; CHECKLE-NEXT: vmov.f32 s3, s6
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti8_first_sext:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrb.s16 q0, [r0, #1]
+; CHECKBE-NEXT: ldrsb.w r1, [r0]
+; CHECKBE-NEXT: vmovx.f16 s6, s1
+; CHECKBE-NEXT: vmovx.f16 s4, s0
+; CHECKBE-NEXT: vins.f16 s6, s2
+; CHECKBE-NEXT: vmovx.f16 s2, s2
+; CHECKBE-NEXT: vmov.16 q2[0], r1
+; CHECKBE-NEXT: vins.f16 s4, s1
+; CHECKBE-NEXT: vins.f16 s2, s3
+; CHECKBE-NEXT: vins.f16 s8, s0
+; CHECKBE-NEXT: vmov.f32 s9, s4
+; CHECKBE-NEXT: vmov.f32 s10, s6
+; CHECKBE-NEXT: vmov.f32 s11, s2
+; CHECKBE-NEXT: vrev64.16 q0, q2
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %s1 = sext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with sign extension
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s2 = sext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; result lanes are sext(p[0..7])
+ ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_sext(ptr %p) { ; last-lane case where vector and scalar are both sign-extended i8 -> i16; expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti8_last_sext:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrb.s16 q1, [r0]
+; CHECKLE-NEXT: ldrsb.w r1, [r0, #8]
+; CHECKLE-NEXT: vmovx.f16 s0, s4
+; CHECKLE-NEXT: vmovx.f16 s1, s5
+; CHECKLE-NEXT: vmovx.f16 s2, s6
+; CHECKLE-NEXT: vins.f16 s0, s5
+; CHECKLE-NEXT: vins.f16 s1, s6
+; CHECKLE-NEXT: vins.f16 s2, s7
+; CHECKLE-NEXT: vmov.u16 r0, q1[7]
+; CHECKLE-NEXT: vmov.16 q0[6], r0
+; CHECKLE-NEXT: vmov.16 q0[7], r1
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti8_last_sext:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrb.s16 q0, [r0]
+; CHECKBE-NEXT: ldrsb.w r1, [r0, #8]
+; CHECKBE-NEXT: vmovx.f16 s4, s0
+; CHECKBE-NEXT: vmovx.f16 s5, s1
+; CHECKBE-NEXT: vmovx.f16 s6, s2
+; CHECKBE-NEXT: vins.f16 s4, s1
+; CHECKBE-NEXT: vins.f16 s5, s2
+; CHECKBE-NEXT: vins.f16 s6, s3
+; CHECKBE-NEXT: vmov.u16 r0, q0[7]
+; CHECKBE-NEXT: vmov.16 q1[6], r0
+; CHECKBE-NEXT: vmov.16 q1[7], r1
+; CHECKBE-NEXT: vrev64.16 q0, q1
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %s1 = sext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with sign extension
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s2 = sext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes s1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; result lanes are sext(p[1..8])
+ ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_first_zext(ptr %p) { ; first-lane case where vector and scalar are both zero-extended i8 -> i16; expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti8_first_zext:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1]
+; CHECKLE-NEXT: ldrb r1, [r0]
+; CHECKLE-NEXT: vmovx.f16 s10, s5
+; CHECKLE-NEXT: vmovx.f16 s8, s4
+; CHECKLE-NEXT: vins.f16 s10, s6
+; CHECKLE-NEXT: vmovx.f16 s6, s6
+; CHECKLE-NEXT: vmov.16 q0[0], r1
+; CHECKLE-NEXT: vins.f16 s8, s5
+; CHECKLE-NEXT: vins.f16 s6, s7
+; CHECKLE-NEXT: vmov.f32 s1, s8
+; CHECKLE-NEXT: vmov.f32 s2, s10
+; CHECKLE-NEXT: vins.f16 s0, s4
+; CHECKLE-NEXT: vmov.f32 s3, s6
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti8_first_zext:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1]
+; CHECKBE-NEXT: ldrb r1, [r0]
+; CHECKBE-NEXT: vmovx.f16 s6, s1
+; CHECKBE-NEXT: vmovx.f16 s4, s0
+; CHECKBE-NEXT: vins.f16 s6, s2
+; CHECKBE-NEXT: vmovx.f16 s2, s2
+; CHECKBE-NEXT: vmov.16 q2[0], r1
+; CHECKBE-NEXT: vins.f16 s4, s1
+; CHECKBE-NEXT: vins.f16 s2, s3
+; CHECKBE-NEXT: vins.f16 s8, s0
+; CHECKBE-NEXT: vmov.f32 s9, s4
+; CHECKBE-NEXT: vmov.f32 s10, s6
+; CHECKBE-NEXT: vmov.f32 s11, s2
+; CHECKBE-NEXT: vrev64.16 q0, q2
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 1 ; q = p + 1 byte
+ %l1 = load <8 x i8>, ptr %q ; l1 = bytes p[1..8]
+ %s1 = zext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with zero extension
+ %l2 = load i8, ptr %p ; l2 = byte p[0]
+ %s2 = zext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes s1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 ; result lanes are zext(p[0..7])
+ ret <8 x i16> %ins
+}
+
+define <8 x i16> @inserti8_last_zext(ptr %p) { ; last-lane case where vector and scalar are both zero-extended i8 -> i16; expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti8_last_zext:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrb.u16 q1, [r0]
+; CHECKLE-NEXT: ldrb r1, [r0, #8]
+; CHECKLE-NEXT: vmovx.f16 s0, s4
+; CHECKLE-NEXT: vmovx.f16 s1, s5
+; CHECKLE-NEXT: vmovx.f16 s2, s6
+; CHECKLE-NEXT: vins.f16 s0, s5
+; CHECKLE-NEXT: vins.f16 s1, s6
+; CHECKLE-NEXT: vins.f16 s2, s7
+; CHECKLE-NEXT: vmov.u16 r0, q1[7]
+; CHECKLE-NEXT: vmov.16 q0[6], r0
+; CHECKLE-NEXT: vmov.16 q0[7], r1
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti8_last_zext:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrb.u16 q0, [r0]
+; CHECKBE-NEXT: ldrb r1, [r0, #8]
+; CHECKBE-NEXT: vmovx.f16 s4, s0
+; CHECKBE-NEXT: vmovx.f16 s5, s1
+; CHECKBE-NEXT: vmovx.f16 s6, s2
+; CHECKBE-NEXT: vins.f16 s4, s1
+; CHECKBE-NEXT: vins.f16 s5, s2
+; CHECKBE-NEXT: vins.f16 s6, s3
+; CHECKBE-NEXT: vmov.u16 r0, q0[7]
+; CHECKBE-NEXT: vmov.16 q1[6], r0
+; CHECKBE-NEXT: vmov.16 q1[7], r1
+; CHECKBE-NEXT: vrev64.16 q0, q1
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 8 ; q = p + 8 bytes, the byte just past the vector
+ %l1 = load <8 x i8>, ptr %p ; l1 = bytes p[0..7]
+ %s1 = zext <8 x i8> %l1 to <8 x i16> ; widen every vector lane with zero extension
+ %l2 = load i8, ptr %q ; l2 = byte p[8]
+ %s2 = zext i8 %l2 to i16 ; scalar uses the same extension kind as the vector
+ %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes s1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 ; result lanes are zext(p[1..8])
+ ret <8 x i16> %ins
+}
+
+define <8 x i32> @inserti32_first(ptr %p) { ; first-lane case on a 256-bit <8 x i32> vector (two q registers in the expected output); listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti32_first:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrw.u32 q1, [r0, #4]
+; CHECKLE-NEXT: vldrw.u32 q2, [r0, #20]
+; CHECKLE-NEXT: ldr r1, [r0]
+; CHECKLE-NEXT: vmov.f32 s1, s4
+; CHECKLE-NEXT: vmov.f32 s2, s5
+; CHECKLE-NEXT: vmov.f32 s3, s6
+; CHECKLE-NEXT: vmov.f32 s4, s7
+; CHECKLE-NEXT: vmov.32 q0[0], r1
+; CHECKLE-NEXT: vmov.f32 s5, s8
+; CHECKLE-NEXT: vmov.f32 s6, s9
+; CHECKLE-NEXT: vmov.f32 s7, s10
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti32_first:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrw.u32 q0, [r0, #20]
+; CHECKBE-NEXT: vldrw.u32 q2, [r0, #4]
+; CHECKBE-NEXT: ldr r1, [r0]
+; CHECKBE-NEXT: vmov.f32 s12, s11
+; CHECKBE-NEXT: vmov.f32 s13, s0
+; CHECKBE-NEXT: vmov.f32 s14, s1
+; CHECKBE-NEXT: vmov.f32 s15, s2
+; CHECKBE-NEXT: vrev64.32 q1, q3
+; CHECKBE-NEXT: vmov.f32 s13, s8
+; CHECKBE-NEXT: vmov.f32 s14, s9
+; CHECKBE-NEXT: vmov.f32 s15, s10
+; CHECKBE-NEXT: vmov.32 q3[0], r1
+; CHECKBE-NEXT: vrev64.32 q0, q3
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 4 ; q = p + 4 bytes, i.e. one i32 element
+ %l1 = load <8 x i32>, ptr %q ; l1 = i32 elements 1..8 counted from p
+ %l2 = load i32, ptr %p ; l2 = i32 element 0
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; result lanes are i32 elements 0..7 counted from p
+ ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_last(ptr %p) { ; last-lane case on a 256-bit <8 x i32> vector (two q registers in the expected output); listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti32_last:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECKLE-NEXT: vldrw.u32 q0, [r0]
+; CHECKLE-NEXT: ldr r1, [r0, #32]
+; CHECKLE-NEXT: vmov.f32 s0, s1
+; CHECKLE-NEXT: vmov.f32 s1, s2
+; CHECKLE-NEXT: vmov.f32 s2, s3
+; CHECKLE-NEXT: vmov.f32 s3, s8
+; CHECKLE-NEXT: vmov.f32 s4, s9
+; CHECKLE-NEXT: vmov.f32 s5, s10
+; CHECKLE-NEXT: vmov.f32 s6, s11
+; CHECKLE-NEXT: vmov.32 q1[3], r1
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti32_last:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrw.u32 q0, [r0]
+; CHECKBE-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECKBE-NEXT: ldr r1, [r0, #32]
+; CHECKBE-NEXT: vmov.f32 s8, s1
+; CHECKBE-NEXT: vmov.f32 s9, s2
+; CHECKBE-NEXT: vmov.f32 s10, s3
+; CHECKBE-NEXT: vmov.f32 s11, s4
+; CHECKBE-NEXT: vrev64.32 q0, q2
+; CHECKBE-NEXT: vmov.f32 s8, s5
+; CHECKBE-NEXT: vmov.f32 s9, s6
+; CHECKBE-NEXT: vmov.f32 s10, s7
+; CHECKBE-NEXT: vmov.32 q2[3], r1
+; CHECKBE-NEXT: vrev64.32 q1, q2
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 32 ; q = p + 32 bytes, the i32 just past the vector
+ %l1 = load <8 x i32>, ptr %p ; l1 = i32 elements 0..7 counted from p
+ %l2 = load i32, ptr %q ; l2 = i32 element 8
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; result lanes are i32 elements 1..8 counted from p
+ ret <8 x i32> %ins
+}
+
+define <8 x i32> @inserti32_first_multiuse(ptr %p) { ; first-lane <8 x i32> case where the vector load has a second user (the add below); listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti32_first_multiuse:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrw.u32 q1, [r0, #20]
+; CHECKLE-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECKLE-NEXT: ldr r1, [r0]
+; CHECKLE-NEXT: vmov.f32 s8, s3
+; CHECKLE-NEXT: vmov.f32 s9, s4
+; CHECKLE-NEXT: vmov.f32 s10, s5
+; CHECKLE-NEXT: vmov.f32 s11, s6
+; CHECKLE-NEXT: vadd.i32 q1, q1, q2
+; CHECKLE-NEXT: vmov.f32 s9, s0
+; CHECKLE-NEXT: vmov.f32 s10, s1
+; CHECKLE-NEXT: vmov.f32 s11, s2
+; CHECKLE-NEXT: vmov.32 q2[0], r1
+; CHECKLE-NEXT: vadd.i32 q0, q0, q2
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti32_first_multiuse:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrw.u32 q1, [r0, #20]
+; CHECKBE-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECKBE-NEXT: ldr r1, [r0]
+; CHECKBE-NEXT: vmov.f32 s8, s3
+; CHECKBE-NEXT: vmov.f32 s9, s4
+; CHECKBE-NEXT: vmov.f32 s10, s5
+; CHECKBE-NEXT: vmov.f32 s11, s6
+; CHECKBE-NEXT: vadd.i32 q2, q1, q2
+; CHECKBE-NEXT: vrev64.32 q1, q2
+; CHECKBE-NEXT: vmov.f32 s9, s0
+; CHECKBE-NEXT: vmov.f32 s10, s1
+; CHECKBE-NEXT: vmov.f32 s11, s2
+; CHECKBE-NEXT: vmov.32 q2[0], r1
+; CHECKBE-NEXT: vadd.i32 q2, q0, q2
+; CHECKBE-NEXT: vrev64.32 q0, q2
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 4 ; q = p + 4 bytes, i.e. one i32 element
+ %l1 = load <8 x i32>, ptr %q ; l1 = i32 elements 1..8; used by both the shuffle and the add
+ %l2 = load i32, ptr %p ; l2 = i32 element 0
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 ; ins lanes are i32 elements 0..7 counted from p
+ %a = add <8 x i32> %l1, %ins ; second use of l1, so the original vector load must stay available
+ ret <8 x i32> %a
+}
+
+define <8 x i32> @inserti32_last_multiuse(ptr %p) { ; last-lane <8 x i32> case where the vector load has a second user (the add below); listed for the little- and big-endian run lines
+; CHECKLE-LABEL: inserti32_last_multiuse:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrw.u32 q0, [r0]
+; CHECKLE-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECKLE-NEXT: ldr r1, [r0, #32]
+; CHECKLE-NEXT: vmov.f32 s8, s1
+; CHECKLE-NEXT: vmov.f32 s9, s2
+; CHECKLE-NEXT: vmov.f32 s10, s3
+; CHECKLE-NEXT: vmov.f32 s11, s4
+; CHECKLE-NEXT: vadd.i32 q0, q0, q2
+; CHECKLE-NEXT: vmov.f32 s8, s5
+; CHECKLE-NEXT: vmov.f32 s9, s6
+; CHECKLE-NEXT: vmov.f32 s10, s7
+; CHECKLE-NEXT: vmov.32 q2[3], r1
+; CHECKLE-NEXT: vadd.i32 q1, q1, q2
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: inserti32_last_multiuse:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrw.u32 q0, [r0]
+; CHECKBE-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECKBE-NEXT: ldr r1, [r0, #32]
+; CHECKBE-NEXT: vmov.f32 s8, s1
+; CHECKBE-NEXT: vmov.f32 s9, s2
+; CHECKBE-NEXT: vmov.f32 s10, s3
+; CHECKBE-NEXT: vmov.f32 s11, s4
+; CHECKBE-NEXT: vadd.i32 q2, q0, q2
+; CHECKBE-NEXT: vrev64.32 q0, q2
+; CHECKBE-NEXT: vmov.f32 s8, s5
+; CHECKBE-NEXT: vmov.f32 s9, s6
+; CHECKBE-NEXT: vmov.f32 s10, s7
+; CHECKBE-NEXT: vmov.32 q2[3], r1
+; CHECKBE-NEXT: vadd.i32 q2, q1, q2
+; CHECKBE-NEXT: vrev64.32 q1, q2
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 32 ; q = p + 32 bytes, the i32 just past the vector
+ %l1 = load <8 x i32>, ptr %p ; l1 = i32 elements 0..7; used by both the shuffle and the add
+ %l2 = load i32, ptr %q ; l2 = i32 element 8
+ %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef> ; lane i takes l1[i+1]; lane 7 is undef
+ %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 ; ins lanes are i32 elements 1..8 counted from p
+ %a = add <8 x i32> %l1, %ins ; second use of l1, so the original vector load must stay available
+ ret <8 x i32> %a
+}
+
+define <4 x float> @insertf32_first(ptr %p) { ; first-lane case with floating-point elements (<4 x float>); expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: insertf32_first:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrw.u32 q1, [r0, #4]
+; CHECKLE-NEXT: vldr s0, [r0]
+; CHECKLE-NEXT: vmov.f32 s1, s4
+; CHECKLE-NEXT: vmov.f32 s2, s5
+; CHECKLE-NEXT: vmov.f32 s3, s6
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: insertf32_first:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECKBE-NEXT: vldr s4, [r0]
+; CHECKBE-NEXT: vmov.f32 s5, s0
+; CHECKBE-NEXT: vmov.f32 s6, s1
+; CHECKBE-NEXT: vmov.f32 s7, s2
+; CHECKBE-NEXT: vrev64.32 q0, q1
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 4 ; q = p + 4 bytes, i.e. one float element
+ %l1 = load <4 x float>, ptr %q ; l1 = float elements 1..4 counted from p
+ %l2 = load float, ptr %p ; l2 = float element 0
+ %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 2> ; lane i takes l1[i-1]; lane 0 is undef
+ %ins = insertelement <4 x float> %s, float %l2, i32 0 ; result lanes are float elements 0..3 counted from p
+ ret <4 x float> %ins
+}
+
+define <4 x float> @insertf32_last(ptr %p) { ; last-lane case with floating-point elements (<4 x float>); expected MVE output is listed for the little- and big-endian run lines
+; CHECKLE-LABEL: insertf32_last:
+; CHECKLE: @ %bb.0:
+; CHECKLE-NEXT: vldrw.u32 q1, [r0]
+; CHECKLE-NEXT: vldr s3, [r0, #16]
+; CHECKLE-NEXT: vmov.f32 s0, s5
+; CHECKLE-NEXT: vmov.f32 s1, s6
+; CHECKLE-NEXT: vmov.f32 s2, s7
+; CHECKLE-NEXT: bx lr
+;
+; CHECKBE-LABEL: insertf32_last:
+; CHECKBE: @ %bb.0:
+; CHECKBE-NEXT: vldrw.u32 q0, [r0]
+; CHECKBE-NEXT: vldr s7, [r0, #16]
+; CHECKBE-NEXT: vmov.f32 s4, s1
+; CHECKBE-NEXT: vmov.f32 s5, s2
+; CHECKBE-NEXT: vmov.f32 s6, s3
+; CHECKBE-NEXT: vrev64.32 q0, q1
+; CHECKBE-NEXT: bx lr
+ %q = getelementptr inbounds i8, ptr %p, i32 16 ; q = p + 16 bytes, the float just past the vector
+ %l1 = load <4 x float>, ptr %p ; l1 = float elements 0..3 counted from p
+ %l2 = load float, ptr %q ; l2 = float element 4
+ %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef> ; lane i takes l1[i+1]; lane 3 is undef
+ %ins = insertelement <4 x float> %s, float %l2, i32 3 ; result lanes are float elements 1..4 counted from p
+ ret <4 x float> %ins
+}
More information about the llvm-commits
mailing list