[llvm] af42d80 - [AArch64] Unit test for zext lowering for different types of vectors
Nilanjana Basu via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 9 00:51:11 PST 2022
Author: Nilanjana Basu
Date: 2022-12-09T13:55:25+05:30
New Revision: af42d80a63e38a941bc0d9df8e4d6ae287f75566
URL: https://github.com/llvm/llvm-project/commit/af42d80a63e38a941bc0d9df8e4d6ae287f75566
DIFF: https://github.com/llvm/llvm-project/commit/af42d80a63e38a941bc0d9df8e4d6ae287f75566.diff
LOG: [AArch64] Unit test for zext lowering for different types of vectors
These are unit tests for code generated for zext lowering of vectors.
There are tests for different types of vectors, as well as cases where multiple 'zext' instructions of the same type occur back-to-back.
Testing of the GlobalISel path is added.
This will help in comparing changes in zext lowering in https://reviews.llvm.org/D136722.
Reviewed By: fhahn
Differential Revision: https://reviews.llvm.org/D137993
Added:
Modified:
llvm/test/CodeGen/AArch64/zext-to-tbl.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 232777936ae55..cfaa369d9d924 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
; CHECK-LABEL: lCPI0_0:
; CHECK-NEXT: .byte 0 ; 0x0
@@ -792,12 +794,1060 @@ exit:
ret void
}
+define void @zext_v8i8_to_v8i64_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB8_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v2, v1, #0
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: ushll2.2d v3, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q1, q2, [x1, #32]
+; CHECK-NEXT: stp q0, q3, [x1], #128
+; CHECK-NEXT: b.ne LBB8_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB8_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v2.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v2.2d, v0.2s, #0
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: st1 { v0.2d }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB8_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Zero-extends one <8 x i8> load to <8 x i64> per iteration; the arm64-apple-ios and aarch64_be assertions above both expect a three-level ushll/ushll2 chain (.8h, .4s, .2d).
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; source address: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %ext = zext <8 x i8> %load to <8 x i64> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; the same %iv indexes %dst, here in i64 elements
+ %dst.gep.cast = bitcast i64* %dst.gep to <8 x i64>*
+ store <8 x i64> %ext, <8 x i64>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i16_in_loop(i8* %src, i16* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i16_in_loop: ; @zext_v8i8_to_v8i16_in_loop
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB9_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: str q0, [x1], #32
+; CHECK-NEXT: b.ne LBB9_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i16_in_loop: // @zext_v8i8_to_v8i16_in_loop
+; CHECK-BE: .cfi_startproc
+; CHECK-BE-NEXT: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB9_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #32
+; CHECK-BE-NEXT: b.ne .LBB9_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Single-step widening, <8 x i8> to <8 x i16>: the assertions above expect exactly one ushll and one 128-bit store per iteration on both triples.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; source address: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %ext = zext <8 x i8> %load to <8 x i16> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i16, i16* %dst, i64 %iv ; the same %iv indexes %dst, here in i16 elements
+ %dst.gep.cast = bitcast i16* %dst.gep to <8 x i16>*
+ store <8 x i16> %ext, <8 x i16>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i20_in_loop(i8* %src, i20* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i20_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB10_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: mov.s w10, v1[1]
+; CHECK-NEXT: mov.s w13, v0[1]
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: mov.s w12, v1[2]
+; CHECK-NEXT: fmov w15, s0
+; CHECK-NEXT: mov.s w16, v0[2]
+; CHECK-NEXT: mov.s w9, v1[3]
+; CHECK-NEXT: mov.s w14, v0[3]
+; CHECK-NEXT: orr x10, x11, x10, lsl #20
+; CHECK-NEXT: orr x11, x15, x13, lsl #20
+; CHECK-NEXT: orr x10, x10, x12, lsl #40
+; CHECK-NEXT: orr x11, x11, x16, lsl #40
+; CHECK-NEXT: lsr x13, x9, #4
+; CHECK-NEXT: lsr x12, x14, #4
+; CHECK-NEXT: orr x9, x10, x9, lsl #60
+; CHECK-NEXT: orr x10, x11, x14, lsl #60
+; CHECK-NEXT: strh w13, [x1, #18]
+; CHECK-NEXT: strh w12, [x1, #8]
+; CHECK-NEXT: stur x9, [x1, #10]
+; CHECK-NEXT: str x10, [x1], #64
+; CHECK-NEXT: b.ne LBB10_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i20_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB10_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: mov w9, v1.s[1]
+; CHECK-BE-NEXT: mov w11, v0.s[1]
+; CHECK-BE-NEXT: mov w13, v1.s[2]
+; CHECK-BE-NEXT: fmov w14, s1
+; CHECK-BE-NEXT: mov w15, v0.s[2]
+; CHECK-BE-NEXT: fmov w16, s0
+; CHECK-BE-NEXT: mov w10, v1.s[3]
+; CHECK-BE-NEXT: lsl x9, x9, #40
+; CHECK-BE-NEXT: mov w12, v0.s[3]
+; CHECK-BE-NEXT: lsl x11, x11, #40
+; CHECK-BE-NEXT: orr x9, x9, x14, lsl #60
+; CHECK-BE-NEXT: orr x11, x11, x16, lsl #60
+; CHECK-BE-NEXT: orr x9, x9, x13, lsl #20
+; CHECK-BE-NEXT: orr x11, x11, x15, lsl #20
+; CHECK-BE-NEXT: lsr w13, w14, #4
+; CHECK-BE-NEXT: lsr w14, w16, #4
+; CHECK-BE-NEXT: strh w10, [x1, #18]
+; CHECK-BE-NEXT: extr x9, x13, x9, #16
+; CHECK-BE-NEXT: strh w12, [x1, #8]
+; CHECK-BE-NEXT: extr x10, x14, x11, #16
+; CHECK-BE-NEXT: stur x9, [x1, #10]
+; CHECK-BE-NEXT: str x10, [x1], #64
+; CHECK-BE-NEXT: b.ne .LBB10_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Non-power-of-two destination element width, <8 x i8> to <8 x i20>: the assertions above expect widening to 32-bit lanes, then scalar shift/orr packing of the 20-bit values and scalar stores.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; source address: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %ext = zext <8 x i8> %load to <8 x i20> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i20, i20* %dst, i64 %iv ; the same %iv indexes %dst, here in i20 elements
+ %dst.gep.cast = bitcast i20* %dst.gep to <8 x i20>*
+ store <8 x i20> %ext, <8 x i20>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v4i8_to_v4i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: _zext_v4i8_to_v4i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB11_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr s0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: str q0, [x1], #64
+; CHECK-NEXT: b.ne LBB11_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB11_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldr s0, [x0, x8]
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: b.ne .LBB11_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Narrow source vector, <4 x i8> to <4 x i32>: the assertions above expect a 32-bit ldr followed by two ushll steps, with an extra rev32 on aarch64_be.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; source address: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <4 x i8>*
+ %load = load <4 x i8>, <4 x i8>* %src.gep.cast
+ %ext = zext <4 x i8> %load to <4 x i32> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv ; the same %iv indexes %dst, here in i32 elements
+ %dst.gep.cast = bitcast i32* %dst.gep to <4 x i32>*
+ store <4 x i32> %ext, <4 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v12i8_to_v12i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: _zext_v12i8_to_v12i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB12_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: stp q2, q1, [x1, #16]
+; CHECK-NEXT: str q0, [x1], #64
+; CHECK-NEXT: b.ne LBB12_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB12_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #16
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #32
+; CHECK-BE-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB12_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Non-power-of-two element count, <12 x i8> to <12 x i32>: the assertions above expect one 16-byte vector load and three 4 x i32 result vectors stored per iteration.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; source address: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <12 x i8>*
+ %load = load <12 x i8>, <12 x i8>* %src.gep.cast
+ %ext = zext <12 x i8> %load to <12 x i32> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv ; the same %iv indexes %dst, here in i32 elements
+ %dst.gep.cast = bitcast i32* %dst.gep to <12 x i32>*
+ store <12 x i32> %ext, <12 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v16i4_to_v16i32_in_loop(i4* %src, i32* %dst) {
+; CHECK-LABEL: _zext_v16i4_to_v16i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: movi.4s v0, #15
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB13_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x9, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: and w10, w9, #0xf
+; CHECK-NEXT: ubfx w11, w9, #4, #4
+; CHECK-NEXT: fmov s1, w10
+; CHECK-NEXT: ubfx w10, w9, #8, #4
+; CHECK-NEXT: mov.b v1[1], w11
+; CHECK-NEXT: mov.b v1[2], w10
+; CHECK-NEXT: ubfx w10, w9, #12, #4
+; CHECK-NEXT: mov.b v1[3], w10
+; CHECK-NEXT: ubfx w10, w9, #16, #4
+; CHECK-NEXT: mov.b v1[4], w10
+; CHECK-NEXT: ubfx w10, w9, #20, #4
+; CHECK-NEXT: mov.b v1[5], w10
+; CHECK-NEXT: ubfx w10, w9, #24, #4
+; CHECK-NEXT: mov.b v1[6], w10
+; CHECK-NEXT: ubfx x10, x9, #28, #4
+; CHECK-NEXT: mov.b v1[7], w10
+; CHECK-NEXT: ubfx x10, x9, #32, #4
+; CHECK-NEXT: mov.b v1[8], w10
+; CHECK-NEXT: ubfx x10, x9, #36, #4
+; CHECK-NEXT: mov.b v1[9], w10
+; CHECK-NEXT: ubfx x10, x9, #40, #4
+; CHECK-NEXT: mov.b v1[10], w10
+; CHECK-NEXT: ubfx x10, x9, #44, #4
+; CHECK-NEXT: mov.b v1[11], w10
+; CHECK-NEXT: ubfx x10, x9, #48, #4
+; CHECK-NEXT: mov.b v1[12], w10
+; CHECK-NEXT: ubfx x10, x9, #52, #4
+; CHECK-NEXT: mov.b v1[13], w10
+; CHECK-NEXT: ubfx x10, x9, #56, #4
+; CHECK-NEXT: lsr x9, x9, #60
+; CHECK-NEXT: mov.b v1[14], w10
+; CHECK-NEXT: mov.b v1[15], w9
+; CHECK-NEXT: ext.16b v2, v1, v1, #8
+; CHECK-NEXT: zip2.8b v3, v1, v0
+; CHECK-NEXT: zip1.8b v1, v1, v0
+; CHECK-NEXT: zip1.8b v4, v2, v0
+; CHECK-NEXT: zip2.8b v2, v2, v0
+; CHECK-NEXT: ushll.4s v3, v3, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: and.16b v3, v3, v0
+; CHECK-NEXT: and.16b v1, v1, v0
+; CHECK-NEXT: stp q1, q3, [x1]
+; CHECK-NEXT: ushll.4s v1, v2, #0
+; CHECK-NEXT: ushll.4s v2, v4, #0
+; CHECK-NEXT: and.16b v1, v1, v0
+; CHECK-NEXT: and.16b v2, v2, v0
+; CHECK-NEXT: stp q2, q1, [x1, #32]
+; CHECK-NEXT: add x1, x1, #64
+; CHECK-NEXT: b.ne LBB13_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i4_to_v16i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: movi v0.4s, #15
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB13_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldr x9, [x0, x8]
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: lsr x10, x9, #60
+; CHECK-BE-NEXT: ubfx x11, x9, #56, #4
+; CHECK-BE-NEXT: fmov s1, w10
+; CHECK-BE-NEXT: ubfx x10, x9, #52, #4
+; CHECK-BE-NEXT: mov v1.b[1], w11
+; CHECK-BE-NEXT: add x11, x1, #32
+; CHECK-BE-NEXT: mov v1.b[2], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #48, #4
+; CHECK-BE-NEXT: mov v1.b[3], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #44, #4
+; CHECK-BE-NEXT: mov v1.b[4], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #40, #4
+; CHECK-BE-NEXT: mov v1.b[5], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #36, #4
+; CHECK-BE-NEXT: mov v1.b[6], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #32, #4
+; CHECK-BE-NEXT: mov v1.b[7], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #28, #4
+; CHECK-BE-NEXT: mov v1.b[8], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #24, #4
+; CHECK-BE-NEXT: mov v1.b[9], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #20, #4
+; CHECK-BE-NEXT: mov v1.b[10], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #16, #4
+; CHECK-BE-NEXT: mov v1.b[11], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #12, #4
+; CHECK-BE-NEXT: mov v1.b[12], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #8, #4
+; CHECK-BE-NEXT: mov v1.b[13], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #4, #4
+; CHECK-BE-NEXT: and w9, w9, #0xf
+; CHECK-BE-NEXT: mov v1.b[14], w10
+; CHECK-BE-NEXT: add x10, x1, #48
+; CHECK-BE-NEXT: mov v1.b[15], w9
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT: zip2 v3.8b, v1.8b, v0.8b
+; CHECK-BE-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-BE-NEXT: zip1 v4.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT: zip2 v2.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
+; CHECK-BE-NEXT: rev16 v3.8b, v3.8b
+; CHECK-BE-NEXT: rev16 v4.8b, v4.8b
+; CHECK-BE-NEXT: rev16 v2.8b, v2.8b
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-BE-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT: st1 { v1.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: ushll v1.4s, v2.4h, #0
+; CHECK-BE-NEXT: ushll v2.4s, v4.4h, #0
+; CHECK-BE-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-BE-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT: st1 { v3.4s }, [x9]
+; CHECK-BE-NEXT: and v2.16b, v2.16b, v0.16b
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v2.4s }, [x11]
+; CHECK-BE-NEXT: b.ne .LBB13_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Sub-byte source elements, <16 x i4> to <16 x i32>: the assertions above expect a 64-bit scalar load whose 4-bit fields are extracted one at a time (ubfx) and inserted into byte lanes before zip/ushll/and widening.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i4, i4* %src, i64 %iv ; source address: %src indexed by %iv in i4 elements
+ %src.gep.cast = bitcast i4* %src.gep to <16 x i4>*
+ %load = load <16 x i4>, <16 x i4>* %src.gep.cast
+ %ext = zext <16 x i4> %load to <16 x i32> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv ; the same %iv indexes %dst, here in i32 elements
+ %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
+ store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v16i16_to_v16i64_in_loop(i16* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v16i16_to_v16i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB14_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8
+; CHECK-NEXT: add x8, x8, #32
+; CHECK-NEXT: cmp x8, #256
+; CHECK-NEXT: ldp q0, q1, [x9]
+; CHECK-NEXT: ushll.4s v2, v0, #0
+; CHECK-NEXT: ushll2.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v3, v1, #0
+; CHECK-NEXT: ushll2.4s v1, v1, #0
+; CHECK-NEXT: ushll2.2d v5, v0, #0
+; CHECK-NEXT: ushll2.2d v4, v1, #0
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q1, q4, [x1, #96]
+; CHECK-NEXT: ushll2.2d v1, v3, #0
+; CHECK-NEXT: stp q0, q5, [x1, #32]
+; CHECK-NEXT: ushll.2d v3, v3, #0
+; CHECK-NEXT: ushll2.2d v0, v2, #0
+; CHECK-NEXT: stp q3, q1, [x1, #64]
+; CHECK-NEXT: ushll.2d v1, v2, #0
+; CHECK-NEXT: stp q1, q0, [x1], #128
+; CHECK-NEXT: b.ne LBB14_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i16_to_v16i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB14_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #48
+; CHECK-BE-NEXT: add x8, x8, #32
+; CHECK-BE-NEXT: cmp x8, #256
+; CHECK-BE-NEXT: ld1 { v0.8h }, [x9]
+; CHECK-BE-NEXT: add x9, x9, #16
+; CHECK-BE-NEXT: ld1 { v2.8h }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #32
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #112
+; CHECK-BE-NEXT: st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v2.8h, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #96
+; CHECK-BE-NEXT: ushll2 v4.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BE-NEXT: add x10, x1, #80
+; CHECK-BE-NEXT: st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT: st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #64
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: ushll v3.2d, v2.2s, #0
+; CHECK-BE-NEXT: ushll2 v2.2d, v2.4s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x10]
+; CHECK-BE-NEXT: b.ne .LBB14_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; 16-bit source elements, <16 x i16> to <16 x i64>: the assertions above expect two widening levels (.4s then .2d) and eight 128-bit result vectors stored per iteration.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i16, i16* %src, i64 %iv ; source address: %src indexed by %iv in i16 elements
+ %src.gep.cast = bitcast i16* %src.gep to <16 x i16>*
+ %load = load <16 x i16>, <16 x i16>* %src.gep.cast
+ %ext = zext <16 x i16> %load to <16 x i64> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; the same %iv indexes %dst, here in i64 elements
+ %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+ store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v16i32_to_v16i64_in_loop(i32* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v16i32_to_v16i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB15_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8
+; CHECK-NEXT: add x8, x8, #64
+; CHECK-NEXT: cmp x8, #512
+; CHECK-NEXT: ldp q1, q0, [x9, #32]
+; CHECK-NEXT: ushll2.2d v5, v1, #0
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: ldp q3, q2, [x9]
+; CHECK-NEXT: ushll2.2d v4, v0, #0
+; CHECK-NEXT: stp q1, q5, [x1, #64]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q0, q4, [x1, #96]
+; CHECK-NEXT: ushll2.2d v1, v3, #0
+; CHECK-NEXT: ushll2.2d v0, v2, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
+; CHECK-NEXT: stp q2, q0, [x1, #32]
+; CHECK-NEXT: ushll.2d v0, v3, #0
+; CHECK-NEXT: stp q0, q1, [x1], #128
+; CHECK-NEXT: b.ne LBB15_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i32_to_v16i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB15_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #64
+; CHECK-BE-NEXT: add x10, x9, #48
+; CHECK-BE-NEXT: add x11, x9, #32
+; CHECK-BE-NEXT: cmp x8, #512
+; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x9, #16
+; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #16
+; CHECK-BE-NEXT: ld1 { v2.4s }, [x11]
+; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT: ld1 { v4.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #112
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #80
+; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll2 v5.2d, v2.4s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: st1 { v5.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #96
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ushll v3.2d, v4.2s, #0
+; CHECK-BE-NEXT: ushll2 v4.2d, v4.4s, #0
+; CHECK-BE-NEXT: st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v4.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #64
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB15_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; 32-bit source elements, <16 x i32> to <16 x i64>: the assertions above expect a single widening level, one ushll/ushll2 pair per loaded 128-bit vector.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i32, i32* %src, i64 %iv ; source address: %src indexed by %iv in i32 elements
+ %src.gep.cast = bitcast i32* %src.gep to <16 x i32>*
+ %load = load <16 x i32>, <16 x i32>* %src.gep.cast
+ %ext = zext <16 x i32> %load to <16 x i64> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; the same %iv indexes %dst, here in i64 elements
+ %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+ store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i128_in_loop(i8* %src, i128* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i128_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB16_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x9, x1, #112
+; CHECK-NEXT: add x10, x1, #80
+; CHECK-NEXT: str xzr, [x1, #120]
+; CHECK-NEXT: str xzr, [x1, #104]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: str xzr, [x1, #88]
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: str xzr, [x1, #72]
+; CHECK-NEXT: str xzr, [x1, #56]
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: str xzr, [x1, #40]
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: str xzr, [x1, #24]
+; CHECK-NEXT: ushll2.2d v2, v1, #0
+; CHECK-NEXT: str xzr, [x1, #8]
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: st1.d { v2 }[1], [x9]
+; CHECK-NEXT: add x9, x1, #48
+; CHECK-NEXT: str d2, [x1, #96]
+; CHECK-NEXT: ushll2.2d v2, v0, #0
+; CHECK-NEXT: st1.d { v1 }[1], [x10]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: str d1, [x1, #64]
+; CHECK-NEXT: str d2, [x1, #32]
+; CHECK-NEXT: add x10, x1, #16
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: add x1, x1, #256
+; CHECK-NEXT: st1.d { v2 }[1], [x9]
+; CHECK-NEXT: st1.d { v0 }[1], [x10]
+; CHECK-NEXT: b.ne LBB16_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i128_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB16_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #88
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #120
+; CHECK-BE-NEXT: str xzr, [x1, #112]
+; CHECK-BE-NEXT: str xzr, [x1, #96]
+; CHECK-BE-NEXT: str xzr, [x1, #80]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: str xzr, [x1, #64]
+; CHECK-BE-NEXT: str xzr, [x1, #48]
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: str xzr, [x1, #32]
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: str xzr, [x1, #16]
+; CHECK-BE-NEXT: ushll2 v2.2d, v1.4s, #0
+; CHECK-BE-NEXT: str xzr, [x1]
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v2.d }[1], [x9]
+; CHECK-BE-NEXT: add x9, x1, #56
+; CHECK-BE-NEXT: str d2, [x1, #104]
+; CHECK-BE-NEXT: ushll2 v2.2d, v0.4s, #0
+; CHECK-BE-NEXT: st1 { v1.d }[1], [x10]
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: str d1, [x1, #72]
+; CHECK-BE-NEXT: str d2, [x1, #40]
+; CHECK-BE-NEXT: add x10, x1, #24
+; CHECK-BE-NEXT: str d0, [x1, #8]
+; CHECK-BE-NEXT: add x1, x1, #256
+; CHECK-BE-NEXT: st1 { v2.d }[1], [x9]
+; CHECK-BE-NEXT: st1 { v0.d }[1], [x10]
+; CHECK-BE-NEXT: b.ne .LBB16_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+; Destination elements wider than a vector lane, <8 x i8> to <8 x i128>: the assertions above expect widening to 64-bit lanes, each stored as one 8-byte half of an i128 slot while xzr is stored to the other half.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; source address: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %ext = zext <8 x i8> %load to <8 x i128> ; the extension whose lowering is under test
+ %dst.gep = getelementptr i128, i128* %dst, i64 %iv ; the same %iv indexes %dst, here in i128 elements
+ %dst.gep.cast = bitcast i128* %dst.gep to <8 x i128>*
+ store <8 x i128> %ext, <8 x i128>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add x9, x0, #8
+; CHECK-NEXT: LBB17_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x10, x1, x8
+; CHECK-NEXT: add x8, x8, #128
+; CHECK-NEXT: ldp d0, d1, [x9, #-8]
+; CHECK-NEXT: add x9, x9, #16
+; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: ldp q3, q2, [x10, #32]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll.8h v1, v1, #0
+; CHECK-NEXT: ushll2.4s v6, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ldp q5, q4, [x10]
+; CHECK-NEXT: uaddw2.2d v2, v2, v6
+; CHECK-NEXT: uaddw.2d v3, v3, v6
+; CHECK-NEXT: ushll2.4s v7, v1, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: stp q3, q2, [x10, #32]
+; CHECK-NEXT: ldp q17, q16, [x10, #96]
+; CHECK-NEXT: uaddw2.2d v4, v4, v0
+; CHECK-NEXT: uaddw.2d v0, v5, v0
+; CHECK-NEXT: uaddw.2d v3, v17, v7
+; CHECK-NEXT: stp q0, q4, [x10]
+; CHECK-NEXT: ldp q6, q18, [x10, #64]
+; CHECK-NEXT: uaddw2.2d v2, v16, v7
+; CHECK-NEXT: stp q3, q2, [x10, #96]
+; CHECK-NEXT: uaddw2.2d v0, v18, v1
+; CHECK-NEXT: uaddw.2d v1, v6, v1
+; CHECK-NEXT: stp q1, q0, [x10, #64]
+; CHECK-NEXT: b.ne LBB17_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: add x9, x0, #8
+; CHECK-BE-NEXT: .LBB17_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: sub x12, x9, #8
+; CHECK-BE-NEXT: add x10, x1, x8
+; CHECK-BE-NEXT: add x11, x10, #48
+; CHECK-BE-NEXT: add x13, x10, #32
+; CHECK-BE-NEXT: add x14, x10, #16
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: ld1 { v2.8b }, [x12]
+; CHECK-BE-NEXT: add x12, x10, #112
+; CHECK-BE-NEXT: ld1 { v1.2d }, [x11]
+; CHECK-BE-NEXT: add x15, x10, #96
+; CHECK-BE-NEXT: add x16, x10, #64
+; CHECK-BE-NEXT: add x17, x10, #80
+; CHECK-BE-NEXT: ld1 { v3.2d }, [x13]
+; CHECK-BE-NEXT: add x8, x8, #128
+; CHECK-BE-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-BE-NEXT: add x9, x9, #16
+; CHECK-BE-NEXT: ld1 { v5.2d }, [x10]
+; CHECK-BE-NEXT: cmp x8, #1024
+; CHECK-BE-NEXT: ushll2 v7.4s, v2.8h, #0
+; CHECK-BE-NEXT: ld1 { v6.2d }, [x14]
+; CHECK-BE-NEXT: uaddw2 v1.2d, v1.2d, v7.4s
+; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BE-NEXT: ld1 { v16.2d }, [x12]
+; CHECK-BE-NEXT: uaddw v3.2d, v3.2d, v7.2s
+; CHECK-BE-NEXT: ld1 { v17.2d }, [x15]
+; CHECK-BE-NEXT: st1 { v1.2d }, [x11]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ld1 { v7.2d }, [x17]
+; CHECK-BE-NEXT: st1 { v3.2d }, [x13]
+; CHECK-BE-NEXT: uaddw v5.2d, v5.2d, v2.2s
+; CHECK-BE-NEXT: ld1 { v1.2d }, [x16]
+; CHECK-BE-NEXT: uaddw2 v2.2d, v6.2d, v2.4s
+; CHECK-BE-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-BE-NEXT: st1 { v5.2d }, [x10]
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: uaddw2 v6.2d, v16.2d, v4.4s
+; CHECK-BE-NEXT: st1 { v2.2d }, [x14]
+; CHECK-BE-NEXT: uaddw v3.2d, v17.2d, v4.2s
+; CHECK-BE-NEXT: uaddw2 v2.2d, v7.2d, v0.4s
+; CHECK-BE-NEXT: uaddw v0.2d, v1.2d, v0.2s
+; CHECK-BE-NEXT: st1 { v6.2d }, [x12]
+; CHECK-BE-NEXT: st1 { v3.2d }, [x15]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x17]
+; CHECK-BE-NEXT: st1 { v0.2d }, [x16]
+; CHECK-BE-NEXT: b.ne .LBB17_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+; Two back-to-back <8 x i8> to <8 x i64> extensions, each added to a vector loaded from %dst: the assertions above expect ushll only up to .4s, with the last widening step folded into uaddw/uaddw2.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; loop counter, starts at 0
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; first source vector: %src + %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %src.gep.2 = getelementptr i8, i8* %src.gep, i64 8 ; second source vector: the next 8 bytes
+ %src.gep.cast.2 = bitcast i8* %src.gep.2 to <8 x i8>*
+ %load.2 = load <8 x i8>, <8 x i8>* %src.gep.cast.2
+ %ext = zext <8 x i8> %load to <8 x i64> ; first extension under test
+ %ext.2 = zext <8 x i8> %load.2 to <8 x i64> ; second extension of the same type, immediately after the first
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; first destination vector: %dst indexed by %iv in i64 elements
+ %dst.gep.cast = bitcast i64* %dst.gep to <8 x i64>*
+ %load.dst = load <8 x i64>, <8 x i64>* %dst.gep.cast
+ %dst.gep.2 = getelementptr i64, i64* %dst.gep, i64 8 ; second destination vector: 8 i64 elements further on
+ %dst.gep.cast.2 = bitcast i64* %dst.gep.2 to <8 x i64>*
+ %load.dst.2 = load <8 x i64>, <8 x i64>* %dst.gep.cast.2
+ %sum = add <8 x i64> %load.dst, %ext ; accumulate the first extended vector into the first destination vector
+ %sum.2 = add <8 x i64> %load.dst.2, %ext.2 ; accumulate the second extended vector into the second destination vector
+ %dst.gep.cast.3 = bitcast i64* %dst.gep to <8 x i64>*
+ store <8 x i64> %sum, <8 x i64>* %dst.gep.cast.3
+ %dst.gep.cast.4 = bitcast i64* %dst.gep.2 to <8 x i64>*
+ store <8 x i64> %sum.2, <8 x i64>* %dst.gep.cast.4
+ %iv.next = add nuw i64 %iv, 16 ; counter advances by 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; leave the loop once the counter reaches 128 (8 iterations)
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v16i8_to_v16i64_in_sequence_in_loop(i8* %src, i64* %dst) { ; two back-to-back zexts of <16 x i8> loads to <16 x i64>, each stored to %dst, inside a loop
+; CHECK-LABEL: _zext_v16i8_to_v16i64_in_sequence_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add x9, x1, #128
+; CHECK-NEXT: LBB18_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x10, x0, x8
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ldp q0, q1, [x10]
+; CHECK-NEXT: ushll.8h v2, v0, #0
+; CHECK-NEXT: ushll2.8h v0, v0, #0
+; CHECK-NEXT: ushll.4s v4, v2, #0
+; CHECK-NEXT: ushll2.4s v5, v0, #0
+; CHECK-NEXT: ushll2.4s v2, v2, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v6, v5, #0
+; CHECK-NEXT: ushll.2d v5, v5, #0
+; CHECK-NEXT: ushll2.8h v3, v1, #0
+; CHECK-NEXT: ushll2.2d v7, v0, #0
+; CHECK-NEXT: stp q5, q6, [x9, #-32]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ushll2.2d v5, v2, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
+; CHECK-NEXT: stp q0, q7, [x9, #-64]
+; CHECK-NEXT: ushll2.2d v0, v4, #0
+; CHECK-NEXT: stp q2, q5, [x9, #-96]
+; CHECK-NEXT: ushll2.4s v5, v3, #0
+; CHECK-NEXT: ushll.2d v2, v4, #0
+; CHECK-NEXT: ushll2.2d v4, v5, #0
+; CHECK-NEXT: stp q2, q0, [x9, #-128]
+; CHECK-NEXT: ushll.2d v0, v5, #0
+; CHECK-NEXT: ushll.4s v2, v3, #0
+; CHECK-NEXT: stp q0, q4, [x9, #96]
+; CHECK-NEXT: ushll.8h v0, v1, #0
+; CHECK-NEXT: ushll2.2d v1, v2, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
+; CHECK-NEXT: ushll2.4s v3, v0, #0
+; CHECK-NEXT: stp q2, q1, [x9, #64]
+; CHECK-NEXT: ushll2.2d v1, v3, #0
+; CHECK-NEXT: ushll.2d v2, v3, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: stp q2, q1, [x9, #32]
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q0, q1, [x9], #128
+; CHECK-NEXT: b.ne LBB18_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: add x9, x1, #128
+; CHECK-BE-NEXT: .LBB18_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x10, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: add x11, x10, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x10]
+; CHECK-BE-NEXT: sub x10, x9, #16
+; CHECK-BE-NEXT: ld1 { v3.16b }, [x11]
+; CHECK-BE-NEXT: sub x11, x9, #32
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll2 v4.2d, v2.4s, #0
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: sub x10, x9, #48
+; CHECK-BE-NEXT: ushll2 v4.2d, v1.4s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x11]
+; CHECK-BE-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-BE-NEXT: sub x11, x9, #80
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: sub x10, x9, #64
+; CHECK-BE-NEXT: ushll2 v5.2d, v2.4s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT: sub x11, x9, #96
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: ushll v4.8h, v3.8b, #0
+; CHECK-BE-NEXT: sub x10, x9, #112
+; CHECK-BE-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-BE-NEXT: ushll v5.2d, v0.2s, #0
+; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: ushll v1.4s, v4.4h, #0
+; CHECK-BE-NEXT: st1 { v0.2d }, [x10]
+; CHECK-BE-NEXT: ushll2 v6.4s, v3.8h, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x11]
+; CHECK-BE-NEXT: sub x11, x9, #128
+; CHECK-BE-NEXT: add x10, x9, #112
+; CHECK-BE-NEXT: ushll v2.2d, v1.2s, #0
+; CHECK-BE-NEXT: ushll2 v0.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll2 v1.2d, v6.4s, #0
+; CHECK-BE-NEXT: st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-BE-NEXT: add x11, x9, #96
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x9, #80
+; CHECK-BE-NEXT: ushll v5.2d, v6.2s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT: ushll2 v1.4s, v4.8h, #0
+; CHECK-BE-NEXT: ushll2 v4.2d, v3.4s, #0
+; CHECK-BE-NEXT: st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #48
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x9, #64
+; CHECK-BE-NEXT: ushll2 v5.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v3.2d, v3.2s, #0
+; CHECK-BE-NEXT: st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #16
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x9, #32
+; CHECK-BE-NEXT: add x9, x9, #128
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v0.2d }, [x11]
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: b.ne .LBB18_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0, 16, ..., 112
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; %src advanced by %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast ; first 16 source bytes
+ %src.gep.2 = getelementptr i8, i8* %src.gep, i64 16 ; the 16 bytes immediately after %load
+ %src.gep.cast.2 = bitcast i8* %src.gep.2 to <16 x i8>*
+ %load.2 = load <16 x i8>, <16 x i8>* %src.gep.cast.2 ; second 16 source bytes
+ %ext = zext <16 x i8> %load to <16 x i64> ; first of two adjacent zexts of the same type
+ %ext.2 = zext <16 x i8> %load.2 to <16 x i64> ; second zext, directly after the first
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; %iv is reused as the i64 element index into %dst
+ %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+ store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+ %dst.gep.2 = getelementptr i64, i64* %dst.gep, i64 16 ; 16 i64 elements past %dst.gep
+ %dst.gep.cast.2 = bitcast i64* %dst.gep.2 to <16 x i64>*
+ store <16 x i64> %ext.2, <16 x i64>* %dst.gep.cast.2
+ %iv.next = add nuw i64 %iv, 16 ; step of 16 per iteration
+ %ec = icmp eq i64 %iv.next, 128 ; exit once %iv.next reaches 128, i.e. after 8 iterations
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: LBB8_1: ; %loop
+; CHECK-NEXT: LBB19_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x8]
@@ -815,7 +1865,7 @@ define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst)
; CHECK-NEXT: st1w { z1.s }, p0, [x9, #2, mul vl]
; CHECK-NEXT: st1w { z2.s }, p0, [x9, #3, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [x9, #1, mul vl]
-; CHECK-NEXT: b.ne LBB8_1
+; CHECK-NEXT: b.ne LBB19_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
;
@@ -823,7 +1873,7 @@ define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst)
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: mov x8, xzr
; CHECK-BE-NEXT: ptrue p0.s
-; CHECK-BE-NEXT: .LBB8_1: // %loop
+; CHECK-BE-NEXT: .LBB19_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: add x9, x0, x8
; CHECK-BE-NEXT: ld1b { z0.s }, p0/z, [x0, x8]
@@ -841,7 +1891,7 @@ define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst)
; CHECK-BE-NEXT: st1w { z1.s }, p0, [x9, #2, mul vl]
; CHECK-BE-NEXT: st1w { z2.s }, p0, [x9, #3, mul vl]
; CHECK-BE-NEXT: st1w { z0.s }, p0, [x9, #1, mul vl]
-; CHECK-BE-NEXT: b.ne .LBB8_1
+; CHECK-BE-NEXT: b.ne .LBB19_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
entry:
@@ -864,3 +1914,519 @@ loop:
exit:
ret void
}
+
+define void @zext_v20i8_to_v20i24_in_loop(i8* %src, i24* %dst) { ; zext of a 20-element vector from i8 to i24, loaded from %src and stored to %dst inside a loop
+; CHECK-LABEL: _zext_v20i8_to_v20i24_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB20_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ldp q0, q3, [x9]
+; CHECK-NEXT: strb wzr, [x1, #59]
+; CHECK-NEXT: strb wzr, [x1, #56]
+; CHECK-NEXT: strb wzr, [x1, #53]
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: strb wzr, [x1, #50]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: strb wzr, [x1, #47]
+; CHECK-NEXT: ushll2.4s v2, v1, #0
+; CHECK-NEXT: strb wzr, [x1, #44]
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: strb wzr, [x1, #41]
+; CHECK-NEXT: mov.s w9, v2[2]
+; CHECK-NEXT: strb wzr, [x1, #38]
+; CHECK-NEXT: fmov w10, s2
+; CHECK-NEXT: fmov w12, s1
+; CHECK-NEXT: mov.s w11, v2[3]
+; CHECK-NEXT: mov.s w13, v2[1]
+; CHECK-NEXT: ushll.8h v2, v3, #0
+; CHECK-NEXT: strb wzr, [x1, #35]
+; CHECK-NEXT: strh w9, [x1, #42]
+; CHECK-NEXT: mov.s w9, v1[1]
+; CHECK-NEXT: strh w10, [x1, #36]
+; CHECK-NEXT: mov.s w10, v1[3]
+; CHECK-NEXT: strh w12, [x1, #24]
+; CHECK-NEXT: mov.s w12, v1[2]
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: sturh w11, [x1, #45]
+; CHECK-NEXT: sturh w9, [x1, #27]
+; CHECK-NEXT: mov.s w9, v1[1]
+; CHECK-NEXT: mov.s w11, v1[2]
+; CHECK-NEXT: ushll.4s v2, v2, #0
+; CHECK-NEXT: sturh w10, [x1, #33]
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: sturh w13, [x1, #39]
+; CHECK-NEXT: mov.s w10, v1[3]
+; CHECK-NEXT: strh w12, [x1, #30]
+; CHECK-NEXT: sturh w9, [x1, #15]
+; CHECK-NEXT: mov.s w9, v2[2]
+; CHECK-NEXT: strh w11, [x1, #18]
+; CHECK-NEXT: mov.s w11, v2[3]
+; CHECK-NEXT: strb wzr, [x1, #32]
+; CHECK-NEXT: sturh w10, [x1, #21]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: strh w9, [x1, #54]
+; CHECK-NEXT: mov.s w9, v2[1]
+; CHECK-NEXT: sturh w11, [x1, #57]
+; CHECK-NEXT: strh w10, [x1, #12]
+; CHECK-NEXT: strb wzr, [x1, #29]
+; CHECK-NEXT: sturh w9, [x1, #51]
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: strb wzr, [x1, #26]
+; CHECK-NEXT: strb wzr, [x1, #23]
+; CHECK-NEXT: strh w9, [x1, #48]
+; CHECK-NEXT: mov.s w9, v0[3]
+; CHECK-NEXT: strb wzr, [x1, #20]
+; CHECK-NEXT: strb wzr, [x1, #17]
+; CHECK-NEXT: strb wzr, [x1, #14]
+; CHECK-NEXT: sturh w9, [x1, #9]
+; CHECK-NEXT: mov.s w9, v0[2]
+; CHECK-NEXT: strb wzr, [x1, #11]
+; CHECK-NEXT: strb wzr, [x1, #8]
+; CHECK-NEXT: strb wzr, [x1, #5]
+; CHECK-NEXT: strh w9, [x1, #6]
+; CHECK-NEXT: mov.s w9, v0[1]
+; CHECK-NEXT: strb wzr, [x1, #2]
+; CHECK-NEXT: sturh w9, [x1, #3]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: strh w9, [x1], #64
+; CHECK-NEXT: b.ne LBB20_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB20_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: add x10, x9, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x10]
+; CHECK-BE-NEXT: sturh wzr, [x1, #57]
+; CHECK-BE-NEXT: strh wzr, [x1, #54]
+; CHECK-BE-NEXT: sturh wzr, [x1, #51]
+; CHECK-BE-NEXT: ushll2 v2.8h, v1.16b, #0
+; CHECK-BE-NEXT: strh wzr, [x1, #48]
+; CHECK-BE-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-BE-NEXT: sturh wzr, [x1, #45]
+; CHECK-BE-NEXT: ushll2 v3.4s, v2.8h, #0
+; CHECK-BE-NEXT: strh wzr, [x1, #42]
+; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BE-NEXT: sturh wzr, [x1, #39]
+; CHECK-BE-NEXT: fmov w9, s3
+; CHECK-BE-NEXT: strh wzr, [x1, #36]
+; CHECK-BE-NEXT: mov w10, v3.s[3]
+; CHECK-BE-NEXT: fmov w11, s2
+; CHECK-BE-NEXT: mov w14, v2.s[3]
+; CHECK-BE-NEXT: mov w12, v3.s[2]
+; CHECK-BE-NEXT: strb w9, [x1, #38]
+; CHECK-BE-NEXT: mov w9, v2.s[2]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: strb w11, [x1, #26]
+; CHECK-BE-NEXT: strb w10, [x1, #47]
+; CHECK-BE-NEXT: mov w10, v2.s[1]
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: mov w13, v3.s[1]
+; CHECK-BE-NEXT: strb w9, [x1, #32]
+; CHECK-BE-NEXT: mov w9, v2.s[1]
+; CHECK-BE-NEXT: mov w11, v2.s[3]
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: strb w12, [x1, #44]
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: strb w10, [x1, #29]
+; CHECK-BE-NEXT: mov w12, v2.s[2]
+; CHECK-BE-NEXT: strb w13, [x1, #41]
+; CHECK-BE-NEXT: strb w9, [x1, #17]
+; CHECK-BE-NEXT: mov w9, v1.s[2]
+; CHECK-BE-NEXT: strb w11, [x1, #23]
+; CHECK-BE-NEXT: mov w11, v1.s[3]
+; CHECK-BE-NEXT: fmov w10, s2
+; CHECK-BE-NEXT: strb w14, [x1, #35]
+; CHECK-BE-NEXT: strb w12, [x1, #20]
+; CHECK-BE-NEXT: strb w9, [x1, #8]
+; CHECK-BE-NEXT: mov w9, v1.s[1]
+; CHECK-BE-NEXT: strb w10, [x1, #14]
+; CHECK-BE-NEXT: strb w11, [x1, #11]
+; CHECK-BE-NEXT: sturh wzr, [x1, #33]
+; CHECK-BE-NEXT: strb w9, [x1, #5]
+; CHECK-BE-NEXT: fmov w9, s1
+; CHECK-BE-NEXT: strh wzr, [x1, #30]
+; CHECK-BE-NEXT: sturh wzr, [x1, #27]
+; CHECK-BE-NEXT: strb w9, [x1, #2]
+; CHECK-BE-NEXT: mov w9, v0.s[3]
+; CHECK-BE-NEXT: strh wzr, [x1, #24]
+; CHECK-BE-NEXT: sturh wzr, [x1, #21]
+; CHECK-BE-NEXT: strh wzr, [x1, #18]
+; CHECK-BE-NEXT: strb w9, [x1, #59]
+; CHECK-BE-NEXT: mov w9, v0.s[2]
+; CHECK-BE-NEXT: sturh wzr, [x1, #15]
+; CHECK-BE-NEXT: strh wzr, [x1, #12]
+; CHECK-BE-NEXT: sturh wzr, [x1, #9]
+; CHECK-BE-NEXT: strb w9, [x1, #56]
+; CHECK-BE-NEXT: mov w9, v0.s[1]
+; CHECK-BE-NEXT: strh wzr, [x1, #6]
+; CHECK-BE-NEXT: sturh wzr, [x1, #3]
+; CHECK-BE-NEXT: strh wzr, [x1]
+; CHECK-BE-NEXT: strb w9, [x1, #53]
+; CHECK-BE-NEXT: fmov w9, s0
+; CHECK-BE-NEXT: strb w9, [x1, #50]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: b.ne .LBB20_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0, 16, ..., 112
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; %src advanced by %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <20 x i8>*
+ %load = load <20 x i8>, <20 x i8>* %src.gep.cast ; 20 source bytes
+ %ext = zext <20 x i8> %load to <20 x i24> ; the zext under test: 20 lanes, i8 widened to i24
+ %dst.gep = getelementptr i24, i24* %dst, i64 %iv ; %iv is reused as the i24 element index into %dst
+ %dst.gep.cast = bitcast i24* %dst.gep to <20 x i24>*
+ store <20 x i24> %ext, <20 x i24>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; step is 16 although the vector has 20 elements
+ %ec = icmp eq i64 %iv.next, 128 ; exit once %iv.next reaches 128, i.e. after 8 iterations
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v23i8_to_v23i48_in_loop(i8* %src, i48* %dst) { ; zext of a 23-element vector from i8 to i48, loaded from %src and stored to %dst inside a loop
+; CHECK-LABEL: _zext_v23i8_to_v23i48_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB21_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ldp q1, q2, [x9]
+; CHECK-NEXT: strh wzr, [x1, #136]
+; CHECK-NEXT: strh wzr, [x1, #130]
+; CHECK-NEXT: strh wzr, [x1, #124]
+; CHECK-NEXT: ushll2.8h v3, v1, #0
+; CHECK-NEXT: strh wzr, [x1, #118]
+; CHECK-NEXT: ushll.8h v0, v1, #0
+; CHECK-NEXT: strh wzr, [x1, #112]
+; CHECK-NEXT: ushll.8h v2, v2, #0
+; CHECK-NEXT: strh wzr, [x1, #106]
+; CHECK-NEXT: ushll.4s v1, v3, #0
+; CHECK-NEXT: strh wzr, [x1, #100]
+; CHECK-NEXT: ushll2.4s v3, v3, #0
+; CHECK-NEXT: strh wzr, [x1, #94]
+; CHECK-NEXT: ushll2.4s v4, v2, #0
+; CHECK-NEXT: strh wzr, [x1, #88]
+; CHECK-NEXT: ushll.2d v5, v3, #0
+; CHECK-NEXT: strh wzr, [x1, #82]
+; CHECK-NEXT: ushll2.2d v3, v3, #0
+; CHECK-NEXT: strh wzr, [x1, #76]
+; CHECK-NEXT: ushll2.2d v6, v4, #0
+; CHECK-NEXT: strh wzr, [x1, #70]
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: strh wzr, [x1, #64]
+; CHECK-NEXT: mov.d x9, v3[1]
+; CHECK-NEXT: fmov x11, d6
+; CHECK-NEXT: ushll2.4s v6, v0, #0
+; CHECK-NEXT: strh wzr, [x1, #58]
+; CHECK-NEXT: fmov x12, d5
+; CHECK-NEXT: str w10, [x1, #84]
+; CHECK-NEXT: mov.d x10, v5[1]
+; CHECK-NEXT: str w11, [x1, #132]
+; CHECK-NEXT: ushll2.2d v5, v6, #0
+; CHECK-NEXT: stur w9, [x1, #90]
+; CHECK-NEXT: ushll2.2d v3, v1, #0
+; CHECK-NEXT: str w12, [x1, #72]
+; CHECK-NEXT: fmov x9, d5
+; CHECK-NEXT: strh wzr, [x1, #52]
+; CHECK-NEXT: fmov x13, d3
+; CHECK-NEXT: stur w10, [x1, #78]
+; CHECK-NEXT: mov.d x11, v3[1]
+; CHECK-NEXT: strh wzr, [x1, #46]
+; CHECK-NEXT: ushll.2d v3, v4, #0
+; CHECK-NEXT: str w9, [x1, #36]
+; CHECK-NEXT: ushll.4s v2, v2, #0
+; CHECK-NEXT: str w13, [x1, #60]
+; CHECK-NEXT: fmov x9, d3
+; CHECK-NEXT: strh wzr, [x1, #40]
+; CHECK-NEXT: ushll2.2d v4, v2, #0
+; CHECK-NEXT: stur w11, [x1, #66]
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: strh wzr, [x1, #34]
+; CHECK-NEXT: str w9, [x1, #120]
+; CHECK-NEXT: fmov x9, d4
+; CHECK-NEXT: mov.d x11, v1[1]
+; CHECK-NEXT: strh wzr, [x1, #28]
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: strh wzr, [x1, #22]
+; CHECK-NEXT: str w9, [x1, #108]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: ushll.2d v1, v6, #0
+; CHECK-NEXT: strh wzr, [x1, #16]
+; CHECK-NEXT: mov.d x10, v3[1]
+; CHECK-NEXT: stur w11, [x1, #54]
+; CHECK-NEXT: str w9, [x1, #48]
+; CHECK-NEXT: mov.d x9, v1[1]
+; CHECK-NEXT: strh wzr, [x1, #10]
+; CHECK-NEXT: mov.d x12, v5[1]
+; CHECK-NEXT: strh wzr, [x1, #4]
+; CHECK-NEXT: stur w10, [x1, #126]
+; CHECK-NEXT: mov.d x10, v4[1]
+; CHECK-NEXT: stur w9, [x1, #30]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: stur w12, [x1, #42]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: str w9, [x1, #24]
+; CHECK-NEXT: mov.d x9, v1[1]
+; CHECK-NEXT: stur w10, [x1, #114]
+; CHECK-NEXT: stur w9, [x1, #18]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: ushll.2d v1, v2, #0
+; CHECK-NEXT: str w9, [x1, #12]
+; CHECK-NEXT: mov.d x9, v1[1]
+; CHECK-NEXT: stur w9, [x1, #102]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: str w9, [x1, #96]
+; CHECK-NEXT: mov.d x9, v0[1]
+; CHECK-NEXT: stur w9, [x1, #6]
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: str w9, [x1], #128
+; CHECK-NEXT: b.ne LBB21_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB21_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: add x10, x9, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v2.16b }, [x9]
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x10]
+; CHECK-BE-NEXT: str wzr, [x1, #132]
+; CHECK-BE-NEXT: stur wzr, [x1, #126]
+; CHECK-BE-NEXT: str wzr, [x1, #120]
+; CHECK-BE-NEXT: ushll2 v3.8h, v2.16b, #0
+; CHECK-BE-NEXT: stur wzr, [x1, #114]
+; CHECK-BE-NEXT: ushll v0.8h, v2.8b, #0
+; CHECK-BE-NEXT: str wzr, [x1, #108]
+; CHECK-BE-NEXT: ushll v4.4s, v3.4h, #0
+; CHECK-BE-NEXT: stur wzr, [x1, #102]
+; CHECK-BE-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-BE-NEXT: str wzr, [x1, #96]
+; CHECK-BE-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-BE-NEXT: stur wzr, [x1, #90]
+; CHECK-BE-NEXT: ushll2 v7.2d, v3.4s, #0
+; CHECK-BE-NEXT: str wzr, [x1, #84]
+; CHECK-BE-NEXT: ushll v3.2d, v3.2s, #0
+; CHECK-BE-NEXT: stur wzr, [x1, #78]
+; CHECK-BE-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-BE-NEXT: str wzr, [x1, #72]
+; CHECK-BE-NEXT: fmov x12, d3
+; CHECK-BE-NEXT: stur wzr, [x1, #66]
+; CHECK-BE-NEXT: ushll2 v5.2d, v2.4s, #0
+; CHECK-BE-NEXT: fmov x10, d7
+; CHECK-BE-NEXT: mov x9, v7.d[1]
+; CHECK-BE-NEXT: str wzr, [x1, #60]
+; CHECK-BE-NEXT: mov x11, v3.d[1]
+; CHECK-BE-NEXT: strh w12, [x1, #76]
+; CHECK-BE-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-BE-NEXT: strh w10, [x1, #88]
+; CHECK-BE-NEXT: ushll2 v6.2d, v4.4s, #0
+; CHECK-BE-NEXT: stur wzr, [x1, #54]
+; CHECK-BE-NEXT: fmov x14, d5
+; CHECK-BE-NEXT: strh w9, [x1, #94]
+; CHECK-BE-NEXT: mov x12, v5.d[1]
+; CHECK-BE-NEXT: strh w11, [x1, #82]
+; CHECK-BE-NEXT: ushll2 v5.2d, v3.4s, #0
+; CHECK-BE-NEXT: str wzr, [x1, #48]
+; CHECK-BE-NEXT: mov x10, v6.d[1]
+; CHECK-BE-NEXT: strh w14, [x1, #40]
+; CHECK-BE-NEXT: fmov x9, d5
+; CHECK-BE-NEXT: stur wzr, [x1, #42]
+; CHECK-BE-NEXT: ushll v4.2d, v4.2s, #0
+; CHECK-BE-NEXT: strh w12, [x1, #46]
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: str wzr, [x1, #36]
+; CHECK-BE-NEXT: strh w10, [x1, #70]
+; CHECK-BE-NEXT: fmov x13, d6
+; CHECK-BE-NEXT: mov x10, v4.d[1]
+; CHECK-BE-NEXT: strh w9, [x1, #136]
+; CHECK-BE-NEXT: fmov x9, d4
+; CHECK-BE-NEXT: stur wzr, [x1, #30]
+; CHECK-BE-NEXT: ushll2 v5.2d, v0.4s, #0
+; CHECK-BE-NEXT: strh w13, [x1, #64]
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: str wzr, [x1, #24]
+; CHECK-BE-NEXT: strh w9, [x1, #52]
+; CHECK-BE-NEXT: fmov x9, d5
+; CHECK-BE-NEXT: mov x11, v2.d[1]
+; CHECK-BE-NEXT: strh w10, [x1, #58]
+; CHECK-BE-NEXT: fmov x10, d2
+; CHECK-BE-NEXT: stur wzr, [x1, #18]
+; CHECK-BE-NEXT: ushll v2.2d, v3.2s, #0
+; CHECK-BE-NEXT: strh w9, [x1, #16]
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: str wzr, [x1, #12]
+; CHECK-BE-NEXT: mov x9, v2.d[1]
+; CHECK-BE-NEXT: strh w11, [x1, #34]
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: strh w10, [x1, #28]
+; CHECK-BE-NEXT: mov x12, v5.d[1]
+; CHECK-BE-NEXT: stur wzr, [x1, #6]
+; CHECK-BE-NEXT: str wzr, [x1]
+; CHECK-BE-NEXT: strh w9, [x1, #130]
+; CHECK-BE-NEXT: fmov x9, d2
+; CHECK-BE-NEXT: ushll2 v2.2d, v1.4s, #0
+; CHECK-BE-NEXT: strh w12, [x1, #22]
+; CHECK-BE-NEXT: strh w9, [x1, #124]
+; CHECK-BE-NEXT: mov x9, v2.d[1]
+; CHECK-BE-NEXT: strh w9, [x1, #118]
+; CHECK-BE-NEXT: fmov x9, d2
+; CHECK-BE-NEXT: strh w9, [x1, #112]
+; CHECK-BE-NEXT: mov x9, v0.d[1]
+; CHECK-BE-NEXT: strh w9, [x1, #10]
+; CHECK-BE-NEXT: fmov x9, d0
+; CHECK-BE-NEXT: ushll v0.2d, v1.2s, #0
+; CHECK-BE-NEXT: strh w9, [x1, #4]
+; CHECK-BE-NEXT: mov x9, v0.d[1]
+; CHECK-BE-NEXT: strh w9, [x1, #106]
+; CHECK-BE-NEXT: fmov x9, d0
+; CHECK-BE-NEXT: strh w9, [x1, #100]
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: b.ne .LBB21_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0, 16, ..., 112
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; %src advanced by %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <23 x i8>*
+ %load = load <23 x i8>, <23 x i8>* %src.gep.cast ; 23 source bytes
+ %ext = zext <23 x i8> %load to <23 x i48> ; the zext under test: 23 lanes, i8 widened to i48
+ %dst.gep = getelementptr i48, i48* %dst, i64 %iv ; %iv is reused as the i48 element index into %dst
+ %dst.gep.cast = bitcast i48* %dst.gep to <23 x i48>*
+ store <23 x i48> %ext, <23 x i48>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; step is 16 although the vector has 23 elements
+ %ec = icmp eq i64 %iv.next, 128 ; exit once %iv.next reaches 128, i.e. after 8 iterations
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i33_in_loop(i8* %src, i33* %dst) { ; zext of an 8-element vector from i8 to i33, loaded from %src and stored to %dst inside a loop
+; CHECK-LABEL: _zext_v8i8_to_v8i33_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB22_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: strb wzr, [x1, #32]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll.4s v1, v0, #0
+; CHECK-NEXT: ushll2.4s v0, v0, #0
+; CHECK-NEXT: ushll.2d v2, v1, #0
+; CHECK-NEXT: ushll2.2d v3, v0, #0
+; CHECK-NEXT: ushll2.2d v1, v1, #0
+; CHECK-NEXT: mov.d x9, v3[1]
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: mov.d x12, v1[1]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: lsl x9, x9, #39
+; CHECK-NEXT: orr x9, x9, x10, lsl #6
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: mov.d x11, v0[1]
+; CHECK-NEXT: lsl x12, x12, #35
+; CHECK-NEXT: mov.d x14, v2[1]
+; CHECK-NEXT: fmov x13, d0
+; CHECK-NEXT: orr x10, x12, x10, lsl #2
+; CHECK-NEXT: fmov x12, d2
+; CHECK-NEXT: lsl x11, x11, #37
+; CHECK-NEXT: orr x11, x11, x13, lsl #4
+; CHECK-NEXT: orr x12, x12, x14, lsl #33
+; CHECK-NEXT: stp x11, x9, [x1, #16]
+; CHECK-NEXT: stp x12, x10, [x1], #128
+; CHECK-NEXT: b.ne LBB22_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i33_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB22_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v2.2d, v1.2s, #0
+; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-BE-NEXT: mov x9, v3.d[1]
+; CHECK-BE-NEXT: fmov x10, d3
+; CHECK-BE-NEXT: mov x11, v0.d[1]
+; CHECK-BE-NEXT: fmov x12, d0
+; CHECK-BE-NEXT: mov x13, v1.d[1]
+; CHECK-BE-NEXT: mov x14, v2.d[1]
+; CHECK-BE-NEXT: orr x10, x9, x10, lsl #33
+; CHECK-BE-NEXT: fmov x15, d1
+; CHECK-BE-NEXT: strb w9, [x1, #32]
+; CHECK-BE-NEXT: fmov x16, d2
+; CHECK-BE-NEXT: lsl x11, x11, #2
+; CHECK-BE-NEXT: lsl x13, x13, #4
+; CHECK-BE-NEXT: orr x12, x11, x12, lsl #35
+; CHECK-BE-NEXT: lsl x14, x14, #6
+; CHECK-BE-NEXT: orr x15, x13, x15, lsl #37
+; CHECK-BE-NEXT: extr x10, x11, x10, #8
+; CHECK-BE-NEXT: orr x11, x14, x16, lsl #39
+; CHECK-BE-NEXT: extr x12, x13, x12, #8
+; CHECK-BE-NEXT: extr x9, x14, x15, #8
+; CHECK-BE-NEXT: extr x11, xzr, x11, #8
+; CHECK-BE-NEXT: stp x12, x10, [x1, #16]
+; CHECK-BE-NEXT: stp x11, x9, [x1], #128
+; CHECK-BE-NEXT: b.ne .LBB22_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0, 16, ..., 112
+ %src.gep = getelementptr i8, i8* %src, i64 %iv ; %src advanced by %iv bytes
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast ; 8 source bytes
+ %ext = zext <8 x i8> %load to <8 x i33> ; the zext under test: 8 lanes, i8 widened to i33
+ %dst.gep = getelementptr i33, i33* %dst, i64 %iv ; %iv is reused as the i33 element index into %dst
+ %dst.gep.cast = bitcast i33* %dst.gep to <8 x i33>*
+ store <8 x i33> %ext, <8 x i33>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16 ; step is 16 although the vector has 8 elements
+ %ec = icmp eq i64 %iv.next, 128 ; exit once %iv.next reaches 128, i.e. after 8 iterations
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
More information about the llvm-commits
mailing list