[llvm] ebe7587 - [AArch64] Add some tests for bitcast vector loads and scalarizing loaded vectors. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 18 23:49:27 PDT 2025
Author: David Green
Date: 2025-09-19T07:49:22+01:00
New Revision: ebe7587256597af0a19707e65e801f9d53307e09
URL: https://github.com/llvm/llvm-project/commit/ebe7587256597af0a19707e65e801f9d53307e09
DIFF: https://github.com/llvm/llvm-project/commit/ebe7587256597af0a19707e65e801f9d53307e09.diff
LOG: [AArch64] Add some tests for bitcast vector loads and scalarizing loaded vectors. NFC
Added:
llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
Modified:
llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
llvm/test/CodeGen/AArch64/bitcast-extend.ll
Removed:
################################################################################
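
For context, the new tests exercise two patterns, sketched here in
minimal form. The function names in this sketch are illustrative and do
not appear in the commit; the actual tests in the diff below enumerate
many more element types and widths:

; A narrow scalar load widened and bitcast to a vector; the CHECK lines
; record the current lowering (a scalar load plus an fmov into d0).
define <8 x i8> @bitcast_load_sketch(ptr %p) {
  %l = load i8, ptr %p
  %z = zext i8 %l to i64
  %b = bitcast i64 %z to <8 x i8>
  ret <8 x i8> %b
}

; A vector load whose lanes are all extracted and consumed as scalars,
; i.e. a candidate for scalarizing the load itself.
define i32 @scalarized_load_sketch(ptr %p) {
  %v = load <4 x i32>, ptr %p, align 4
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %a0 = add i32 %e0, %e1
  %a1 = add i32 %e2, %e3
  %r = add i32 %a0, %a1
  ret i32 %r
}

Since the change is NFC, the autogenerated CHECK lines simply record
today's codegen for these patterns, so any later change to their
lowering will show up as an assertion diff.
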
diff --git a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
index a13b82bb903bb..59f887a1143c0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
@@ -1,8 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
define i32 @foo(ptr %__a) nounwind {
; CHECK-LABEL: foo:
-; CHECK: umov.h w{{[0-9]+}}, v{{[0-9]+}}[0]
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: umov.h w8, v0[0]
+; CHECK-NEXT: umov.h w9, v0[0]
+; CHECK-NEXT: add w0, w9, w8, uxth #1
+; CHECK-NEXT: ret
%tmp18 = load <4 x i16>, ptr %__a, align 8
%vget_lane = extractelement <4 x i16> %tmp18, i32 0
%conv = zext i16 %vget_lane to i32
diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index f62303ecea663..741dcf3ad4c2f 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for load_zext_i8_v4bf16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for load_zext_i16_v4bf16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for load_zext_i32_v4bf16
define <4 x i16> @z_i32_v4i16(i32 %x) {
; CHECK-SD-LABEL: z_i32_v4i16:
@@ -284,6 +288,324 @@ define void @extractbitcastext_s(i32 %bytes, ptr %output) {
ret void
}
+define <8 x i8> @load_zext_i8_v8i8(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <8 x i8>
+ ret <8 x i8> %b
+}
+
+define <8 x i8> @load_zext_i16_v8i8(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <8 x i8>
+ ret <8 x i8> %b
+}
+
+define <8 x i8> @load_zext_i32_v8i8(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <8 x i8>
+ ret <8 x i8> %b
+}
+
+define <8 x i8> @load_sext_i32_v8i8(ptr %p) {
+; CHECK-LABEL: load_sext_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsw x8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = sext i32 %l to i64
+ %b = bitcast i64 %z to <8 x i8>
+ ret <8 x i8> %b
+}
+
+define <16 x i8> @load_zext_v16i8(ptr %p) {
+; CHECK-SD-LABEL: load_zext_v16i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldr w8, [x0]
+; CHECK-SD-NEXT: mov v0.d[0], x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_zext_v16i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i128
+ %b = bitcast i128 %z to <16 x i8>
+ ret <16 x i8> %b
+}
+
+
+define <4 x i16> @load_zext_i8_v4i16(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <4 x i16>
+ ret <4 x i16> %b
+}
+
+define <4 x i16> @load_zext_i16_v4i16(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <4 x i16>
+ ret <4 x i16> %b
+}
+
+define <4 x i16> @load_zext_i32_v4i16(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <4 x i16>
+ ret <4 x i16> %b
+}
+
+define <2 x i32> @load_zext_i8_v2i32(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <2 x i32>
+ ret <2 x i32> %b
+}
+
+define <2 x i32> @load_zext_i16_v2i32(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <2 x i32>
+ ret <2 x i32> %b
+}
+
+define <2 x i32> @load_zext_i32_v2i32(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <2 x i32>
+ ret <2 x i32> %b
+}
+
+define <1 x i64> @load_zext_i8_v1i64(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <1 x i64>
+ ret <1 x i64> %b
+}
+
+define <1 x i64> @load_zext_i16_v1i64(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <1 x i64>
+ ret <1 x i64> %b
+}
+
+define <1 x i64> @load_zext_i32_v1i64(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <1 x i64>
+ ret <1 x i64> %b
+}
+
+
+define <4 x half> @load_zext_i8_v4f16(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <4 x half>
+ ret <4 x half> %b
+}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
+define <4 x half> @load_zext_i16_v4f16(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <4 x half>
+ ret <4 x half> %b
+}
+
+define <4 x half> @load_zext_i32_v4f16(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <4 x half>
+ ret <4 x half> %b
+}
+
+define <4 x bfloat> @load_zext_i8_v4bf16(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <4 x bfloat>
+ ret <4 x bfloat> %b
+}
+
+define <4 x bfloat> @load_zext_i16_v4bf16(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <4 x bfloat>
+ ret <4 x bfloat> %b
+}
+
+define <4 x bfloat> @load_zext_i32_v4bf16(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <4 x bfloat>
+ ret <4 x bfloat> %b
+}
+
+define <2 x float> @load_zext_i8_v2f32(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <2 x float>
+ ret <2 x float> %b
+}
+
+define <2 x float> @load_zext_i16_v2f32(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <2 x float>
+ ret <2 x float> %b
+}
+
+define <2 x float> @load_zext_i32_v2f32(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <2 x float>
+ ret <2 x float> %b
+}
+
+define <1 x double> @load_zext_i8_v1f64(ptr %p) {
+; CHECK-LABEL: load_zext_i8_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i8, ptr %p
+ %z = zext i8 %l to i64
+ %b = bitcast i64 %z to <1 x double>
+ ret <1 x double> %b
+}
+
+define <1 x double> @load_zext_i16_v1f64(ptr %p) {
+; CHECK-LABEL: load_zext_i16_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i16, ptr %p
+ %z = zext i16 %l to i64
+ %b = bitcast i64 %z to <1 x double>
+ ret <1 x double> %b
+}
+
+define <1 x double> @load_zext_i32_v1f64(ptr %p) {
+; CHECK-LABEL: load_zext_i32_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+ %l = load i32, ptr %p
+ %z = zext i32 %l to i64
+ %b = bitcast i64 %z to <1 x double>
+ ret <1 x double> %b
+}
diff --git a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
new file mode 100644
index 0000000000000..eb3a0391eb79e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
@@ -0,0 +1,723 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i8 @scalarize_v16i8(ptr %p) {
+; CHECK-LABEL: scalarize_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: umov w9, v0.b[1]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
+; CHECK-NEXT: umov w14, v0.b[6]
+; CHECK-NEXT: umov w15, v0.b[7]
+; CHECK-NEXT: umov w16, v0.b[8]
+; CHECK-NEXT: umov w17, v0.b[9]
+; CHECK-NEXT: umov w18, v0.b[10]
+; CHECK-NEXT: umov w0, v0.b[11]
+; CHECK-NEXT: umov w1, v0.b[12]
+; CHECK-NEXT: umov w2, v0.b[13]
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: umov w3, v0.b[14]
+; CHECK-NEXT: umov w4, v0.b[15]
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w10, w12, w13
+; CHECK-NEXT: add w11, w14, w15
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w12, w16, w17
+; CHECK-NEXT: add w13, w18, w0
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w14, w1, w2
+; CHECK-NEXT: add w10, w12, w13
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w15, w3, w4
+; CHECK-NEXT: add w11, w14, w15
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %wide.load = load <16 x i8>, ptr %p, align 4
+ %l0 = extractelement <16 x i8> %wide.load, i32 0
+ %l1 = extractelement <16 x i8> %wide.load, i32 1
+ %l2 = extractelement <16 x i8> %wide.load, i32 2
+ %l3 = extractelement <16 x i8> %wide.load, i32 3
+ %l4 = extractelement <16 x i8> %wide.load, i32 4
+ %l5 = extractelement <16 x i8> %wide.load, i32 5
+ %l6 = extractelement <16 x i8> %wide.load, i32 6
+ %l7 = extractelement <16 x i8> %wide.load, i32 7
+ %l8 = extractelement <16 x i8> %wide.load, i32 8
+ %l9 = extractelement <16 x i8> %wide.load, i32 9
+ %l10 = extractelement <16 x i8> %wide.load, i32 10
+ %l11 = extractelement <16 x i8> %wide.load, i32 11
+ %l12 = extractelement <16 x i8> %wide.load, i32 12
+ %l13 = extractelement <16 x i8> %wide.load, i32 13
+ %l14 = extractelement <16 x i8> %wide.load, i32 14
+ %l15 = extractelement <16 x i8> %wide.load, i32 15
+ %a0 = add i8 %l0, %l1
+ %a1 = add i8 %l2, %l3
+ %a2 = add i8 %l4, %l5
+ %a3 = add i8 %l6, %l7
+ %a4 = add i8 %l8, %l9
+ %a5 = add i8 %l10, %l11
+ %a6 = add i8 %l12, %l13
+ %a7 = add i8 %l14, %l15
+ %b0 = add i8 %a0, %a1
+ %b1 = add i8 %a2, %a3
+ %b2 = add i8 %a4, %a5
+ %b3 = add i8 %a6, %a7
+ %c0 = add i8 %b0, %b1
+ %c1 = add i8 %b2, %b3
+ %r = add i8 %c0, %c1
+ ret i8 %r
+}
+
+define i8 @scalarize_v8i8(ptr %p) {
+; CHECK-LABEL: scalarize_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: umov w9, v0.b[1]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
+; CHECK-NEXT: umov w14, v0.b[6]
+; CHECK-NEXT: umov w15, v0.b[7]
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w10, w12, w13
+; CHECK-NEXT: add w11, w14, w15
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %wide.load = load <8 x i8>, ptr %p, align 4
+ %l0 = extractelement <8 x i8> %wide.load, i32 0
+ %l1 = extractelement <8 x i8> %wide.load, i32 1
+ %l2 = extractelement <8 x i8> %wide.load, i32 2
+ %l3 = extractelement <8 x i8> %wide.load, i32 3
+ %l4 = extractelement <8 x i8> %wide.load, i32 4
+ %l5 = extractelement <8 x i8> %wide.load, i32 5
+ %l6 = extractelement <8 x i8> %wide.load, i32 6
+ %l7 = extractelement <8 x i8> %wide.load, i32 7
+ %a0 = add i8 %l0, %l1
+ %a1 = add i8 %l2, %l3
+ %a2 = add i8 %l4, %l5
+ %a3 = add i8 %l6, %l7
+ %b0 = add i8 %a0, %a1
+ %b1 = add i8 %a2, %a3
+ %r = add i8 %b0, %b1
+ ret i8 %r
+}
+
+define i16 @scalarize_v8i16(ptr %p) {
+; CHECK-LABEL: scalarize_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: umov w9, v0.h[1]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: umov w12, v0.h[4]
+; CHECK-NEXT: umov w13, v0.h[5]
+; CHECK-NEXT: umov w14, v0.h[6]
+; CHECK-NEXT: umov w15, v0.h[7]
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w10, w12, w13
+; CHECK-NEXT: add w11, w14, w15
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %wide.load = load <8 x i16>, ptr %p, align 4
+ %l0 = extractelement <8 x i16> %wide.load, i32 0
+ %l1 = extractelement <8 x i16> %wide.load, i32 1
+ %l2 = extractelement <8 x i16> %wide.load, i32 2
+ %l3 = extractelement <8 x i16> %wide.load, i32 3
+ %l4 = extractelement <8 x i16> %wide.load, i32 4
+ %l5 = extractelement <8 x i16> %wide.load, i32 5
+ %l6 = extractelement <8 x i16> %wide.load, i32 6
+ %l7 = extractelement <8 x i16> %wide.load, i32 7
+ %a0 = add i16 %l0, %l1
+ %a1 = add i16 %l2, %l3
+ %a2 = add i16 %l4, %l5
+ %a3 = add i16 %l6, %l7
+ %b0 = add i16 %a0, %a1
+ %b1 = add i16 %a2, %a3
+ %r = add i16 %b0, %b1
+ ret i16 %r
+}
+
+define i16 @scalarize_v4i16(ptr %p) {
+; CHECK-LABEL: scalarize_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: umov w9, v0.h[1]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w9, w10, w11
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %wide.load = load <4 x i16>, ptr %p, align 4
+ %l0 = extractelement <4 x i16> %wide.load, i32 0
+ %l1 = extractelement <4 x i16> %wide.load, i32 1
+ %l2 = extractelement <4 x i16> %wide.load, i32 2
+ %l3 = extractelement <4 x i16> %wide.load, i32 3
+ %a0 = add i16 %l0, %l1
+ %a1 = add i16 %l2, %l3
+ %r = add i16 %a0, %a1
+ ret i16 %r
+}
+
+define i32 @scalarize_v4i32(ptr %p) {
+; CHECK-LABEL: scalarize_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov w9, v0.s[2]
+; CHECK-NEXT: mov w10, v0.s[3]
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: add w8, w11, w8
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %wide.load = load <4 x i32>, ptr %p, align 4
+ %l0 = extractelement <4 x i32> %wide.load, i32 0
+ %l1 = extractelement <4 x i32> %wide.load, i32 1
+ %l2 = extractelement <4 x i32> %wide.load, i32 2
+ %l3 = extractelement <4 x i32> %wide.load, i32 3
+ %a0 = add i32 %l0, %l1
+ %a1 = add i32 %l2, %l3
+ %r = add i32 %a0, %a1
+ ret i32 %r
+}
+
+define i64 @scalarize_v4i64(ptr %p) {
+; CHECK-LABEL: scalarize_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: addp d1, v1.2d
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %wide.load = load <4 x i64>, ptr %p, align 4
+ %l0 = extractelement <4 x i64> %wide.load, i32 0
+ %l1 = extractelement <4 x i64> %wide.load, i32 1
+ %l2 = extractelement <4 x i64> %wide.load, i32 2
+ %l3 = extractelement <4 x i64> %wide.load, i32 3
+ %a0 = add i64 %l0, %l1
+ %a1 = add i64 %l2, %l3
+ %r = add i64 %a0, %a1
+ ret i64 %r
+}
+
+define i64 @scalarize_v4i32_sext(ptr %p) {
+; CHECK-LABEL: scalarize_v4i32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0
+; CHECK-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: addp d1, v1.2d
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %wide.load = load <4 x i32>, ptr %p, align 4
+ %ext = sext <4 x i32> %wide.load to <4 x i64>
+ %l0 = extractelement <4 x i64> %ext, i32 0
+ %l1 = extractelement <4 x i64> %ext, i32 1
+ %l2 = extractelement <4 x i64> %ext, i32 2
+ %l3 = extractelement <4 x i64> %ext, i32 3
+ %a0 = add i64 %l0, %l1
+ %a1 = add i64 %l2, %l3
+ %r = add i64 %a0, %a1
+ ret i64 %r
+}
+
+define i64 @scalarize_v4i32_zext(ptr %p) {
+; CHECK-LABEL: scalarize_v4i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: addp d1, v1.2d
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %wide.load = load <4 x i32>, ptr %p, align 4
+ %ext = zext <4 x i32> %wide.load to <4 x i64>
+ %l0 = extractelement <4 x i64> %ext, i32 0
+ %l1 = extractelement <4 x i64> %ext, i32 1
+ %l2 = extractelement <4 x i64> %ext, i32 2
+ %l3 = extractelement <4 x i64> %ext, i32 3
+ %a0 = add i64 %l0, %l1
+ %a1 = add i64 %l2, %l3
+ %r = add i64 %a0, %a1
+ ret i64 %r
+}
+
+define half @scalarize_v4f16(ptr %p) {
+; CHECK-LABEL: scalarize_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: mov h1, v0.h[1]
+; CHECK-NEXT: mov h2, v0.h[2]
+; CHECK-NEXT: mov h3, v0.h[3]
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s3, h3
+; CHECK-NEXT: fcvt s2, h2
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+ %wide.load = load <4 x half>, ptr %p, align 4
+ %l0 = extractelement <4 x half> %wide.load, i32 0
+ %l1 = extractelement <4 x half> %wide.load, i32 1
+ %l2 = extractelement <4 x half> %wide.load, i32 2
+ %l3 = extractelement <4 x half> %wide.load, i32 3
+ %a0 = fadd half %l0, %l1
+ %a1 = fadd half %l2, %l3
+ %r = fadd half %a0, %a1
+ ret half %r
+}
+
+define float @scalarize_v4f32(ptr %p) {
+; CHECK-LABEL: scalarize_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: mov s1, v0.s[2]
+; CHECK-NEXT: mov s2, v0.s[3]
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: fadd s1, s1, s2
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %wide.load = load <4 x float>, ptr %p, align 4
+ %l0 = extractelement <4 x float> %wide.load, i32 0
+ %l1 = extractelement <4 x float> %wide.load, i32 1
+ %l2 = extractelement <4 x float> %wide.load, i32 2
+ %l3 = extractelement <4 x float> %wide.load, i32 3
+ %a0 = fadd float %l0, %l1
+ %a1 = fadd float %l2, %l3
+ %r = fadd float %a0, %a1
+ ret float %r
+}
+
+define double @scalarize_v4f64(ptr %p) {
+; CHECK-LABEL: scalarize_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: faddp d1, v1.2d
+; CHECK-NEXT: faddp d0, v0.2d
+; CHECK-NEXT: fadd d0, d1, d0
+; CHECK-NEXT: ret
+ %wide.load = load <4 x double>, ptr %p, align 4
+ %l0 = extractelement <4 x double> %wide.load, i32 0
+ %l1 = extractelement <4 x double> %wide.load, i32 1
+ %l2 = extractelement <4 x double> %wide.load, i32 2
+ %l3 = extractelement <4 x double> %wide.load, i32 3
+ %a0 = fadd double %l0, %l1
+ %a1 = fadd double %l2, %l3
+ %r = fadd double %a0, %a1
+ ret double %r
+}
+
+
+define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
+; CHECK-LABEL: scalarize_into_load:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldp q1, q0, [x1]
+; CHECK-NEXT: ldp q3, q2, [x1, #96]
+; CHECK-NEXT: ldp q5, q4, [x1, #64]
+; CHECK-NEXT: ldp q7, q6, [x1, #32]
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: mov x1, v3.d[1]
+; CHECK-NEXT: mov x4, v2.d[1]
+; CHECK-NEXT: mov x16, v5.d[1]
+; CHECK-NEXT: mov x18, v4.d[1]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: mov x12, v7.d[1]
+; CHECK-NEXT: mov x14, v6.d[1]
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: fmov x13, d7
+; CHECK-NEXT: fmov x15, d6
+; CHECK-NEXT: fmov x17, d5
+; CHECK-NEXT: fmov x0, d4
+; CHECK-NEXT: fmov x3, d3
+; CHECK-NEXT: fmov x5, d2
+; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
+; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
+; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
+; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
+; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
+; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
+; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
+; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
+; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
+; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
+; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
+; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
+; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s2, s4, s5
+; CHECK-NEXT: fadd s3, s6, s7
+; CHECK-NEXT: fadd s4, s16, s17
+; CHECK-NEXT: fadd s5, s18, s19
+; CHECK-NEXT: fadd s6, s20, s21
+; CHECK-NEXT: fadd s7, s22, s23
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s2, s4, s5
+; CHECK-NEXT: fadd s3, s6, s7
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+entry:
+ %wide.load = load <16 x i64>, ptr %23, align 4
+ %25 = extractelement <16 x i64> %wide.load, i32 0
+ %26 = getelementptr inbounds float, ptr %rawA, i64 %25
+ %27 = extractelement <16 x i64> %wide.load, i32 1
+ %28 = getelementptr inbounds float, ptr %rawA, i64 %27
+ %29 = extractelement <16 x i64> %wide.load, i32 2
+ %30 = getelementptr inbounds float, ptr %rawA, i64 %29
+ %31 = extractelement <16 x i64> %wide.load, i32 3
+ %32 = getelementptr inbounds float, ptr %rawA, i64 %31
+ %33 = extractelement <16 x i64> %wide.load, i32 4
+ %34 = getelementptr inbounds float, ptr %rawA, i64 %33
+ %35 = extractelement <16 x i64> %wide.load, i32 5
+ %36 = getelementptr inbounds float, ptr %rawA, i64 %35
+ %37 = extractelement <16 x i64> %wide.load, i32 6
+ %38 = getelementptr inbounds float, ptr %rawA, i64 %37
+ %39 = extractelement <16 x i64> %wide.load, i32 7
+ %40 = getelementptr inbounds float, ptr %rawA, i64 %39
+ %41 = extractelement <16 x i64> %wide.load, i32 8
+ %42 = getelementptr inbounds float, ptr %rawA, i64 %41
+ %43 = extractelement <16 x i64> %wide.load, i32 9
+ %44 = getelementptr inbounds float, ptr %rawA, i64 %43
+ %45 = extractelement <16 x i64> %wide.load, i32 10
+ %46 = getelementptr inbounds float, ptr %rawA, i64 %45
+ %47 = extractelement <16 x i64> %wide.load, i32 11
+ %48 = getelementptr inbounds float, ptr %rawA, i64 %47
+ %49 = extractelement <16 x i64> %wide.load, i32 12
+ %50 = getelementptr inbounds float, ptr %rawA, i64 %49
+ %51 = extractelement <16 x i64> %wide.load, i32 13
+ %52 = getelementptr inbounds float, ptr %rawA, i64 %51
+ %53 = extractelement <16 x i64> %wide.load, i32 14
+ %54 = getelementptr inbounds float, ptr %rawA, i64 %53
+ %55 = extractelement <16 x i64> %wide.load, i32 15
+ %56 = getelementptr inbounds float, ptr %rawA, i64 %55
+ %59 = load float, ptr %26, align 4
+ %60 = load float, ptr %28, align 4
+ %61 = load float, ptr %30, align 4
+ %62 = load float, ptr %32, align 4
+ %63 = load float, ptr %34, align 4
+ %64 = load float, ptr %36, align 4
+ %65 = load float, ptr %38, align 4
+ %66 = load float, ptr %40, align 4
+ %67 = load float, ptr %42, align 4
+ %68 = load float, ptr %44, align 4
+ %69 = load float, ptr %46, align 4
+ %70 = load float, ptr %48, align 4
+ %71 = load float, ptr %50, align 4
+ %72 = load float, ptr %52, align 4
+ %73 = load float, ptr %54, align 4
+ %74 = load float, ptr %56, align 4
+ %a1 = fadd float %59, %60
+ %a2 = fadd float %61, %62
+ %a3 = fadd float %63, %64
+ %a4 = fadd float %65, %66
+ %a5 = fadd float %67, %68
+ %a6 = fadd float %69, %70
+ %a7 = fadd float %71, %72
+ %a8 = fadd float %73, %74
+ %a9 = fadd float %a1, %a2
+ %a10 = fadd float %a3, %a4
+ %a11 = fadd float %a5, %a6
+ %a12 = fadd float %a7, %a8
+ %a13 = fadd float %a9, %a10
+ %a14 = fadd float %a11, %a12
+ %a15 = fadd float %a13, %a14
+ ret float %a15
+}
+
+define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
+; CHECK-LABEL: scalarize_into_load_sext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: ldp q4, q1, [x1, #32]
+; CHECK-NEXT: sshll v3.2d, v0.2s, #0
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0
+; CHECK-NEXT: sshll2 v6.2d, v2.4s, #0
+; CHECK-NEXT: sshll2 v5.2d, v1.4s, #0
+; CHECK-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-NEXT: sshll2 v7.2d, v4.4s, #0
+; CHECK-NEXT: sshll v4.2d, v4.2s, #0
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: mov x14, v6.d[1]
+; CHECK-NEXT: mov x12, v2.d[1]
+; CHECK-NEXT: mov x1, v1.d[1]
+; CHECK-NEXT: mov x4, v5.d[1]
+; CHECK-NEXT: mov x16, v4.d[1]
+; CHECK-NEXT: mov x18, v7.d[1]
+; CHECK-NEXT: fmov x9, d3
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: fmov x13, d2
+; CHECK-NEXT: fmov x15, d6
+; CHECK-NEXT: fmov x17, d4
+; CHECK-NEXT: fmov x0, d7
+; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
+; CHECK-NEXT: fmov x3, d1
+; CHECK-NEXT: fmov x5, d5
+; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
+; CHECK-NEXT: ldr s1, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
+; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
+; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
+; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
+; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
+; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
+; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
+; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
+; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
+; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
+; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s1, s1, s3
+; CHECK-NEXT: fadd s2, s4, s5
+; CHECK-NEXT: fadd s3, s6, s7
+; CHECK-NEXT: fadd s4, s16, s17
+; CHECK-NEXT: fadd s5, s18, s19
+; CHECK-NEXT: fadd s6, s20, s21
+; CHECK-NEXT: fadd s7, s22, s23
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s2, s4, s5
+; CHECK-NEXT: fadd s3, s6, s7
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+entry:
+ %wide.load = load <16 x i32>, ptr %23, align 4
+ %24 = sext <16 x i32> %wide.load to <16 x i64>
+ %25 = extractelement <16 x i64> %24, i32 0
+ %26 = getelementptr inbounds float, ptr %rawA, i64 %25
+ %27 = extractelement <16 x i64> %24, i32 1
+ %28 = getelementptr inbounds float, ptr %rawA, i64 %27
+ %29 = extractelement <16 x i64> %24, i32 2
+ %30 = getelementptr inbounds float, ptr %rawA, i64 %29
+ %31 = extractelement <16 x i64> %24, i32 3
+ %32 = getelementptr inbounds float, ptr %rawA, i64 %31
+ %33 = extractelement <16 x i64> %24, i32 4
+ %34 = getelementptr inbounds float, ptr %rawA, i64 %33
+ %35 = extractelement <16 x i64> %24, i32 5
+ %36 = getelementptr inbounds float, ptr %rawA, i64 %35
+ %37 = extractelement <16 x i64> %24, i32 6
+ %38 = getelementptr inbounds float, ptr %rawA, i64 %37
+ %39 = extractelement <16 x i64> %24, i32 7
+ %40 = getelementptr inbounds float, ptr %rawA, i64 %39
+ %41 = extractelement <16 x i64> %24, i32 8
+ %42 = getelementptr inbounds float, ptr %rawA, i64 %41
+ %43 = extractelement <16 x i64> %24, i32 9
+ %44 = getelementptr inbounds float, ptr %rawA, i64 %43
+ %45 = extractelement <16 x i64> %24, i32 10
+ %46 = getelementptr inbounds float, ptr %rawA, i64 %45
+ %47 = extractelement <16 x i64> %24, i32 11
+ %48 = getelementptr inbounds float, ptr %rawA, i64 %47
+ %49 = extractelement <16 x i64> %24, i32 12
+ %50 = getelementptr inbounds float, ptr %rawA, i64 %49
+ %51 = extractelement <16 x i64> %24, i32 13
+ %52 = getelementptr inbounds float, ptr %rawA, i64 %51
+ %53 = extractelement <16 x i64> %24, i32 14
+ %54 = getelementptr inbounds float, ptr %rawA, i64 %53
+ %55 = extractelement <16 x i64> %24, i32 15
+ %56 = getelementptr inbounds float, ptr %rawA, i64 %55
+ %59 = load float, ptr %26, align 4
+ %60 = load float, ptr %28, align 4
+ %61 = load float, ptr %30, align 4
+ %62 = load float, ptr %32, align 4
+ %63 = load float, ptr %34, align 4
+ %64 = load float, ptr %36, align 4
+ %65 = load float, ptr %38, align 4
+ %66 = load float, ptr %40, align 4
+ %67 = load float, ptr %42, align 4
+ %68 = load float, ptr %44, align 4
+ %69 = load float, ptr %46, align 4
+ %70 = load float, ptr %48, align 4
+ %71 = load float, ptr %50, align 4
+ %72 = load float, ptr %52, align 4
+ %73 = load float, ptr %54, align 4
+ %74 = load float, ptr %56, align 4
+ %a1 = fadd float %59, %60
+ %a2 = fadd float %61, %62
+ %a3 = fadd float %63, %64
+ %a4 = fadd float %65, %66
+ %a5 = fadd float %67, %68
+ %a6 = fadd float %69, %70
+ %a7 = fadd float %71, %72
+ %a8 = fadd float %73, %74
+ %a9 = fadd float %a1, %a2
+ %a10 = fadd float %a3, %a4
+ %a11 = fadd float %a5, %a6
+ %a12 = fadd float %a7, %a8
+ %a13 = fadd float %a9, %a10
+ %a14 = fadd float %a11, %a12
+ %a15 = fadd float %a13, %a14
+ ret float %a15
+}
+
+define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
+; CHECK-LABEL: scalarize_into_load_zext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: ldp q4, q1, [x1, #32]
+; CHECK-NEXT: ushll v3.2d, v0.2s, #0
+; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-NEXT: ushll2 v6.2d, v2.4s, #0
+; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0
+; CHECK-NEXT: ushll v4.2d, v4.2s, #0
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: mov x14, v6.d[1]
+; CHECK-NEXT: mov x12, v2.d[1]
+; CHECK-NEXT: mov x1, v1.d[1]
+; CHECK-NEXT: mov x4, v5.d[1]
+; CHECK-NEXT: mov x16, v4.d[1]
+; CHECK-NEXT: mov x18, v7.d[1]
+; CHECK-NEXT: fmov x9, d3
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: fmov x13, d2
+; CHECK-NEXT: fmov x15, d6
+; CHECK-NEXT: fmov x17, d4
+; CHECK-NEXT: fmov x0, d7
+; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
+; CHECK-NEXT: fmov x3, d1
+; CHECK-NEXT: fmov x5, d5
+; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
+; CHECK-NEXT: ldr s1, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
+; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
+; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
+; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
+; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
+; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
+; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
+; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
+; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
+; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
+; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s1, s1, s3
+; CHECK-NEXT: fadd s2, s4, s5
+; CHECK-NEXT: fadd s3, s6, s7
+; CHECK-NEXT: fadd s4, s16, s17
+; CHECK-NEXT: fadd s5, s18, s19
+; CHECK-NEXT: fadd s6, s20, s21
+; CHECK-NEXT: fadd s7, s22, s23
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s2, s4, s5
+; CHECK-NEXT: fadd s3, s6, s7
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+entry:
+ %wide.load = load <16 x i32>, ptr %23, align 4
+ %24 = zext <16 x i32> %wide.load to <16 x i64>
+ %25 = extractelement <16 x i64> %24, i32 0
+ %26 = getelementptr inbounds float, ptr %rawA, i64 %25
+ %27 = extractelement <16 x i64> %24, i32 1
+ %28 = getelementptr inbounds float, ptr %rawA, i64 %27
+ %29 = extractelement <16 x i64> %24, i32 2
+ %30 = getelementptr inbounds float, ptr %rawA, i64 %29
+ %31 = extractelement <16 x i64> %24, i32 3
+ %32 = getelementptr inbounds float, ptr %rawA, i64 %31
+ %33 = extractelement <16 x i64> %24, i32 4
+ %34 = getelementptr inbounds float, ptr %rawA, i64 %33
+ %35 = extractelement <16 x i64> %24, i32 5
+ %36 = getelementptr inbounds float, ptr %rawA, i64 %35
+ %37 = extractelement <16 x i64> %24, i32 6
+ %38 = getelementptr inbounds float, ptr %rawA, i64 %37
+ %39 = extractelement <16 x i64> %24, i32 7
+ %40 = getelementptr inbounds float, ptr %rawA, i64 %39
+ %41 = extractelement <16 x i64> %24, i32 8
+ %42 = getelementptr inbounds float, ptr %rawA, i64 %41
+ %43 = extractelement <16 x i64> %24, i32 9
+ %44 = getelementptr inbounds float, ptr %rawA, i64 %43
+ %45 = extractelement <16 x i64> %24, i32 10
+ %46 = getelementptr inbounds float, ptr %rawA, i64 %45
+ %47 = extractelement <16 x i64> %24, i32 11
+ %48 = getelementptr inbounds float, ptr %rawA, i64 %47
+ %49 = extractelement <16 x i64> %24, i32 12
+ %50 = getelementptr inbounds float, ptr %rawA, i64 %49
+ %51 = extractelement <16 x i64> %24, i32 13
+ %52 = getelementptr inbounds float, ptr %rawA, i64 %51
+ %53 = extractelement <16 x i64> %24, i32 14
+ %54 = getelementptr inbounds float, ptr %rawA, i64 %53
+ %55 = extractelement <16 x i64> %24, i32 15
+ %56 = getelementptr inbounds float, ptr %rawA, i64 %55
+ %59 = load float, ptr %26, align 4
+ %60 = load float, ptr %28, align 4
+ %61 = load float, ptr %30, align 4
+ %62 = load float, ptr %32, align 4
+ %63 = load float, ptr %34, align 4
+ %64 = load float, ptr %36, align 4
+ %65 = load float, ptr %38, align 4
+ %66 = load float, ptr %40, align 4
+ %67 = load float, ptr %42, align 4
+ %68 = load float, ptr %44, align 4
+ %69 = load float, ptr %46, align 4
+ %70 = load float, ptr %48, align 4
+ %71 = load float, ptr %50, align 4
+ %72 = load float, ptr %52, align 4
+ %73 = load float, ptr %54, align 4
+ %74 = load float, ptr %56, align 4
+ %a1 = fadd float %59, %60
+ %a2 = fadd float %61, %62
+ %a3 = fadd float %63, %64
+ %a4 = fadd float %65, %66
+ %a5 = fadd float %67, %68
+ %a6 = fadd float %69, %70
+ %a7 = fadd float %71, %72
+ %a8 = fadd float %73, %74
+ %a9 = fadd float %a1, %a2
+ %a10 = fadd float %a3, %a4
+ %a11 = fadd float %a5, %a6
+ %a12 = fadd float %a7, %a8
+ %a13 = fadd float %a9, %a10
+ %a14 = fadd float %a11, %a12
+ %a15 = fadd float %a13, %a14
+ ret float %a15
+}
+
+
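
As the NOTE lines indicate, the assertions in both files were generated
with utils/update_llc_test_checks.py; if the tests are modified, the
CHECK lines can be regenerated by rerunning that script on the test
file, e.g. (the build path here is illustrative):

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
    llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
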