[llvm] [AArch64][SVE] Support lowering fixed-length BUILD_VECTORS to ZIPs (PR #111698)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 06:06:19 PDT 2024
================
@@ -222,3 +222,258 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
ret void
}
+
+define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr w8, w0, w1, lsl #1
+; CHECK-NEXT: orr w8, w8, w2, lsl #2
+; CHECK-NEXT: orr w8, w8, w3, lsl #3
+; CHECK-NEXT: strb w8, [x4]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: orr w8, w0, w1, lsl #1
+; NONEON-NOSVE-NEXT: orr w8, w8, w2, lsl #2
+; NONEON-NOSVE-NEXT: orr w8, w8, w3, lsl #3
+; NONEON-NOSVE-NEXT: strb w8, [x4]
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+ %2 = insertelement <4 x i1> %1, i1 %b, i64 1
+ %3 = insertelement <4 x i1> %2, i1 %c, i64 2
+ %4 = insertelement <4 x i1> %3, i1 %d, i64 3
+ store <4 x i1> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x double> undef, double %a, i64 0
+ %2 = insertelement <2 x double> %1, double %b, i64 1
+ store <2 x double> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x float> undef, float %a, i64 0
+ %2 = insertelement <2 x float> %1, float %b, i64 1
+ store <2 x float> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: // kill: def $s3 killed $s3 def $z3
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1
+; CHECK-NEXT: zip1 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp s2, s3, [sp, #8]
+; NONEON-NOSVE-NEXT: stp s0, s1, [sp]
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x float> undef, float %a, i64 0
+ %2 = insertelement <4 x float> %1, float %b, i64 1
+ %3 = insertelement <4 x float> %2, float %c, i64 2
+ %4 = insertelement <4 x float> %3, float %d, i64 3
+ store <4 x float> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $z3
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z2.d, z2.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: stp q0, q2, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT: stp d2, d3, [sp, #16]
+; NONEON-NOSVE-NEXT: ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #32
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <4 x double> undef, double %a, i64 0
+ %2 = insertelement <4 x double> %1, double %b, i64 1
+ %3 = insertelement <4 x double> %2, double %c, i64 2
+ %4 = insertelement <4 x double> %3, double %d, i64 3
+ store <4 x double> %4, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h6 killed $h6 def $z6
+; CHECK-NEXT: // kill: def $h4 killed $h4 def $z4
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: // kill: def $h7 killed $h7 def $z7
+; CHECK-NEXT: // kill: def $h5 killed $h5 def $z5
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $z3
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $z1
+; CHECK-NEXT: zip1 z6.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z4.h, z4.h, z5.h
+; CHECK-NEXT: zip1 z2.h, z2.h, z3.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: zip1 z1.s, z4.s, z6.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: str h7, [sp, #14]
+; NONEON-NOSVE-NEXT: str h6, [sp, #12]
+; NONEON-NOSVE-NEXT: str h5, [sp, #10]
+; NONEON-NOSVE-NEXT: str h4, [sp, #8]
+; NONEON-NOSVE-NEXT: str h3, [sp, #6]
+; NONEON-NOSVE-NEXT: str h2, [sp, #4]
+; NONEON-NOSVE-NEXT: str h1, [sp, #2]
+; NONEON-NOSVE-NEXT: str h0, [sp]
+; NONEON-NOSVE-NEXT: ldr q0, [sp]
+; NONEON-NOSVE-NEXT: str q0, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <8 x half> undef, half %a, i64 0
+ %2 = insertelement <8 x half> %1, half %b, i64 1
+ %3 = insertelement <8 x half> %2, half %c, i64 2
+ %4 = insertelement <8 x half> %3, half %d, i64 3
+ %5 = insertelement <8 x half> %4, half %e, i64 4
+ %6 = insertelement <8 x half> %5, half %f, i64 5
+ %7 = insertelement <8 x half> %6, half %g, i64 6
+ %8 = insertelement <8 x half> %7, half %h, i64 7
+ store <8 x half> %8, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: str d0, [x2]
+; CHECK-NEXT: ret
+;
+; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT: stp w0, w1, [sp, #8]
+; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT: str d0, [x2]
+; NONEON-NOSVE-NEXT: add sp, sp, #16
+; NONEON-NOSVE-NEXT: ret
+ %1 = insertelement <2 x i32> undef, i32 %a, i64 0
+ %2 = insertelement <2 x i32> %1, i32 %b, i64 1
+ store <2 x i32> %2, ptr %out
+ ret void
+}
+
+define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) {
+; CHECK-LABEL: build_vector_non_const_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w7
+; CHECK-NEXT: fmov s1, w6
+; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: fmov s2, w4
+; CHECK-NEXT: fmov s3, w3
+; CHECK-NEXT: fmov s4, w2
+; CHECK-NEXT: fmov s5, w1
+; CHECK-NEXT: fmov s6, w0
+; CHECK-NEXT: zip1 z0.b, z1.b, z0.b
+; CHECK-NEXT: fmov s1, w5
+; CHECK-NEXT: zip1 z1.b, z2.b, z1.b
+; CHECK-NEXT: zip1 z2.b, z4.b, z3.b
+; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
+; CHECK-NEXT: zip1 z0.h, z1.h, z0.h
+; CHECK-NEXT: zip1 z1.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: ret
----------------
MacDue wrote:
I have now added an (arbitrary) limit of at most four elements for integer types, which seems alright. However, I still allow integer elements that come directly from vector extracts (which happens a few times), since the `fmov`s are not needed in that case.
https://github.com/llvm/llvm-project/pull/111698
More information about the llvm-commits mailing list