[llvm] [RISCV] Split long build_vector sequences to reduce critical path (PR #81312)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 13 13:06:15 PST 2024
================
@@ -1181,89 +1181,46 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
-; RV32-LABEL: buildvec_v16i8_loads_contigous:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: lbu a1, 1(a0)
-; RV32-NEXT: lbu a2, 2(a0)
-; RV32-NEXT: lbu a3, 3(a0)
-; RV32-NEXT: lbu a4, 4(a0)
-; RV32-NEXT: lbu a5, 5(a0)
-; RV32-NEXT: lbu a6, 6(a0)
-; RV32-NEXT: lbu a7, 7(a0)
-; RV32-NEXT: lbu t0, 8(a0)
-; RV32-NEXT: lbu t1, 9(a0)
-; RV32-NEXT: lbu t2, 10(a0)
-; RV32-NEXT: lbu t3, 11(a0)
-; RV32-NEXT: lbu t4, 12(a0)
-; RV32-NEXT: lbu t5, 13(a0)
-; RV32-NEXT: lbu t6, 14(a0)
-; RV32-NEXT: lbu s0, 15(a0)
-; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vlse8.v v8, (a0), zero
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: vslide1down.vx v8, v8, a4
-; RV32-NEXT: vslide1down.vx v8, v8, a5
-; RV32-NEXT: vslide1down.vx v8, v8, a6
-; RV32-NEXT: vslide1down.vx v8, v8, a7
-; RV32-NEXT: vslide1down.vx v8, v8, t0
-; RV32-NEXT: vslide1down.vx v8, v8, t1
-; RV32-NEXT: vslide1down.vx v8, v8, t2
-; RV32-NEXT: vslide1down.vx v8, v8, t3
-; RV32-NEXT: vslide1down.vx v8, v8, t4
-; RV32-NEXT: vslide1down.vx v8, v8, t5
-; RV32-NEXT: vslide1down.vx v8, v8, t6
-; RV32-NEXT: vslide1down.vx v8, v8, s0
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: buildvec_v16i8_loads_contigous:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
-; RV64-NEXT: lbu a1, 1(a0)
-; RV64-NEXT: lbu a2, 2(a0)
-; RV64-NEXT: lbu a3, 3(a0)
-; RV64-NEXT: lbu a4, 4(a0)
-; RV64-NEXT: lbu a5, 5(a0)
-; RV64-NEXT: lbu a6, 6(a0)
-; RV64-NEXT: lbu a7, 7(a0)
-; RV64-NEXT: lbu t0, 8(a0)
-; RV64-NEXT: lbu t1, 9(a0)
-; RV64-NEXT: lbu t2, 10(a0)
-; RV64-NEXT: lbu t3, 11(a0)
-; RV64-NEXT: lbu t4, 12(a0)
-; RV64-NEXT: lbu t5, 13(a0)
-; RV64-NEXT: lbu t6, 14(a0)
-; RV64-NEXT: lbu s0, 15(a0)
-; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vlse8.v v8, (a0), zero
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: vslide1down.vx v8, v8, a4
-; RV64-NEXT: vslide1down.vx v8, v8, a5
-; RV64-NEXT: vslide1down.vx v8, v8, a6
-; RV64-NEXT: vslide1down.vx v8, v8, a7
-; RV64-NEXT: vslide1down.vx v8, v8, t0
-; RV64-NEXT: vslide1down.vx v8, v8, t1
-; RV64-NEXT: vslide1down.vx v8, v8, t2
-; RV64-NEXT: vslide1down.vx v8, v8, t3
-; RV64-NEXT: vslide1down.vx v8, v8, t4
-; RV64-NEXT: vslide1down.vx v8, v8, t5
-; RV64-NEXT: vslide1down.vx v8, v8, t6
-; RV64-NEXT: vslide1down.vx v8, v8, s0
-; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: ret
+; CHECK-LABEL: buildvec_v16i8_loads_contigous:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lbu a1, 1(a0)
+; CHECK-NEXT: lbu a2, 2(a0)
+; CHECK-NEXT: lbu a3, 3(a0)
+; CHECK-NEXT: lbu a4, 4(a0)
+; CHECK-NEXT: lbu a5, 5(a0)
+; CHECK-NEXT: lbu a6, 6(a0)
+; CHECK-NEXT: lbu a7, 7(a0)
+; CHECK-NEXT: lbu t0, 9(a0)
+; CHECK-NEXT: lbu t1, 10(a0)
+; CHECK-NEXT: lbu t2, 11(a0)
+; CHECK-NEXT: lbu t3, 12(a0)
+; CHECK-NEXT: lbu t4, 13(a0)
+; CHECK-NEXT: lbu t5, 14(a0)
+; CHECK-NEXT: lbu t6, 15(a0)
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), zero
+; CHECK-NEXT: addi a0, a0, 8
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vslide1down.vx v8, v8, a4
+; CHECK-NEXT: vlse8.v v9, (a0), zero
+; CHECK-NEXT: vslide1down.vx v8, v8, a5
+; CHECK-NEXT: vslide1down.vx v8, v8, a6
+; CHECK-NEXT: vslide1down.vx v10, v8, a7
+; CHECK-NEXT: vslide1down.vx v8, v9, t0
+; CHECK-NEXT: vslide1down.vx v8, v8, t1
+; CHECK-NEXT: vslide1down.vx v8, v8, t2
+; CHECK-NEXT: vslide1down.vx v8, v8, t3
+; CHECK-NEXT: vslide1down.vx v8, v8, t4
+; CHECK-NEXT: vslide1down.vx v8, v8, t5
+; CHECK-NEXT: vslide1down.vx v8, v8, t6
+; CHECK-NEXT: li a0, 255
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; CHECK-NEXT: vslidedown.vi v8, v10, 8, v0.t
----------------
preames wrote:
Agreed, but I don't think having this done via masking is problematic either. I see this as a low-priority follow-up.
https://github.com/llvm/llvm-project/pull/81312
More information about the llvm-commits
mailing list