[llvm] [RISCV] Vectorize phi for loop carried @llvm.vector.reduce.fadd (PR #78244)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 17 07:37:17 PST 2024
================
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+declare i64 @llvm.vscale.i64()
+declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+
+define float @reduce_fadd(ptr nocapture noundef readonly %f, i32 noundef signext %N) {
+; CHECK-LABEL: reduce_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: blez a1, .LBB0_3
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: addi sp, sp, -48
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset ra, -8
+; CHECK-NEXT: .cfi_offset s0, -16
+; CHECK-NEXT: .cfi_offset s1, -24
+; CHECK-NEXT: .cfi_offset s2, -32
+; CHECK-NEXT: .cfi_offset s3, -40
+; CHECK-NEXT: csrr s1, vlenb
+; CHECK-NEXT: srli s0, s1, 1
+; CHECK-NEXT: bgeu a1, s0, .LBB0_4
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: j .LBB0_7
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_4: # %vector.ph
+; CHECK-NEXT: srli a2, s1, 3
+; CHECK-NEXT: lui a3, 524288
+; CHECK-NEXT: addiw a3, a3, -4
+; CHECK-NEXT: mv s2, a0
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv s3, a1
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: call __muldi3
+; CHECK-NEXT: mv a1, s3
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: mv a0, s2
+; CHECK-NEXT: and a2, a2, s3
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: slli a3, s1, 1
+; CHECK-NEXT: mv a4, s2
+; CHECK-NEXT: mv a5, a2
+; CHECK-NEXT: .LBB0_5: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vl2re32.v v10, (a4)
+; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v10, v8
+; CHECK-NEXT: sub a5, a5, s0
+; CHECK-NEXT: add a4, a4, a3
+; CHECK-NEXT: bnez a5, .LBB0_5
+; CHECK-NEXT: # %bb.6: # %middle.block
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: beq a2, a1, .LBB0_9
+; CHECK-NEXT: .LBB0_7: # %for.body.preheader7
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: .LBB0_8: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flw fa5, 0(a2)
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: fadd.s fa0, fa0, fa5
+; CHECK-NEXT: bne a2, a0, .LBB0_8
+; CHECK-NEXT: .LBB0_9:
+; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s3, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT: addi sp, sp, 48
+; CHECK-NEXT: ret
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
----------------
preames wrote:
For both of these tests, you don't need the whole loop structure emitted by the loop vectorizer. You just need the main vector loop.
https://github.com/llvm/llvm-project/pull/78244
More information about the llvm-commits mailing list