[llvm] [SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands (PR #147583)

Thu Jul 10 07:49:11 PDT 2025

asb wrote:

The changed translation unit from the Blur microbenchmark is quite small, so isolating that:

tc.ll:
```llvm
; ModuleID = 'gaussianBlurKernel.bc'
source_filename = "./MicroBenchmarks/ImageProcessing/Blur/gaussianBlurKernel.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

; Function Attrs: nounwind uwtable vscale_range(2,1024)
; gaussianBlurKernel(height, width, inputImage, outputImage)
;
; Unoptimized IR: every local variable lives in its own alloca and is reloaded
; before each use. The structure visible below is:
;   1. entry: spill the arguments, set sigma = 9.0, s = 2 * sigma * sigma,
;      offset = 4, sum = 0, and zero-fill the [9 x [9 x float]] gaussianFilter.
;   2. for.cond .. for.end33: x and y each run from -offset to offset
;      (inclusive). Each step stores
;        exp(-(x*x + y*y) / s) / (0x400921FB54442D18 * s)
;      into gaussianFilter[x + offset][y + offset] and adds that value to sum.
;      (0x400921FB54442D18 is the double closest to pi.)
;   3. for.cond34 .. for.end90: i runs over [offset, height - offset) and j over
;      [offset, width - offset). For each (i, j), k and l run from -offset to
;      offset accumulating
;        (float)inputImage[(i + k) * width + (j + l)] * gaussianFilter[k + offset][l + offset]
;      into sum_in_current_frame, and then
;        (int)(sum_in_current_frame / sum)
;      is stored to outputImage[i * width + j].
; %cleanup.dest.slot is written in every loop-cleanup block but never loaded in
; this function.
define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef %inputImage, ptr noundef %outputImage) #0 {
entry:
  ; Stack slots for the arguments and for every local variable.
  %height.addr = alloca i32, align 4
  %width.addr = alloca i32, align 4
  %inputImage.addr = alloca ptr, align 8
  %outputImage.addr = alloca ptr, align 8
  %sigma = alloca float, align 4
  %s = alloca float, align 4
  %offset = alloca i32, align 4
  %sum = alloca float, align 4
  %gaussianFilter = alloca [9 x [9 x float]], align 4
  %x = alloca i32, align 4
  %cleanup.dest.slot = alloca i32, align 4
  %y = alloca i32, align 4
  %sum_in_current_frame = alloca float, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %k = alloca i32, align 4
  %l = alloca i32, align 4
  store i32 %height, ptr %height.addr, align 4, !tbaa !9
  store i32 %width, ptr %width.addr, align 4, !tbaa !9
  store ptr %inputImage, ptr %inputImage.addr, align 8, !tbaa !13
  store ptr %outputImage, ptr %outputImage.addr, align 8, !tbaa !13
  ; Zero-extended height/width. Only %3 (row stride for inputImage) and %7 (row
  ; stride for outputImage) are used later; %1 and %5 have no further uses.
  %0 = load i32, ptr %height.addr, align 4, !tbaa !9
  %1 = zext i32 %0 to i64
  %2 = load i32, ptr %width.addr, align 4, !tbaa !9
  %3 = zext i32 %2 to i64
  %4 = load i32, ptr %height.addr, align 4, !tbaa !9
  %5 = zext i32 %4 to i64
  %6 = load i32, ptr %width.addr, align 4, !tbaa !9
  %7 = zext i32 %6 to i64
  call void @llvm.lifetime.start.p0(i64 4, ptr %sigma) #4
  store float 9.000000e+00, ptr %sigma, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %s) #4
  ; s = (float)(2.0 * (double)sigma * (double)sigma)
  %8 = load float, ptr %sigma, align 4, !tbaa !16
  %conv = fpext float %8 to double
  %mul = fmul double 2.000000e+00, %conv
  %9 = load float, ptr %sigma, align 4, !tbaa !16
  %conv1 = fpext float %9 to double
  %mul2 = fmul double %mul, %conv1
  %conv3 = fptrunc double %mul2 to float
  store float %conv3, ptr %s, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %offset) #4
  store i32 4, ptr %offset, align 4, !tbaa !9
  call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #4
  store float 0.000000e+00, ptr %sum, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 324, ptr %gaussianFilter) #4
  ; 324 bytes = 9 * 9 * 4: clears the whole filter table.
  call void @llvm.memset.p0.i64(ptr align 4 %gaussianFilter, i8 0, i64 324, i1 false)
  call void @llvm.lifetime.start.p0(i64 4, ptr %x) #4
  ; x = -offset
  %10 = load i32, ptr %offset, align 4, !tbaa !9
  %mul4 = mul nsw i32 -1, %10
  store i32 %mul4, ptr %x, align 4, !tbaa !9
  br label %for.cond

for.cond:                                         ; preds = %for.inc31, %entry
  ; Outer filter-building loop: continue while x <= offset.
  %11 = load i32, ptr %x, align 4, !tbaa !9
  %12 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp = icmp sle i32 %11, %12
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  store i32 2, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %x) #4
  br label %for.end33

for.body:                                         ; preds = %for.cond
  call void @llvm.lifetime.start.p0(i64 4, ptr %y) #4
  ; y = -offset
  %13 = load i32, ptr %offset, align 4, !tbaa !9
  %mul6 = mul nsw i32 -1, %13
  store i32 %mul6, ptr %y, align 4, !tbaa !9
  br label %for.cond7

for.cond7:                                        ; preds = %for.inc, %for.body
  ; Inner filter-building loop: continue while y <= offset.
  %14 = load i32, ptr %y, align 4, !tbaa !9
  %15 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp8 = icmp sle i32 %14, %15
  br i1 %cmp8, label %for.body11, label %for.cond.cleanup10

for.cond.cleanup10:                               ; preds = %for.cond7
  store i32 5, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %y) #4
  br label %for.end

for.body11:                                       ; preds = %for.cond7
  ; %sub = -(x*x + y*y)
  %16 = load i32, ptr %x, align 4, !tbaa !9
  %17 = load i32, ptr %x, align 4, !tbaa !9
  %mul12 = mul nsw i32 %16, %17
  %18 = load i32, ptr %y, align 4, !tbaa !9
  %19 = load i32, ptr %y, align 4, !tbaa !9
  %mul13 = mul nsw i32 %18, %19
  %add = add nsw i32 %mul12, %mul13
  %sub = sub nsw i32 0, %add
  %conv14 = sitofp i32 %sub to float
  ; %conv19 = (float)(exp((double)(%sub / s)) / (pi * (double)s))
  %20 = load float, ptr %s, align 4, !tbaa !16
  %div = fdiv float %conv14, %20
  %conv15 = fpext float %div to double
  %call = call double @exp(double noundef %conv15) #4, !tbaa !9
  %21 = load float, ptr %s, align 4, !tbaa !16
  %conv16 = fpext float %21 to double
  %mul17 = fmul double 0x400921FB54442D18, %conv16
  %div18 = fdiv double %call, %mul17
  %conv19 = fptrunc double %div18 to float
  ; gaussianFilter[x + offset][y + offset] = %conv19
  %22 = load i32, ptr %x, align 4, !tbaa !9
  %23 = load i32, ptr %offset, align 4, !tbaa !9
  %add20 = add nsw i32 %22, %23
  %idxprom = sext i32 %add20 to i64
  %arrayidx = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom
  %24 = load i32, ptr %y, align 4, !tbaa !9
  %25 = load i32, ptr %offset, align 4, !tbaa !9
  %add21 = add nsw i32 %24, %25
  %idxprom22 = sext i32 %add21 to i64
  %arrayidx23 = getelementptr inbounds [9 x float], ptr %arrayidx, i64 0, i64 %idxprom22
  store float %conv19, ptr %arrayidx23, align 4, !tbaa !16
  ; sum += gaussianFilter[x + offset][y + offset] (the element is reloaded)
  %26 = load i32, ptr %x, align 4, !tbaa !9
  %27 = load i32, ptr %offset, align 4, !tbaa !9
  %add24 = add nsw i32 %26, %27
  %idxprom25 = sext i32 %add24 to i64
  %arrayidx26 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom25
  %28 = load i32, ptr %y, align 4, !tbaa !9
  %29 = load i32, ptr %offset, align 4, !tbaa !9
  %add27 = add nsw i32 %28, %29
  %idxprom28 = sext i32 %add27 to i64
  %arrayidx29 = getelementptr inbounds [9 x float], ptr %arrayidx26, i64 0, i64 %idxprom28
  %30 = load float, ptr %arrayidx29, align 4, !tbaa !16
  %31 = load float, ptr %sum, align 4, !tbaa !16
  %add30 = fadd float %31, %30
  store float %add30, ptr %sum, align 4, !tbaa !16
  br label %for.inc

for.inc:                                          ; preds = %for.body11
  ; ++y
  %32 = load i32, ptr %y, align 4, !tbaa !9
  %inc = add nsw i32 %32, 1
  store i32 %inc, ptr %y, align 4, !tbaa !9
  br label %for.cond7, !llvm.loop !18

for.end:                                          ; preds = %for.cond.cleanup10
  br label %for.inc31

for.inc31:                                        ; preds = %for.end
  ; ++x
  %33 = load i32, ptr %x, align 4, !tbaa !9
  %inc32 = add nsw i32 %33, 1
  store i32 %inc32, ptr %x, align 4, !tbaa !9
  br label %for.cond, !llvm.loop !20

for.end33:                                        ; preds = %for.cond.cleanup
  ; Filter table is complete; start the image loops with i = offset.
  call void @llvm.lifetime.start.p0(i64 4, ptr %sum_in_current_frame) #4
  store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #4
  %34 = load i32, ptr %offset, align 4, !tbaa !9
  store i32 %34, ptr %i, align 4, !tbaa !9
  br label %for.cond34

for.cond34:                                       ; preds = %for.inc88, %for.end33
  ; Row loop: continue while i < height - offset.
  %35 = load i32, ptr %i, align 4, !tbaa !9
  %36 = load i32, ptr %height.addr, align 4, !tbaa !9
  %37 = load i32, ptr %offset, align 4, !tbaa !9
  %sub35 = sub nsw i32 %36, %37
  %cmp36 = icmp slt i32 %35, %sub35
  br i1 %cmp36, label %for.body39, label %for.cond.cleanup38

for.cond.cleanup38:                               ; preds = %for.cond34
  store i32 8, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #4
  br label %for.end90

for.body39:                                       ; preds = %for.cond34
  call void @llvm.lifetime.start.p0(i64 4, ptr %j) #4
  ; j = offset
  %38 = load i32, ptr %offset, align 4, !tbaa !9
  store i32 %38, ptr %j, align 4, !tbaa !9
  br label %for.cond40

for.cond40:                                       ; preds = %for.inc85, %for.body39
  ; Column loop: continue while j < width - offset.
  %39 = load i32, ptr %j, align 4, !tbaa !9
  %40 = load i32, ptr %width.addr, align 4, !tbaa !9
  %41 = load i32, ptr %offset, align 4, !tbaa !9
  %sub41 = sub nsw i32 %40, %41
  %cmp42 = icmp slt i32 %39, %sub41
  br i1 %cmp42, label %for.body45, label %for.cond.cleanup44

for.cond.cleanup44:                               ; preds = %for.cond40
  store i32 11, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %j) #4
  br label %for.end87

for.body45:                                       ; preds = %for.cond40
  ; Reset the per-pixel accumulator, then k = -offset.
  store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
  call void @llvm.lifetime.start.p0(i64 4, ptr %k) #4
  %42 = load i32, ptr %offset, align 4, !tbaa !9
  %mul46 = mul nsw i32 -1, %42
  store i32 %mul46, ptr %k, align 4, !tbaa !9
  br label %for.cond47

for.cond47:                                       ; preds = %for.inc76, %for.body45
  ; Filter-row loop: continue while k <= offset.
  %43 = load i32, ptr %k, align 4, !tbaa !9
  %44 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp48 = icmp sle i32 %43, %44
  br i1 %cmp48, label %for.body51, label %for.cond.cleanup50

for.cond.cleanup50:                               ; preds = %for.cond47
  store i32 14, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %k) #4
  br label %for.end78

for.body51:                                       ; preds = %for.cond47
  call void @llvm.lifetime.start.p0(i64 4, ptr %l) #4
  ; l = -offset
  %45 = load i32, ptr %offset, align 4, !tbaa !9
  %mul52 = mul nsw i32 -1, %45
  store i32 %mul52, ptr %l, align 4, !tbaa !9
  br label %for.cond53

for.cond53:                                       ; preds = %for.inc73, %for.body51
  ; Filter-column loop: continue while l <= offset.
  %46 = load i32, ptr %l, align 4, !tbaa !9
  %47 = load i32, ptr %offset, align 4, !tbaa !9
  %cmp54 = icmp sle i32 %46, %47
  br i1 %cmp54, label %for.body57, label %for.cond.cleanup56

for.cond.cleanup56:                               ; preds = %for.cond53
  store i32 17, ptr %cleanup.dest.slot, align 4
  call void @llvm.lifetime.end.p0(i64 4, ptr %l) #4
  br label %for.end75

for.body57:                                       ; preds = %for.cond53
  ; %conv64 = (float)inputImage[(i + k) * width + (j + l)]
  %48 = load ptr, ptr %inputImage.addr, align 8, !tbaa !13
  %49 = load i32, ptr %i, align 4, !tbaa !9
  %50 = load i32, ptr %k, align 4, !tbaa !9
  %add58 = add nsw i32 %49, %50
  %idxprom59 = sext i32 %add58 to i64
  %51 = mul nsw i64 %idxprom59, %3
  %arrayidx60 = getelementptr inbounds i32, ptr %48, i64 %51
  %52 = load i32, ptr %j, align 4, !tbaa !9
  %53 = load i32, ptr %l, align 4, !tbaa !9
  %add61 = add nsw i32 %52, %53
  %idxprom62 = sext i32 %add61 to i64
  %arrayidx63 = getelementptr inbounds i32, ptr %arrayidx60, i64 %idxprom62
  %54 = load i32, ptr %arrayidx63, align 4, !tbaa !9
  %conv64 = sitofp i32 %54 to float
  ; %59 = gaussianFilter[k + offset][l + offset]
  %55 = load i32, ptr %k, align 4, !tbaa !9
  %56 = load i32, ptr %offset, align 4, !tbaa !9
  %add65 = add nsw i32 %55, %56
  %idxprom66 = sext i32 %add65 to i64
  %arrayidx67 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom66
  %57 = load i32, ptr %l, align 4, !tbaa !9
  %58 = load i32, ptr %offset, align 4, !tbaa !9
  %add68 = add nsw i32 %57, %58
  %idxprom69 = sext i32 %add68 to i64
  %arrayidx70 = getelementptr inbounds [9 x float], ptr %arrayidx67, i64 0, i64 %idxprom69
  %59 = load float, ptr %arrayidx70, align 4, !tbaa !16
  ; sum_in_current_frame += pixel * weight. The fmul and fadd carry no
  ; fast-math flags.
  %mul71 = fmul float %conv64, %59
  %60 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
  %add72 = fadd float %60, %mul71
  store float %add72, ptr %sum_in_current_frame, align 4, !tbaa !16
  br label %for.inc73

for.inc73:                                        ; preds = %for.body57
  ; ++l
  %61 = load i32, ptr %l, align 4, !tbaa !9
  %inc74 = add nsw i32 %61, 1
  store i32 %inc74, ptr %l, align 4, !tbaa !9
  br label %for.cond53, !llvm.loop !21

for.end75:                                        ; preds = %for.cond.cleanup56
  br label %for.inc76

for.inc76:                                        ; preds = %for.end75
  ; ++k
  %62 = load i32, ptr %k, align 4, !tbaa !9
  %inc77 = add nsw i32 %62, 1
  store i32 %inc77, ptr %k, align 4, !tbaa !9
  br label %for.cond47, !llvm.loop !22

for.end78:                                        ; preds = %for.cond.cleanup50
  ; outputImage[i * width + j] = (int)(sum_in_current_frame / sum)
  %63 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
  %64 = load float, ptr %sum, align 4, !tbaa !16
  %div79 = fdiv float %63, %64
  %conv80 = fptosi float %div79 to i32
  %65 = load ptr, ptr %outputImage.addr, align 8, !tbaa !13
  %66 = load i32, ptr %i, align 4, !tbaa !9
  %idxprom81 = sext i32 %66 to i64
  %67 = mul nsw i64 %idxprom81, %7
  %arrayidx82 = getelementptr inbounds i32, ptr %65, i64 %67
  %68 = load i32, ptr %j, align 4, !tbaa !9
  %idxprom83 = sext i32 %68 to i64
  %arrayidx84 = getelementptr inbounds i32, ptr %arrayidx82, i64 %idxprom83
  store i32 %conv80, ptr %arrayidx84, align 4, !tbaa !9
  br label %for.inc85

for.inc85:                                        ; preds = %for.end78
  ; ++j
  %69 = load i32, ptr %j, align 4, !tbaa !9
  %inc86 = add nsw i32 %69, 1
  store i32 %inc86, ptr %j, align 4, !tbaa !9
  br label %for.cond40, !llvm.loop !23

for.end87:                                        ; preds = %for.cond.cleanup44
  br label %for.inc88

for.inc88:                                        ; preds = %for.end87
  ; ++i
  %70 = load i32, ptr %i, align 4, !tbaa !9
  %inc89 = add nsw i32 %70, 1
  store i32 %inc89, ptr %i, align 4, !tbaa !9
  br label %for.cond34, !llvm.loop !24

for.end90:                                        ; preds = %for.cond.cleanup38
  ; End the lifetimes of the remaining locals and return.
  call void @llvm.lifetime.end.p0(i64 4, ptr %sum_in_current_frame) #4
  call void @llvm.lifetime.end.p0(i64 324, ptr %gaussianFilter) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %offset) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %s) #4
  call void @llvm.lifetime.end.p0(i64 4, ptr %sigma) #4
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none)) #1

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) #2

; Function Attrs: nounwind
declare double @exp(double noundef) #3

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none)) #1

; Attribute groups referenced above. Each group must stay on a single line: the
; "target-features" value is one quoted string, and a line break inside it
; splits a feature name in two.
; #0: attributes of the @gaussianBlurKernel definition.
attributes #0 = { nounwind uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
; #1: the @llvm.lifetime.start / @llvm.lifetime.end declarations.
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
; #2: the @llvm.memset declaration.
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
; #3: the @exp declaration.
attributes #3 = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
; #4: used at call sites inside @gaussianBlurKernel.
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 0}
!8 = !{!"clang version 21.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"int", !11, i64 0}
!11 = !{!"omnipotent char", !12, i64 0}
!12 = !{!"Simple C/C++ TBAA"}
!13 = !{!14, !14, i64 0}
!14 = !{!"p1 int", !15, i64 0}
!15 = !{!"any pointer", !11, i64 0}
!16 = !{!17, !17, i64 0}
!17 = !{!"float", !11, i64 0}
!18 = distinct !{!18, !19}
!19 = !{!"llvm.loop.mustprogress"}
!20 = distinct !{!20, !19}
!21 = distinct !{!21, !19}
!22 = distinct !{!22, !19}
!23 = distinct !{!23, !19}
!24 = distinct !{!24, !19}
```
And build with:
```
clang --target=riscv64-linux-gnu -march=rva23u64 -O3 -ffp-contract=off tc.ll -c -S -o -
```

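Looking at the dump of `-mllvm -debug -mllvm -print-after-all` for LLVM with/without this patch, the amount of logic dropped by SLP after this patch seems suspect. In the diff below, the accumulator phi `%sum_in_current_frame.0144.us` and the load/multiply/add chains for filter columns 0–5 are removed entirely; only columns 6–7 (through the new `llvm.vector.reduce.fadd` call) and column 8 still feed `%add72.us.8`.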

```diff
 ; *** IR Dump After SLPVectorizerPass on gaussianBlurKernel ***
 ; Function Attrs: nofree norecurse nounwind memory(argmem: readwrite, errnomem: write) uwtable vscale_range(2,1024)
 define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef readonly captures(none) %inputImage, ptr noundef writeonly captures(none) %outputImage) local_unnamed_addr #0 {
 entry:
   %gaussianFilter = alloca [9 x [9 x float]], align 4
-  call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #4
+  call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #5
   call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(324) %gaussianFilter, i8 0, i64 324, i1 false)
   br label %for.cond7.preheader
 
@@ -41577,7 +40757,7 @@
   %conv14 = sitofp i32 %3 to float
   %div = fdiv float %conv14, 1.620000e+02
   %conv15 = fpext float %div to double
-  %call = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+  %call = tail call double @exp(double noundef %conv15) #5, !tbaa !9
   %div18 = fdiv double %call, 0x407FCF0216A64912
   %conv19 = fptrunc double %div18 to float
   %arrayidx23 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 0
@@ -41588,7 +40768,7 @@
   %conv14.1 = sitofp i32 %5 to float
   %div.1 = fdiv float %conv14.1, 1.620000e+02
   %conv15.1 = fpext float %div.1 to double
-  %call.1 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+  %call.1 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
   %div18.1 = fdiv double %call.1, 0x407FCF0216A64912
   %conv19.1 = fptrunc double %div18.1 to float
   %arrayidx23.1 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 1
@@ -41599,7 +40779,7 @@
   %conv14.2 = sitofp i32 %7 to float
   %div.2 = fdiv float %conv14.2, 1.620000e+02
   %conv15.2 = fpext float %div.2 to double
-  %call.2 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+  %call.2 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
   %div18.2 = fdiv double %call.2, 0x407FCF0216A64912
   %conv19.2 = fptrunc double %div18.2 to float
   %arrayidx23.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 2
@@ -41610,7 +40790,7 @@
   %conv14.3 = sitofp i32 %9 to float
   %div.3 = fdiv float %conv14.3, 1.620000e+02
   %conv15.3 = fpext float %div.3 to double
-  %call.3 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+  %call.3 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
   %div18.3 = fdiv double %call.3, 0x407FCF0216A64912
   %conv19.3 = fptrunc double %div18.3 to float
   %arrayidx23.3 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 3
@@ -41621,31 +40801,31 @@
   %conv14.4 = sitofp i32 %11 to float
   %div.4 = fdiv float %conv14.4, 1.620000e+02
   %conv15.4 = fpext float %div.4 to double
-  %call.4 = tail call double @exp(double noundef %conv15.4) #4, !tbaa !9
+  %call.4 = tail call double @exp(double noundef %conv15.4) #5, !tbaa !9
   %div18.4 = fdiv double %call.4, 0x407FCF0216A64912
   %conv19.4 = fptrunc double %div18.4 to float
   %arrayidx23.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 4
   store float %conv19.4, ptr %arrayidx23.4, align 4, !tbaa !13
   %add30.4 = fadd float %add30.3, %conv19.4
-  %call.5 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+  %call.5 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
   %div18.5 = fdiv double %call.5, 0x407FCF0216A64912
   %conv19.5 = fptrunc double %div18.5 to float
   %arrayidx23.5 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 5
   store float %conv19.5, ptr %arrayidx23.5, align 4, !tbaa !13
   %add30.5 = fadd float %add30.4, %conv19.5
-  %call.6 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+  %call.6 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
   %div18.6 = fdiv double %call.6, 0x407FCF0216A64912
   %conv19.6 = fptrunc double %div18.6 to float
   %arrayidx23.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 6
   store float %conv19.6, ptr %arrayidx23.6, align 4, !tbaa !13
   %add30.6 = fadd float %add30.5, %conv19.6
-  %call.7 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+  %call.7 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
   %div18.7 = fdiv double %call.7, 0x407FCF0216A64912
   %conv19.7 = fptrunc double %div18.7 to float
   %arrayidx23.7 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 7
   store float %conv19.7, ptr %arrayidx23.7, align 4, !tbaa !13
   %add30.7 = fadd float %add30.6, %conv19.7
-  %call.8 = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+  %call.8 = tail call double @exp(double noundef %conv15) #5, !tbaa !9
   %div18.8 = fdiv double %call.8, 0x407FCF0216A64912
   %conv19.8 = fptrunc double %div18.8 to float
   %arrayidx23.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 8
@@ -41680,74 +40860,38 @@
   %conv80.us = fptosi float %div79.us to i32
   %arrayidx84.us = getelementptr inbounds nuw i32, ptr %arrayidx82.us, i64 %indvars.iv167
   store i32 %conv80.us, ptr %arrayidx84.us, align 4, !tbaa !9
-  %exitcond169.not = icmp eq i64 %47, %wide.trip.count
+  %exitcond169.not = icmp eq i64 %26, %wide.trip.count
   br i1 %exitcond169.not, label %for.cond40.for.cond.cleanup44_crit_edge.us, label %for.cond47.preheader.us, !llvm.loop !17
 
 for.cond53.preheader.us:                          ; preds = %for.cond47.preheader.us, %for.cond53.preheader.us
   %indvars.iv162 = phi i64 [ -4, %for.cond47.preheader.us ], [ %indvars.iv.next163, %for.cond53.preheader.us ]
-  %sum_in_current_frame.0144.us = phi float [ 0.000000e+00, %for.cond47.preheader.us ], [ %add72.us.8, %for.cond53.preheader.us ]
   %14 = add nsw i64 %indvars.iv162, %indvars.iv170
   %15 = mul nuw nsw i64 %14, %12
   %arrayidx60.us = getelementptr inbounds i32, ptr %inputImage, i64 %15
   %16 = add nsw i64 %indvars.iv162, 4
-  %17 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us = getelementptr i8, ptr %17, i64 -16
-  %arrayidx70.us = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 0
-  %18 = load <2 x i32>, ptr %arrayidx63.us, align 4, !tbaa !9
+  %17 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+  %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %17, i64 8
+  %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
+  %18 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
   %19 = sitofp <2 x i32> %18 to <2 x float>
-  %20 = load <2 x float>, ptr %arrayidx70.us, align 4, !tbaa !13
+  %20 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
   %21 = fmul <2 x float> %20, %19
-  %22 = extractelement <2 x float> %21, i32 0
-  %add72.us = fadd float %sum_in_current_frame.0144.us, %22
-  %23 = extractelement <2 x float> %21, i32 1
-  %add72.us.1 = fadd float %add72.us, %23
-  %24 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us.2 = getelementptr i8, ptr %24, i64 -8
-  %arrayidx70.us.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 2
-  %25 = load <2 x i32>, ptr %arrayidx63.us.2, align 4, !tbaa !9
-  %26 = sitofp <2 x i32> %25 to <2 x float>
-  %27 = load <2 x float>, ptr %arrayidx70.us.2, align 4, !tbaa !13
-  %28 = fmul <2 x float> %27, %26
-  %29 = extractelement <2 x float> %28, i32 0
-  %add72.us.2 = fadd float %add72.us.1, %29
-  %30 = extractelement <2 x float> %28, i32 1
-  %add72.us.3 = fadd float %add72.us.2, %30
-  %arrayidx63.us.4 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx70.us.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 4
-  %31 = load <2 x i32>, ptr %arrayidx63.us.4, align 4, !tbaa !9
-  %32 = sitofp <2 x i32> %31 to <2 x float>
-  %33 = load <2 x float>, ptr %arrayidx70.us.4, align 4, !tbaa !13
-  %34 = fmul <2 x float> %33, %32
-  %35 = extractelement <2 x float> %34, i32 0
-  %add72.us.4 = fadd float %add72.us.3, %35
-  %36 = extractelement <2 x float> %34, i32 1
-  %add72.us.5 = fadd float %add72.us.4, %36
-  %37 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %37, i64 8
-  %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
-  %38 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
-  %39 = sitofp <2 x i32> %38 to <2 x float>
-  %40 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
-  %41 = fmul <2 x float> %40, %39
-  %42 = extractelement <2 x float> %41, i32 0
-  %add72.us.6 = fadd float %add72.us.5, %42
-  %43 = extractelement <2 x float> %41, i32 1
-  %add72.us.7 = fadd float %add72.us.6, %43
-  %44 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
-  %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %44, i64 16
-  %45 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
-  %conv64.us.8 = sitofp i32 %45 to float
+  %22 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %21)
+  %23 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+  %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %23, i64 16
+  %24 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
+  %conv64.us.8 = sitofp i32 %24 to float
   %arrayidx70.us.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 8
-  %46 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
-  %mul71.us.8 = fmul float %46, %conv64.us.8
-  %add72.us.8 = fadd float %add72.us.7, %mul71.us.8
+  %25 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
+  %mul71.us.8 = fmul float %25, %conv64.us.8
+  %add72.us.8 = fadd float %22, %mul71.us.8
   %indvars.iv.next163 = add nsw i64 %indvars.iv162, 1
   %exitcond166.not = icmp eq i64 %indvars.iv.next163, 5
   br i1 %exitcond166.not, label %for.cond.cleanup50.us, label %for.cond53.preheader.us, !llvm.loop !18
 
 for.cond47.preheader.us:                          ; preds = %for.cond40.preheader.us, %for.cond.cleanup50.us
-  %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %47, %for.cond.cleanup50.us ]
-  %47 = add nuw nsw i64 %indvars.iv167, 1
+  %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %26, %for.cond.cleanup50.us ]
+  %26 = add nuw nsw i64 %indvars.iv167, 1
   br label %for.cond53.preheader.us
 
 for.cond40.for.cond.cleanup44_crit_edge.us:       ; preds = %for.cond.cleanup50.us
@@ -41756,14 +40900,14 @@
   br i1 %exitcond173.not, label %for.cond.cleanup38, label %for.cond40.preheader.us, !llvm.loop !19
 
 for.cond.cleanup38:                               ; preds = %for.cond40.for.cond.cleanup44_crit_edge.us, %for.cond34.preheader
-  call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #4
+  call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #5
   ret void
 }

```

https://github.com/llvm/llvm-project/pull/147583