[llvm] [SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands (PR #147583)
Alex Bradbury via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 10 07:49:11 PDT 2025
asb wrote:
The translation unit from the Blur microbenchmark whose generated code changed is quite small, so I have isolated it here:
tc.ll:
```llvm
; ModuleID = 'gaussianBlurKernel.bc'
source_filename = "./MicroBenchmarks/ImageProcessing/Blur/gaussianBlurKernel.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
; gaussianBlurKernel(height, width, inputImage, outputImage)
;
; Alloca-based (not yet mem2reg'd) IR for a 9x9 blur over an i32 image:
;   1. Fill the 9x9 float table %gaussianFilter, entry [x+offset][y+offset] =
;      exp(-(x*x + y*y) / s) / (pi * s) for x, y in [-offset, offset], and
;      accumulate every entry into %sum.
;   2. For each (i, j) with offset <= i < height - offset and
;      offset <= j < width - offset, accumulate
;      inputImage[(i+k)*width + (j+l)] * gaussianFilter[k+offset][l+offset]
;      over k, l in [-offset, offset] into %sum_in_current_frame, then store
;      (i32)(%sum_in_current_frame / %sum) to outputImage[i*width + j].
; The constants stored below are sigma = 9.0, s = 2 * sigma * sigma, offset = 4.
; Function Attrs: nounwind uwtable vscale_range(2,1024)
define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef %inputImage, ptr noundef %outputImage) #0 {
entry:
; Stack slots for the four parameters and every local variable.
%height.addr = alloca i32, align 4
%width.addr = alloca i32, align 4
%inputImage.addr = alloca ptr, align 8
%outputImage.addr = alloca ptr, align 8
%sigma = alloca float, align 4
%s = alloca float, align 4
%offset = alloca i32, align 4
%sum = alloca float, align 4
%gaussianFilter = alloca [9 x [9 x float]], align 4
%x = alloca i32, align 4
%cleanup.dest.slot = alloca i32, align 4
%y = alloca i32, align 4
%sum_in_current_frame = alloca float, align 4
%i = alloca i32, align 4
%j = alloca i32, align 4
%k = alloca i32, align 4
%l = alloca i32, align 4
; Spill the incoming arguments to their slots.
store i32 %height, ptr %height.addr, align 4, !tbaa !9
store i32 %width, ptr %width.addr, align 4, !tbaa !9
store ptr %inputImage, ptr %inputImage.addr, align 8, !tbaa !13
store ptr %outputImage, ptr %outputImage.addr, align 8, !tbaa !13
; Zero-extended copies of height/width. %3 is the row stride used when indexing
; inputImage and %7 the row stride used when indexing outputImage; %1 and %5
; are not referenced again in this function.
%0 = load i32, ptr %height.addr, align 4, !tbaa !9
%1 = zext i32 %0 to i64
%2 = load i32, ptr %width.addr, align 4, !tbaa !9
%3 = zext i32 %2 to i64
%4 = load i32, ptr %height.addr, align 4, !tbaa !9
%5 = zext i32 %4 to i64
%6 = load i32, ptr %width.addr, align 4, !tbaa !9
%7 = zext i32 %6 to i64
; sigma = 9.0
call void @llvm.lifetime.start.p0(i64 4, ptr %sigma) #4
store float 9.000000e+00, ptr %sigma, align 4, !tbaa !16
; s = (float)(2.0 * sigma * sigma), computed in double precision.
call void @llvm.lifetime.start.p0(i64 4, ptr %s) #4
%8 = load float, ptr %sigma, align 4, !tbaa !16
%conv = fpext float %8 to double
%mul = fmul double 2.000000e+00, %conv
%9 = load float, ptr %sigma, align 4, !tbaa !16
%conv1 = fpext float %9 to double
%mul2 = fmul double %mul, %conv1
%conv3 = fptrunc double %mul2 to float
store float %conv3, ptr %s, align 4, !tbaa !16
; offset = 4, sum = 0.0
call void @llvm.lifetime.start.p0(i64 4, ptr %offset) #4
store i32 4, ptr %offset, align 4, !tbaa !9
call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #4
store float 0.000000e+00, ptr %sum, align 4, !tbaa !16
; Zero all 324 bytes (9 * 9 * 4) of the filter table.
call void @llvm.lifetime.start.p0(i64 324, ptr %gaussianFilter) #4
call void @llvm.memset.p0.i64(ptr align 4 %gaussianFilter, i8 0, i64 324, i1 false)
; x = -offset
call void @llvm.lifetime.start.p0(i64 4, ptr %x) #4
%10 = load i32, ptr %offset, align 4, !tbaa !9
%mul4 = mul nsw i32 -1, %10
store i32 %mul4, ptr %x, align 4, !tbaa !9
br label %for.cond
; Filter-construction outer loop: continue while x <= offset.
for.cond: ; preds = %for.inc31, %entry
%11 = load i32, ptr %x, align 4, !tbaa !9
%12 = load i32, ptr %offset, align 4, !tbaa !9
%cmp = icmp sle i32 %11, %12
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
store i32 2, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %x) #4
br label %for.end33
; y = -offset
for.body: ; preds = %for.cond
call void @llvm.lifetime.start.p0(i64 4, ptr %y) #4
%13 = load i32, ptr %offset, align 4, !tbaa !9
%mul6 = mul nsw i32 -1, %13
store i32 %mul6, ptr %y, align 4, !tbaa !9
br label %for.cond7
; Filter-construction inner loop: continue while y <= offset.
for.cond7: ; preds = %for.inc, %for.body
%14 = load i32, ptr %y, align 4, !tbaa !9
%15 = load i32, ptr %offset, align 4, !tbaa !9
%cmp8 = icmp sle i32 %14, %15
br i1 %cmp8, label %for.body11, label %for.cond.cleanup10
for.cond.cleanup10: ; preds = %for.cond7
store i32 5, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %y) #4
br label %for.end
for.body11: ; preds = %for.cond7
; %sub = -(x*x + y*y)
%16 = load i32, ptr %x, align 4, !tbaa !9
%17 = load i32, ptr %x, align 4, !tbaa !9
%mul12 = mul nsw i32 %16, %17
%18 = load i32, ptr %y, align 4, !tbaa !9
%19 = load i32, ptr %y, align 4, !tbaa !9
%mul13 = mul nsw i32 %18, %19
%add = add nsw i32 %mul12, %mul13
%sub = sub nsw i32 0, %add
%conv14 = sitofp i32 %sub to float
; %call = exp((double)((float)%sub / s))
%20 = load float, ptr %s, align 4, !tbaa !16
%div = fdiv float %conv14, %20
%conv15 = fpext float %div to double
%call = call double @exp(double noundef %conv15) #4, !tbaa !9
; Divide by pi * s (0x400921FB54442D18 is the double closest to pi) and
; truncate the result to float.
%21 = load float, ptr %s, align 4, !tbaa !16
%conv16 = fpext float %21 to double
%mul17 = fmul double 0x400921FB54442D18, %conv16
%div18 = fdiv double %call, %mul17
%conv19 = fptrunc double %div18 to float
; gaussianFilter[x + offset][y + offset] = %conv19
%22 = load i32, ptr %x, align 4, !tbaa !9
%23 = load i32, ptr %offset, align 4, !tbaa !9
%add20 = add nsw i32 %22, %23
%idxprom = sext i32 %add20 to i64
%arrayidx = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom
%24 = load i32, ptr %y, align 4, !tbaa !9
%25 = load i32, ptr %offset, align 4, !tbaa !9
%add21 = add nsw i32 %24, %25
%idxprom22 = sext i32 %add21 to i64
%arrayidx23 = getelementptr inbounds [9 x float], ptr %arrayidx, i64 0, i64 %idxprom22
store float %conv19, ptr %arrayidx23, align 4, !tbaa !16
; sum += gaussianFilter[x + offset][y + offset] (the element is re-loaded
; through a freshly computed address rather than reusing %conv19).
%26 = load i32, ptr %x, align 4, !tbaa !9
%27 = load i32, ptr %offset, align 4, !tbaa !9
%add24 = add nsw i32 %26, %27
%idxprom25 = sext i32 %add24 to i64
%arrayidx26 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom25
%28 = load i32, ptr %y, align 4, !tbaa !9
%29 = load i32, ptr %offset, align 4, !tbaa !9
%add27 = add nsw i32 %28, %29
%idxprom28 = sext i32 %add27 to i64
%arrayidx29 = getelementptr inbounds [9 x float], ptr %arrayidx26, i64 0, i64 %idxprom28
%30 = load float, ptr %arrayidx29, align 4, !tbaa !16
%31 = load float, ptr %sum, align 4, !tbaa !16
%add30 = fadd float %31, %30
store float %add30, ptr %sum, align 4, !tbaa !16
br label %for.inc
; ++y
for.inc: ; preds = %for.body11
%32 = load i32, ptr %y, align 4, !tbaa !9
%inc = add nsw i32 %32, 1
store i32 %inc, ptr %y, align 4, !tbaa !9
br label %for.cond7, !llvm.loop !18
for.end: ; preds = %for.cond.cleanup10
br label %for.inc31
; ++x
for.inc31: ; preds = %for.end
%33 = load i32, ptr %x, align 4, !tbaa !9
%inc32 = add nsw i32 %33, 1
store i32 %inc32, ptr %x, align 4, !tbaa !9
br label %for.cond, !llvm.loop !20
; Filter table is complete. Start the image pass: sum_in_current_frame = 0.0,
; i = offset.
for.end33: ; preds = %for.cond.cleanup
call void @llvm.lifetime.start.p0(i64 4, ptr %sum_in_current_frame) #4
store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 4, ptr %i) #4
%34 = load i32, ptr %offset, align 4, !tbaa !9
store i32 %34, ptr %i, align 4, !tbaa !9
br label %for.cond34
; Row loop: continue while i < height - offset.
for.cond34: ; preds = %for.inc88, %for.end33
%35 = load i32, ptr %i, align 4, !tbaa !9
%36 = load i32, ptr %height.addr, align 4, !tbaa !9
%37 = load i32, ptr %offset, align 4, !tbaa !9
%sub35 = sub nsw i32 %36, %37
%cmp36 = icmp slt i32 %35, %sub35
br i1 %cmp36, label %for.body39, label %for.cond.cleanup38
for.cond.cleanup38: ; preds = %for.cond34
store i32 8, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %i) #4
br label %for.end90
; j = offset
for.body39: ; preds = %for.cond34
call void @llvm.lifetime.start.p0(i64 4, ptr %j) #4
%38 = load i32, ptr %offset, align 4, !tbaa !9
store i32 %38, ptr %j, align 4, !tbaa !9
br label %for.cond40
; Column loop: continue while j < width - offset.
for.cond40: ; preds = %for.inc85, %for.body39
%39 = load i32, ptr %j, align 4, !tbaa !9
%40 = load i32, ptr %width.addr, align 4, !tbaa !9
%41 = load i32, ptr %offset, align 4, !tbaa !9
%sub41 = sub nsw i32 %40, %41
%cmp42 = icmp slt i32 %39, %sub41
br i1 %cmp42, label %for.body45, label %for.cond.cleanup44
for.cond.cleanup44: ; preds = %for.cond40
store i32 11, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %j) #4
br label %for.end87
; Reset the per-pixel accumulator and set k = -offset.
for.body45: ; preds = %for.cond40
store float 0.000000e+00, ptr %sum_in_current_frame, align 4, !tbaa !16
call void @llvm.lifetime.start.p0(i64 4, ptr %k) #4
%42 = load i32, ptr %offset, align 4, !tbaa !9
%mul46 = mul nsw i32 -1, %42
store i32 %mul46, ptr %k, align 4, !tbaa !9
br label %for.cond47
; Window-row loop: continue while k <= offset.
for.cond47: ; preds = %for.inc76, %for.body45
%43 = load i32, ptr %k, align 4, !tbaa !9
%44 = load i32, ptr %offset, align 4, !tbaa !9
%cmp48 = icmp sle i32 %43, %44
br i1 %cmp48, label %for.body51, label %for.cond.cleanup50
for.cond.cleanup50: ; preds = %for.cond47
store i32 14, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %k) #4
br label %for.end78
; l = -offset
for.body51: ; preds = %for.cond47
call void @llvm.lifetime.start.p0(i64 4, ptr %l) #4
%45 = load i32, ptr %offset, align 4, !tbaa !9
%mul52 = mul nsw i32 -1, %45
store i32 %mul52, ptr %l, align 4, !tbaa !9
br label %for.cond53
; Window-column loop: continue while l <= offset.
for.cond53: ; preds = %for.inc73, %for.body51
%46 = load i32, ptr %l, align 4, !tbaa !9
%47 = load i32, ptr %offset, align 4, !tbaa !9
%cmp54 = icmp sle i32 %46, %47
br i1 %cmp54, label %for.body57, label %for.cond.cleanup56
for.cond.cleanup56: ; preds = %for.cond53
store i32 17, ptr %cleanup.dest.slot, align 4
call void @llvm.lifetime.end.p0(i64 4, ptr %l) #4
br label %for.end75
for.body57: ; preds = %for.cond53
; %conv64 = (float)inputImage[(i + k) * width + (j + l)]
%48 = load ptr, ptr %inputImage.addr, align 8, !tbaa !13
%49 = load i32, ptr %i, align 4, !tbaa !9
%50 = load i32, ptr %k, align 4, !tbaa !9
%add58 = add nsw i32 %49, %50
%idxprom59 = sext i32 %add58 to i64
%51 = mul nsw i64 %idxprom59, %3
%arrayidx60 = getelementptr inbounds i32, ptr %48, i64 %51
%52 = load i32, ptr %j, align 4, !tbaa !9
%53 = load i32, ptr %l, align 4, !tbaa !9
%add61 = add nsw i32 %52, %53
%idxprom62 = sext i32 %add61 to i64
%arrayidx63 = getelementptr inbounds i32, ptr %arrayidx60, i64 %idxprom62
%54 = load i32, ptr %arrayidx63, align 4, !tbaa !9
%conv64 = sitofp i32 %54 to float
; %59 = gaussianFilter[k + offset][l + offset]
%55 = load i32, ptr %k, align 4, !tbaa !9
%56 = load i32, ptr %offset, align 4, !tbaa !9
%add65 = add nsw i32 %55, %56
%idxprom66 = sext i32 %add65 to i64
%arrayidx67 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %idxprom66
%57 = load i32, ptr %l, align 4, !tbaa !9
%58 = load i32, ptr %offset, align 4, !tbaa !9
%add68 = add nsw i32 %57, %58
%idxprom69 = sext i32 %add68 to i64
%arrayidx70 = getelementptr inbounds [9 x float], ptr %arrayidx67, i64 0, i64 %idxprom69
%59 = load float, ptr %arrayidx70, align 4, !tbaa !16
; sum_in_current_frame += pixel * weight. The fmul and fadd are separate
; instructions and the accumulation is a serial chain through memory.
%mul71 = fmul float %conv64, %59
%60 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
%add72 = fadd float %60, %mul71
store float %add72, ptr %sum_in_current_frame, align 4, !tbaa !16
br label %for.inc73
; ++l
for.inc73: ; preds = %for.body57
%61 = load i32, ptr %l, align 4, !tbaa !9
%inc74 = add nsw i32 %61, 1
store i32 %inc74, ptr %l, align 4, !tbaa !9
br label %for.cond53, !llvm.loop !21
for.end75: ; preds = %for.cond.cleanup56
br label %for.inc76
; ++k
for.inc76: ; preds = %for.end75
%62 = load i32, ptr %k, align 4, !tbaa !9
%inc77 = add nsw i32 %62, 1
store i32 %inc77, ptr %k, align 4, !tbaa !9
br label %for.cond47, !llvm.loop !22
; Window finished: outputImage[i * width + j] = (i32)(sum_in_current_frame / sum)
for.end78: ; preds = %for.cond.cleanup50
%63 = load float, ptr %sum_in_current_frame, align 4, !tbaa !16
%64 = load float, ptr %sum, align 4, !tbaa !16
%div79 = fdiv float %63, %64
%conv80 = fptosi float %div79 to i32
%65 = load ptr, ptr %outputImage.addr, align 8, !tbaa !13
%66 = load i32, ptr %i, align 4, !tbaa !9
%idxprom81 = sext i32 %66 to i64
%67 = mul nsw i64 %idxprom81, %7
%arrayidx82 = getelementptr inbounds i32, ptr %65, i64 %67
%68 = load i32, ptr %j, align 4, !tbaa !9
%idxprom83 = sext i32 %68 to i64
%arrayidx84 = getelementptr inbounds i32, ptr %arrayidx82, i64 %idxprom83
store i32 %conv80, ptr %arrayidx84, align 4, !tbaa !9
br label %for.inc85
; ++j
for.inc85: ; preds = %for.end78
%69 = load i32, ptr %j, align 4, !tbaa !9
%inc86 = add nsw i32 %69, 1
store i32 %inc86, ptr %j, align 4, !tbaa !9
br label %for.cond40, !llvm.loop !23
for.end87: ; preds = %for.cond.cleanup44
br label %for.inc88
; ++i
for.inc88: ; preds = %for.end87
%70 = load i32, ptr %i, align 4, !tbaa !9
%inc89 = add nsw i32 %70, 1
store i32 %inc89, ptr %i, align 4, !tbaa !9
br label %for.cond34, !llvm.loop !24
; All rows processed: end the lifetimes of the remaining locals and return.
for.end90: ; preds = %for.cond.cleanup38
call void @llvm.lifetime.end.p0(i64 4, ptr %sum_in_current_frame) #4
call void @llvm.lifetime.end.p0(i64 324, ptr %gaussianFilter) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %offset) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %s) #4
call void @llvm.lifetime.end.p0(i64 4, ptr %sigma) #4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none)) #1
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) #2
; Function Attrs: nounwind
declare double @exp(double noundef) #3
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none)) #1
attributes #0 = { nounwind uwtable vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddl
one,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
attributes #3 = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+supm,+v,+za64rs,+zaamo,+zalrsc,+zawrs,+zba,+zbb,+zbs,+zca,+zcb,+zcd,+zcmop,+zfa,+zfhmin,+zic64b,+zicbom,+zicbop,+zicboz,+ziccamoa,+ziccif,+zicclsm,+ziccrse,+zicntr,+zicond,+zicsr,+zihintntl,+zihintpause,+zihpm,+zimop,+zkt,+zmmul,+zvbb,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvfhmin,+zvkb,+zvkt,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xandesvsintload,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscbop,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xthe
adba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-zabha,-zacas,-zama16b,-zbc,-zbkb,-zbkc,-zbkx,-zce,-zcf,-zclsd,-zcmp,-zcmt,-zdinx,-zfbfmin,-zfh,-zfinx,-zhinx,-zhinxmin,-ziccamoc,-zifencei,-zilsd,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-ztso,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
attributes #4 = { nounwind }
!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
!llvm.ident = !{!8}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"target-abi", !"lp64d"}
!2 = !{i32 6, !"riscv-isa", !3}
!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0"}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"PIE Level", i32 2}
!6 = !{i32 7, !"uwtable", i32 2}
!7 = !{i32 8, !"SmallDataLimit", i32 0}
!8 = !{!"clang version 21.0.0git"}
!9 = !{!10, !10, i64 0}
!10 = !{!"int", !11, i64 0}
!11 = !{!"omnipotent char", !12, i64 0}
!12 = !{!"Simple C/C++ TBAA"}
!13 = !{!14, !14, i64 0}
!14 = !{!"p1 int", !15, i64 0}
!15 = !{!"any pointer", !11, i64 0}
!16 = !{!17, !17, i64 0}
!17 = !{!"float", !11, i64 0}
!18 = distinct !{!18, !19}
!19 = !{!"llvm.loop.mustprogress"}
!20 = distinct !{!20, !19}
!21 = distinct !{!21, !19}
!22 = distinct !{!22, !19}
!23 = distinct !{!23, !19}
!24 = distinct !{!24, !19}
```
And build with:
```
clang --target=riscv64-linux-gnu -march=rva23u64 -O3 -ffp-contract=off tc.ll -c -S -o -
```
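Comparing the `-mllvm -debug -mllvm -print-after-all` dumps from LLVM built with and without this patch, the amount of logic dropped by SLP with the patch applied seems suspect. In the diff below, the `<2 x ...>` loads and multiplies for filter columns 0-5 and the loop-carried `%sum_in_current_frame` phi disappear, and only the column 6-7 product feeds the new `llvm.vector.reduce.fadd` before the scalar column-8 term is added.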
```diff
; *** IR Dump After SLPVectorizerPass on gaussianBlurKernel ***
; Function Attrs: nofree norecurse nounwind memory(argmem: readwrite, errnomem: write) uwtable vscale_range(2,1024)
define dso_local void @gaussianBlurKernel(i32 noundef signext %height, i32 noundef signext %width, ptr noundef readonly captures(none) %inputImage, ptr noundef writeonly captures(none) %outputImage) local_unnamed_addr #0 {
entry:
%gaussianFilter = alloca [9 x [9 x float]], align 4
- call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #4
+ call void @llvm.lifetime.start.p0(i64 324, ptr nonnull %gaussianFilter) #5
call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(324) %gaussianFilter, i8 0, i64 324, i1 false)
br label %for.cond7.preheader
@@ -41577,7 +40757,7 @@
%conv14 = sitofp i32 %3 to float
%div = fdiv float %conv14, 1.620000e+02
%conv15 = fpext float %div to double
- %call = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+ %call = tail call double @exp(double noundef %conv15) #5, !tbaa !9
%div18 = fdiv double %call, 0x407FCF0216A64912
%conv19 = fptrunc double %div18 to float
%arrayidx23 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 0
@@ -41588,7 +40768,7 @@
%conv14.1 = sitofp i32 %5 to float
%div.1 = fdiv float %conv14.1, 1.620000e+02
%conv15.1 = fpext float %div.1 to double
- %call.1 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+ %call.1 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
%div18.1 = fdiv double %call.1, 0x407FCF0216A64912
%conv19.1 = fptrunc double %div18.1 to float
%arrayidx23.1 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 1
@@ -41599,7 +40779,7 @@
%conv14.2 = sitofp i32 %7 to float
%div.2 = fdiv float %conv14.2, 1.620000e+02
%conv15.2 = fpext float %div.2 to double
- %call.2 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+ %call.2 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
%div18.2 = fdiv double %call.2, 0x407FCF0216A64912
%conv19.2 = fptrunc double %div18.2 to float
%arrayidx23.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 2
@@ -41610,7 +40790,7 @@
%conv14.3 = sitofp i32 %9 to float
%div.3 = fdiv float %conv14.3, 1.620000e+02
%conv15.3 = fpext float %div.3 to double
- %call.3 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+ %call.3 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
%div18.3 = fdiv double %call.3, 0x407FCF0216A64912
%conv19.3 = fptrunc double %div18.3 to float
%arrayidx23.3 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 3
@@ -41621,31 +40801,31 @@
%conv14.4 = sitofp i32 %11 to float
%div.4 = fdiv float %conv14.4, 1.620000e+02
%conv15.4 = fpext float %div.4 to double
- %call.4 = tail call double @exp(double noundef %conv15.4) #4, !tbaa !9
+ %call.4 = tail call double @exp(double noundef %conv15.4) #5, !tbaa !9
%div18.4 = fdiv double %call.4, 0x407FCF0216A64912
%conv19.4 = fptrunc double %div18.4 to float
%arrayidx23.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 4
store float %conv19.4, ptr %arrayidx23.4, align 4, !tbaa !13
%add30.4 = fadd float %add30.3, %conv19.4
- %call.5 = tail call double @exp(double noundef %conv15.3) #4, !tbaa !9
+ %call.5 = tail call double @exp(double noundef %conv15.3) #5, !tbaa !9
%div18.5 = fdiv double %call.5, 0x407FCF0216A64912
%conv19.5 = fptrunc double %div18.5 to float
%arrayidx23.5 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 5
store float %conv19.5, ptr %arrayidx23.5, align 4, !tbaa !13
%add30.5 = fadd float %add30.4, %conv19.5
- %call.6 = tail call double @exp(double noundef %conv15.2) #4, !tbaa !9
+ %call.6 = tail call double @exp(double noundef %conv15.2) #5, !tbaa !9
%div18.6 = fdiv double %call.6, 0x407FCF0216A64912
%conv19.6 = fptrunc double %div18.6 to float
%arrayidx23.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 6
store float %conv19.6, ptr %arrayidx23.6, align 4, !tbaa !13
%add30.6 = fadd float %add30.5, %conv19.6
- %call.7 = tail call double @exp(double noundef %conv15.1) #4, !tbaa !9
+ %call.7 = tail call double @exp(double noundef %conv15.1) #5, !tbaa !9
%div18.7 = fdiv double %call.7, 0x407FCF0216A64912
%conv19.7 = fptrunc double %div18.7 to float
%arrayidx23.7 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 7
store float %conv19.7, ptr %arrayidx23.7, align 4, !tbaa !13
%add30.7 = fadd float %add30.6, %conv19.7
- %call.8 = tail call double @exp(double noundef %conv15) #4, !tbaa !9
+ %call.8 = tail call double @exp(double noundef %conv15) #5, !tbaa !9
%div18.8 = fdiv double %call.8, 0x407FCF0216A64912
%conv19.8 = fptrunc double %div18.8 to float
%arrayidx23.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %1, i64 8
@@ -41680,74 +40860,38 @@
%conv80.us = fptosi float %div79.us to i32
%arrayidx84.us = getelementptr inbounds nuw i32, ptr %arrayidx82.us, i64 %indvars.iv167
store i32 %conv80.us, ptr %arrayidx84.us, align 4, !tbaa !9
- %exitcond169.not = icmp eq i64 %47, %wide.trip.count
+ %exitcond169.not = icmp eq i64 %26, %wide.trip.count
br i1 %exitcond169.not, label %for.cond40.for.cond.cleanup44_crit_edge.us, label %for.cond47.preheader.us, !llvm.loop !17
for.cond53.preheader.us: ; preds = %for.cond47.preheader.us, %for.cond53.preheader.us
%indvars.iv162 = phi i64 [ -4, %for.cond47.preheader.us ], [ %indvars.iv.next163, %for.cond53.preheader.us ]
- %sum_in_current_frame.0144.us = phi float [ 0.000000e+00, %for.cond47.preheader.us ], [ %add72.us.8, %for.cond53.preheader.us ]
%14 = add nsw i64 %indvars.iv162, %indvars.iv170
%15 = mul nuw nsw i64 %14, %12
%arrayidx60.us = getelementptr inbounds i32, ptr %inputImage, i64 %15
%16 = add nsw i64 %indvars.iv162, 4
- %17 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us = getelementptr i8, ptr %17, i64 -16
- %arrayidx70.us = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 0
- %18 = load <2 x i32>, ptr %arrayidx63.us, align 4, !tbaa !9
+ %17 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+ %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %17, i64 8
+ %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
+ %18 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
%19 = sitofp <2 x i32> %18 to <2 x float>
- %20 = load <2 x float>, ptr %arrayidx70.us, align 4, !tbaa !13
+ %20 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
%21 = fmul <2 x float> %20, %19
- %22 = extractelement <2 x float> %21, i32 0
- %add72.us = fadd float %sum_in_current_frame.0144.us, %22
- %23 = extractelement <2 x float> %21, i32 1
- %add72.us.1 = fadd float %add72.us, %23
- %24 = getelementptr i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us.2 = getelementptr i8, ptr %24, i64 -8
- %arrayidx70.us.2 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 2
- %25 = load <2 x i32>, ptr %arrayidx63.us.2, align 4, !tbaa !9
- %26 = sitofp <2 x i32> %25 to <2 x float>
- %27 = load <2 x float>, ptr %arrayidx70.us.2, align 4, !tbaa !13
- %28 = fmul <2 x float> %27, %26
- %29 = extractelement <2 x float> %28, i32 0
- %add72.us.2 = fadd float %add72.us.1, %29
- %30 = extractelement <2 x float> %28, i32 1
- %add72.us.3 = fadd float %add72.us.2, %30
- %arrayidx63.us.4 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx70.us.4 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 4
- %31 = load <2 x i32>, ptr %arrayidx63.us.4, align 4, !tbaa !9
- %32 = sitofp <2 x i32> %31 to <2 x float>
- %33 = load <2 x float>, ptr %arrayidx70.us.4, align 4, !tbaa !13
- %34 = fmul <2 x float> %33, %32
- %35 = extractelement <2 x float> %34, i32 0
- %add72.us.4 = fadd float %add72.us.3, %35
- %36 = extractelement <2 x float> %34, i32 1
- %add72.us.5 = fadd float %add72.us.4, %36
- %37 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us.6 = getelementptr inbounds nuw i8, ptr %37, i64 8
- %arrayidx70.us.6 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 6
- %38 = load <2 x i32>, ptr %arrayidx63.us.6, align 4, !tbaa !9
- %39 = sitofp <2 x i32> %38 to <2 x float>
- %40 = load <2 x float>, ptr %arrayidx70.us.6, align 4, !tbaa !13
- %41 = fmul <2 x float> %40, %39
- %42 = extractelement <2 x float> %41, i32 0
- %add72.us.6 = fadd float %add72.us.5, %42
- %43 = extractelement <2 x float> %41, i32 1
- %add72.us.7 = fadd float %add72.us.6, %43
- %44 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
- %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %44, i64 16
- %45 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
- %conv64.us.8 = sitofp i32 %45 to float
+ %22 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %21)
+ %23 = getelementptr inbounds nuw i32, ptr %arrayidx60.us, i64 %indvars.iv167
+ %arrayidx63.us.8 = getelementptr inbounds nuw i8, ptr %23, i64 16
+ %24 = load i32, ptr %arrayidx63.us.8, align 4, !tbaa !9
+ %conv64.us.8 = sitofp i32 %24 to float
%arrayidx70.us.8 = getelementptr inbounds [9 x [9 x float]], ptr %gaussianFilter, i64 0, i64 %16, i64 8
- %46 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
- %mul71.us.8 = fmul float %46, %conv64.us.8
- %add72.us.8 = fadd float %add72.us.7, %mul71.us.8
+ %25 = load float, ptr %arrayidx70.us.8, align 4, !tbaa !13
+ %mul71.us.8 = fmul float %25, %conv64.us.8
+ %add72.us.8 = fadd float %22, %mul71.us.8
%indvars.iv.next163 = add nsw i64 %indvars.iv162, 1
%exitcond166.not = icmp eq i64 %indvars.iv.next163, 5
br i1 %exitcond166.not, label %for.cond.cleanup50.us, label %for.cond53.preheader.us, !llvm.loop !18
for.cond47.preheader.us: ; preds = %for.cond40.preheader.us, %for.cond.cleanup50.us
- %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %47, %for.cond.cleanup50.us ]
- %47 = add nuw nsw i64 %indvars.iv167, 1
+ %indvars.iv167 = phi i64 [ 4, %for.cond40.preheader.us ], [ %26, %for.cond.cleanup50.us ]
+ %26 = add nuw nsw i64 %indvars.iv167, 1
br label %for.cond53.preheader.us
for.cond40.for.cond.cleanup44_crit_edge.us: ; preds = %for.cond.cleanup50.us
@@ -41756,14 +40900,14 @@
br i1 %exitcond173.not, label %for.cond.cleanup38, label %for.cond40.preheader.us, !llvm.loop !19
for.cond.cleanup38: ; preds = %for.cond40.for.cond.cleanup44_crit_edge.us, %for.cond34.preheader
- call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #4
+ call void @llvm.lifetime.end.p0(i64 324, ptr nonnull %gaussianFilter) #5
ret void
}
```
https://github.com/llvm/llvm-project/pull/147583
More information about the llvm-commits mailing list