[llvm] [LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. (PR #149042)
Alex Bradbury via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 16 07:29:50 PST 2025
asb wrote:
I'm not sure how useful this is, but here is a reduction of SingleSource/Benchmarks/Stanford/Oscar.c that produces the expected result without this patch and a different result with it:
```c
/* Hand-written printf declaration so the reduced case needs no headers. */
void printf(char *, ...);
/*
 * Pair of floats (presumably real/imaginary parts -- the names suggest it,
 * the code never says so). z and w are the two 256-element global arrays
 * that Fft reads and writes.
 * NOTE(review): there is no ';' after `ip`; kept exactly as posted.
 */
struct complex {
float rp, ip
} z[256], w[256];
/* File-scope scalars. None has an initializer except Oscar_s below. */
float e_1_1, e_1_0, z_0_1, zr;
/* Loop counters and state used by Fft and main; Oscar_s is the seed passed to Uniform11. */
int Printcomplex_finish, Fft_i, Fft_j, Fft_k, Fft_l, Fft_m, Fft_index, Oscar_i,
Oscar_s = 5767;
/*
 * Advances the integer state *iy and writes a float derived from it to *yfl.
 *
 * iy  - in/out state; replaced by (4855 * *iy + 1731) & 8191.
 * yfl - out; receives the new *iy divided by 8192.0f.
 */
void Uniform11(int *iy, float *yfl) {
/* '+' binds tighter than '&', so the mask applies to the whole sum. */
*iy = 4855 * *iy + 1731 & 8191;
*yfl = *iy / 8192.0f;
}
/*
 * Reduced remnant of the benchmark's Fft routine. Every loop counter is a
 * file-scope global (Fft_i, Fft_j, Fft_k, Fft_l, Fft_m, Fft_index).
 *
 * n      - element count; Fft_m is set to n / 2.
 * w      - array written by the inner loop and copied back into z
 *          (this parameter shadows the global w).
 * sqrinv - factor applied to z[1..n].rp in the final loop.
 *
 * NOTE(review): the copy loop and the final scaling loop index z[] (and w[])
 * up to n inclusive. main calls this with n == 256 while z and w have 256
 * elements, so index 256 is one past the end. Confirm the reduction did not
 * introduce undefined behaviour before relying on the output difference.
 */
void Fft(int n, struct complex w[], float sqrinv) {
Fft_m = n / 2;
Fft_l = 1;
/* Outer loop: Fft_l is doubled at the bottom and the loop repeats while Fft_l <= Fft_m. */
do {
Fft_k = 0;
Fft_j = Fft_l;
/* Fft_i is reset here, once per outer pass -- not once per middle-loop pass. */
Fft_i = 1;
/* Middle loop: advances Fft_j by Fft_l while Fft_j <= Fft_m. */
do {
/* Inner loop: reads z, writes w, and increments Fft_i until it exceeds Fft_j. */
do {
w[Fft_i + Fft_k].rp = z[Fft_i].rp + z[Fft_m + Fft_i].rp;
w[Fft_i + Fft_j].rp = z[Fft_i].rp - z[Fft_i + Fft_m].rp - e_1_1;
/* The index Fft_j does not change inside this loop, so each iteration overwrites the same element. */
w[Fft_j].ip = e_1_0 * z_0_1 * z[Fft_i].rp - z[Fft_m].rp;
Fft_i = Fft_i + 1;
} while (Fft_i <= Fft_j);
Fft_k = Fft_j;
Fft_j = Fft_j + Fft_l;
} while (Fft_j <= Fft_m);
/* Copy w[1..n] back into z[1..n]. */
Fft_index = 1;
do {
z[Fft_index] = w[Fft_index];
Fft_index = Fft_index + 1;
} while (Fft_index <= n);
Fft_l = Fft_l + Fft_l;
} while (Fft_l <= Fft_m);
/* Scale the rp field of z[1..n] by sqrinv. */
Fft_i = 1;
for (; Fft_i <= n; Fft_i++)
z[Fft_i].rp = sqrinv * z[Fft_i].rp;
}
/*
 * Driver for the reduced case: seeds z[1].rp from Uniform11, calls Fft
 * repeatedly, then prints z[1].rp. The printed value is what differs between
 * the "good" and "bad" toolchains in this report.
 */
void main() {
Oscar_i = 1;
Uniform11(&Oscar_s, &zr);
/* Oscar_i is 1 here, so this writes z[1].rp. */
z[Oscar_i].rp = 20.0f * zr - 10.0f;
/* Oscar_i runs from 1 to 20 inclusive: 20 calls to Fft on 256 elements. */
for (; Oscar_i <= 20; Oscar_i++)
Fft(256, w, 0.0625f);
/* NOTE(review): the format has two %15.3f conversions but only one argument is passed. */
printf("\n"
" %15.3f%15.3f",
z[1].rp);
/* Printcomplex_finish has no initializer and is never written in this file, so the loop body never runs. */
while (Printcomplex_finish)
;
}
```
The output:
```
./tc.good/bin/clang --target=riscv64-linux-gnu --sysroot=$HOME/rvsysroot -march=rva23u64 -O3 -o Oscar.good Oscar.i
./tc.bad/bin/clang --target=riscv64-linux-gnu --sysroot=$HOME/rvsysroot -march=rva23u64 -O3 -o Oscar.bad Oscar.i
./Oscar.good # output follows
-9.365 0.000
./Oscar.bad # output follows
-1130966548480.000 0.000
```
I also reduced the .ll so that the 'good' toolchain still prints the same answer as above while the 'bad' toolchain prints a different one:
```llvm
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
; Two-float struct; @z and @w below are 256-element arrays of it.
%struct.complex = type { float, float }
; Seed passed to @Uniform11 from @main.
@Oscar_s = global i32 5767
; Loop state used by @Fft; each is loaded from and stored to memory in the loop bodies.
@Fft_m = dso_local global i32 0
@Fft_l = global i32 0
@Fft_k = global i32 0
@Fft_j = global i32 0
@Fft_i = global i32 0
@z = global [256 x %struct.complex] zeroinitializer
@e_1_1 = dso_local global float 0.000000e+00
; @e_1_0 is not referenced by any function in this reduced module.
@e_1_0 = global float 0.000000e+00
@Fft_index = global i32 0
@Oscar_i = global i32 0
@w = global [256 x %struct.complex] zeroinitializer
; printf format used by @main: a newline followed by two %15.3f conversions.
; NOTE(review): the literal as shown is 15 bytes but the type is [16 x i8];
; whitespace may have been collapsed when the message was archived.
@.str = constant [16 x i8] c"\0A %15.3f%15.3f\00"
; Computes ((*%iy * 4855 + 1731) & 8191) as a float scaled by 1/8192 and
; stores it to %yfl.
; Unlike the C reduction, the masked integer is not stored back through %iy.
define void @Uniform11(ptr %iy, ptr %yfl) {
entry:
%0 = load i32, ptr %iy, align 4
%mul = mul i32 %0, 4855
%add = add i32 %mul, 1731
%and = and i32 %add, 8191
%conv = uitofp i32 %and to float
; 0x3F20000000000000 is 2^-13, i.e. 1/8192.
%div = fmul float %conv, 0x3F20000000000000
store float %div, ptr %yfl, align 4
ret void
}
; Reduced IR for Fft. Loop state lives in the globals @Fft_i, @Fft_j, @Fft_k,
; @Fft_l, @Fft_m and @Fft_index and is reloaded from memory at each use.
;
; Differences from the C reduction earlier in this message, as visible here:
;   - do.body stores 0 to @Fft_k, @Fft_j and @Fft_i (the C sets Fft_j = Fft_l
;     and Fft_i = 1);
;   - do.body2 contains only two stores through %w (the w[Fft_j].ip store is gone);
;   - @Fft_index starts at 0, not 1;
;   - the outer loop exit compares @Fft_l against %n, not @Fft_m;
;   - @Fft_i is not reset before the final for.cond loop.
;
; NOTE(review): do.body37 and for.cond run while the index is <= %n. @main
; passes %n = 256 and %w = @w, and @z / @w are [256 x %struct.complex], so
; index 256 is one past the end of both arrays. Confirm the reduction is
; still free of undefined behaviour.
define void @Fft(i32 %n, ptr %w, float %sqrinv) {
entry:
%div = sdiv i32 %n, 2
store i32 %div, ptr @Fft_m, align 4
store i32 1, ptr @Fft_l, align 4
br label %do.body
; Outer loop header: resets @Fft_k, @Fft_j and @Fft_i to 0.
do.body: ; preds = %do.end45, %entry
store i32 0, ptr @Fft_k, align 4
store i32 0, ptr @Fft_j, align 4
store i32 0, ptr @Fft_i, align 4
br label %do.body1
; Middle loop header (empty; falls through to the inner loop).
do.body1: ; preds = %do.end, %do.body
br label %do.body2
; Inner loop: two float stores through %w (tagged !tbaa !4) interleaved with
; loads of the i32 globals (several tagged !tbaa !0), then @Fft_i += 1.
; NOTE(review): presumably this is the loop the vectorizer change affects --
; the message does not say which loop is miscompiled.
do.body2: ; preds = %do.body2, %do.body1
; %add6 = z[Fft_i].rp + z[Fft_m + Fft_i].rp, stored to w[Fft_i + Fft_k].rp
%0 = load i32, ptr @Fft_i, align 4
%idxprom = sext i32 %0 to i64
%arrayidx = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom
%1 = load float, ptr %arrayidx, align 4
%2 = load i32, ptr @Fft_m, align 4, !tbaa !0
%3 = load i32, ptr @Fft_i, align 4
%add = add i32 %2, %3
%idxprom3 = sext i32 %add to i64
%arrayidx4 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom3
%4 = load float, ptr %arrayidx4, align 4
%add6 = fadd float %1, %4
%5 = load i32, ptr @Fft_i, align 4
%6 = load i32, ptr @Fft_k, align 4, !tbaa !0
%add7 = add i32 %5, %6
%idxprom8 = sext i32 %add7 to i64
%arrayidx9 = getelementptr %struct.complex, ptr %w, i64 %idxprom8
store float %add6, ptr %arrayidx9, align 4, !tbaa !4
; %sub18 = z[Fft_i].rp - z[Fft_i + Fft_m].rp - e_1_1, stored to w[Fft_i + Fft_j].rp
%7 = load i32, ptr @Fft_i, align 4, !tbaa !0
%idxprom11 = sext i32 %7 to i64
%arrayidx12 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom11
%8 = load float, ptr %arrayidx12, align 4
%9 = load i32, ptr @Fft_i, align 4
%10 = load i32, ptr @Fft_m, align 4, !tbaa !0
%add14 = add i32 %9, %10
%idxprom15 = sext i32 %add14 to i64
%arrayidx16 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom15
%11 = load float, ptr %arrayidx16, align 4
%sub = fsub float %8, %11
%12 = load float, ptr @e_1_1, align 4
%sub18 = fsub float %sub, %12
%13 = load i32, ptr @Fft_i, align 4
%14 = load i32, ptr @Fft_j, align 4, !tbaa !0
%add19 = add i32 %13, %14
%idxprom20 = sext i32 %add19 to i64
%arrayidx21 = getelementptr %struct.complex, ptr %w, i64 %idxprom20
store float %sub18, ptr %arrayidx21, align 4, !tbaa !4
; Increment @Fft_i and repeat while it is <= @Fft_j (signed compare).
%15 = load i32, ptr @Fft_i, align 4, !tbaa !0
%add32 = add i32 %15, 1
store i32 %add32, ptr @Fft_i, align 4
%16 = load i32, ptr @Fft_i, align 4
%17 = load i32, ptr @Fft_j, align 4, !tbaa !0
%cmp = icmp sle i32 %16, %17
br i1 %cmp, label %do.body2, label %do.end
; Middle loop latch: Fft_k = Fft_j; Fft_j += Fft_l; repeat while Fft_j <= Fft_m.
do.end: ; preds = %do.body2
%18 = load i32, ptr @Fft_j, align 4
store i32 %18, ptr @Fft_k, align 4
%19 = load i32, ptr @Fft_j, align 4
%20 = load i32, ptr @Fft_l, align 4
%add33 = add i32 %19, %20
store i32 %add33, ptr @Fft_j, align 4
%21 = load i32, ptr @Fft_j, align 4
%22 = load i32, ptr @Fft_m, align 4, !tbaa !0
%cmp35 = icmp sle i32 %21, %22
br i1 %cmp35, label %do.body1, label %do.end36
do.end36: ; preds = %do.end
store i32 0, ptr @Fft_index, align 4
br label %do.body37
; Copy loop: 8-byte memcpy of w[Fft_index] into z[Fft_index], for
; Fft_index starting at 0 and continuing while it is <= %n.
do.body37: ; preds = %do.body37, %do.end36
%23 = load i32, ptr @Fft_index, align 4
%idxprom38 = sext i32 %23 to i64
%arrayidx39 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom38
%24 = load i32, ptr @Fft_index, align 4
%idxprom40 = sext i32 %24 to i64
%arrayidx41 = getelementptr %struct.complex, ptr %w, i64 %idxprom40
call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx39, ptr %arrayidx41, i64 8, i1 false)
%25 = load i32, ptr @Fft_index, align 4
%add42 = add i32 %25, 1
store i32 %add42, ptr @Fft_index, align 4
%26 = load i32, ptr @Fft_index, align 4
%cmp44 = icmp sle i32 %26, %n
br i1 %cmp44, label %do.body37, label %do.end45
; Outer loop latch: Fft_l is doubled; repeat while Fft_l <= %n.
do.end45: ; preds = %do.body37
%27 = load i32, ptr @Fft_l, align 4
%28 = load i32, ptr @Fft_l, align 4
%add46 = add i32 %27, %28
store i32 %add46, ptr @Fft_l, align 4
%29 = load i32, ptr @Fft_l, align 4
%cmp48 = icmp sle i32 %29, %n
br i1 %cmp48, label %do.body, label %for.cond
; Final loop: while Fft_i <= %n, scale z[Fft_i].rp by %sqrinv.
; @Fft_i keeps whatever value the loops above left in it.
for.cond: ; preds = %for.body, %do.end45
%30 = load i32, ptr @Fft_i, align 4
%cmp50 = icmp sle i32 %30, %n
br i1 %cmp50, label %for.body, label %for.end
for.body: ; preds = %for.cond
%31 = load i32, ptr @Fft_i, align 4
%idxprom51 = sext i32 %31 to i64
%arrayidx52 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom51
%32 = load float, ptr %arrayidx52, align 4
%mul54 = fmul float %sqrinv, %32
%33 = load i32, ptr @Fft_i, align 4
%idxprom55 = sext i32 %33 to i64
%arrayidx56 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom55
store float %mul54, ptr %arrayidx56, align 4
%34 = load i32, ptr @Fft_i, align 4
%inc = add i32 %34, 1
store i32 %inc, ptr @Fft_i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.fmuladd.f32(float, float, float) #0
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) #1
; Driver for the reduced IR: seeds one element of @z, calls @Fft in a loop,
; then prints the float at byte offset 8 of @z (element 1, first field).
; NOTE(review): after reduction the seed index (%idxprom) and the scratch
; pointer (%zr) are main's own parameters, so they depend on what the caller
; passes -- presumably argc/argv when run as a program; verify.
define i32 @main(i64 %idxprom, ptr %zr) {
entry:
store i32 1, ptr @Oscar_i, align 4
call void @Uniform11(ptr @Oscar_s, ptr %zr)
%0 = load float, ptr %zr, align 4
; %1 = %0 * 20.0 + (-10.0)
%1 = call float @llvm.fmuladd.f32(float %0, float 2.000000e+01, float -1.000000e+01)
%arrayidx = getelementptr %struct.complex, ptr @z, i64 %idxprom
store float %1, ptr %arrayidx, align 4
br label %for.cond
; Loop while %2 < 21. %2 starts at 0 and afterwards takes @Oscar_i + 1.
for.cond: ; preds = %for.body, %entry
%2 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%cmp = icmp slt i32 %2, 21
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
call void @Fft(i32 256, ptr @w, float 6.250000e-02)
%3 = load i32, ptr @Oscar_i, align 4
%inc = add i32 %3, 1
store i32 %inc, ptr @Oscar_i, align 4
br label %for.cond
for.end: ; preds = %for.cond
%4 = load float, ptr getelementptr inbounds nuw (i8, ptr @z, i64 8), align 4
%conv = fpext float %4 to double
; NOTE(review): @.str has two %15.3f conversions but only one double is passed.
call void (ptr, ...) @printf(ptr @.str, double %conv)
ret i32 0
}
declare void @printf(ptr, ...)
attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
!0 = !{!1, !1, i64 0}
!1 = !{!"int", !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}
!4 = !{!5, !6, i64 0}
!5 = !{!"complex", !6, i64 0, !6, i64 4}
!6 = !{!"float", !2, i64 0}
```
The above .ll can be compiled with:
```sh
./tc.good/bin/clang \
--target=riscv64-linux-gnu \
--sysroot=$HOME/rvsysroot \
-march=rva23u64 \
-O3 \
reduced.ll \
-o Oscar.good
./tc.bad/bin/clang \
--target=riscv64-linux-gnu \
--sysroot=$HOME/rvsysroot \
-march=rva23u64 \
-O3 \
reduced.ll \
-o Oscar.bad
./Oscar.good # output follows
-9.365 0.000
./Oscar.bad # output follows
-70923976704.000 0.000
```
https://github.com/llvm/llvm-project/pull/149042
More information about the llvm-commits mailing list