[llvm] [LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. (PR #149042)

Sun Nov 16 07:29:50 PST 2025

asb wrote:

I'm not sure it's overly useful, but here is a reduction of SingleSource/Benchmarks/Stanford/Oscar.c that produces the expected result without this patch, and a different result with it:

```c
/* Hand-written prototype; no header is included in this snippet. */
void printf(char *, ...);
/* Record of two floats, rp and ip. z and w are file-scope arrays holding 256
   of them each.
   NOTE(review): there is no ';' after the member list `float rp, ip`; left
   untouched because this is the reduced reproducer. */
struct complex {
  float rp, ip
} z[256], w[256];
/* File-scope floats declared without initializers. */
float e_1_1, e_1_0, z_0_1, zr;
/* File-scope ints used as loop/state variables by Fft and main. Only Oscar_s
   carries the initializer 5767; the other names in this declaration have
   none. */
int Printcomplex_finish, Fft_i, Fft_j, Fft_k, Fft_l, Fft_m, Fft_index, Oscar_i,
    Oscar_s = 5767;
/* Replaces the integer state *iy with (4855 * *iy + 1731) & 8191, then stores
   that new state divided by 8192.0f into *yfl.
     iy  - in/out integer state.
     yfl - out: the updated state scaled by 1/8192 as a float. */
void Uniform11(int *iy, float *yfl) {
  /* `+` binds tighter than `&`, so the mask is applied to the whole sum. */
  *iy = 4855 * *iy + 1731 & 8191;
  *yfl = *iy / 8192.0f;
}
/* Reduced kernel that combines elements of the file-scope array z into the
   array passed as w, copies w back into z, and finally scales z[i].rp.
   All loop state lives in the file-scope Fft_* variables, not in locals.

     n      - inclusive upper index for the copy-back and scaling loops;
              Fft_m is set to n / 2.
     w      - array written by the inner loop and read by the copy-back loop.
              Inside this function the parameter shadows the file-scope w.
     sqrinv - factor multiplied into z[i].rp in the final loop.

   NOTE(review): the copy-back and scaling loops run while the index is <= n.
   z and w are declared with 256 elements and main calls Fft(256, w, ...), so
   index 256 is one past the end of both arrays -- confirm the reduction did
   not introduce out-of-bounds accesses. */
void Fft(int n, struct complex w[], float sqrinv) {
  Fft_m = n / 2;
  Fft_l = 1;
  /* Outer loop: Fft_l doubles each pass and the loop repeats while
     Fft_l <= Fft_m. */
  do {
    Fft_k = 0;
    Fft_j = Fft_l;
    /* Fft_i is set once here and is NOT reset by the middle loop, so each
       run of the inner loop resumes from the previous Fft_i. */
    Fft_i = 1;
    /* Middle loop: Fft_k takes the old Fft_j, Fft_j advances by Fft_l, and the
       loop repeats while Fft_j <= Fft_m. */
    do {
      /* Inner loop: runs while Fft_i <= Fft_j, writing two .rp fields and one
         .ip field of w from values read out of z. */
      do {
        w[Fft_i + Fft_k].rp = z[Fft_i].rp + z[Fft_m + Fft_i].rp;
        w[Fft_i + Fft_j].rp = z[Fft_i].rp - z[Fft_i + Fft_m].rp - e_1_1;
        w[Fft_j].ip = e_1_0 * z_0_1 * z[Fft_i].rp - z[Fft_m].rp;
        Fft_i = Fft_i + 1;
      } while (Fft_i <= Fft_j);
      Fft_k = Fft_j;
      Fft_j = Fft_j + Fft_l;
    } while (Fft_j <= Fft_m);
    /* Copy w[1..n] back into z[1..n], one whole struct per iteration. */
    Fft_index = 1;
    do {
      z[Fft_index] = w[Fft_index];
      Fft_index = Fft_index + 1;
    } while (Fft_index <= n);
    Fft_l = Fft_l + Fft_l;
  } while (Fft_l <= Fft_m);
  /* Scale z[1..n].rp by sqrinv. */
  Fft_i = 1;
  for (; Fft_i <= n; Fft_i++)
    z[Fft_i].rp = sqrinv * z[Fft_i].rp;
}
/* Driver for the reproducer: seeds z[1].rp from Uniform11, calls Fft once for
   each Oscar_i in 1..20, then prints z[1].rp.
   NOTE(review): declared `void main()` rather than returning int; left
   untouched because this is the reduced reproducer. */
void main() {
  Oscar_i = 1;
  /* Advances Oscar_s and writes the scaled value into zr. */
  Uniform11(&Oscar_s, &zr);
  z[Oscar_i].rp = 20.0f * zr - 10.0f;
  for (; Oscar_i <= 20; Oscar_i++)
    Fft(256, w, 0.0625f);
  /* NOTE(review): the format string has two %15.3f conversions but only one
     value (z[1].rp) is passed, so the second printed number does not come
     from an argument supplied here. */
  printf("\n"
         "  %15.3f%15.3f",
         z[1].rp);
  /* Printcomplex_finish is never assigned anywhere in the code shown. */
  while (Printcomplex_finish)
    ;
}

```

The output:
```
./tc.good/bin/clang   --target=riscv64-linux-gnu   --sysroot=$HOME/rvsysroot   -march=rva23u64 -O3   -o Oscar.good Oscar.i
./tc.bad/bin/clang   --target=riscv64-linux-gnu   --sysroot=$HOME/rvsysroot   -march=rva23u64 -O3   -o Oscar.bad Oscar.i
 ./Oscar.good # output follows

           -9.365          0.000
./Oscar.bad # output follows

  -1130966548480.000          0.000
```

I also reduced the .ll so that it still produces the expected answer with the 'good' toolchain and a different one with the 'bad' toolchain:

```llvm
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"

; Two-float record; each element is 8 bytes.
%struct.complex = type { float, float }

; Integer state handed to @Uniform11 by @main.
@Oscar_s = global i32 5767
; Loop/state variables read and written by @Fft.
@Fft_m = dso_local global i32 0
@Fft_l = global i32 0
@Fft_k = global i32 0
@Fft_j = global i32 0
@Fft_i = global i32 0
; 256-element array read and written by @Fft and @main.
@z = global [256 x %struct.complex] zeroinitializer
@e_1_1 = dso_local global float 0.000000e+00
; NOTE(review): @e_1_0 is not referenced by any function in this reduced module.
@e_1_0 = global float 0.000000e+00
@Fft_index = global i32 0
@Oscar_i = global i32 0
; 256-element array passed to @Fft as %w by @main.
@w = global [256 x %struct.complex] zeroinitializer
; printf format: newline, two spaces, then two %15.3f conversions.
@.str = constant [16 x i8] c"\0A  %15.3f%15.3f\00"

; Loads the i32 at %iy, computes (x * 4855 + 1731) & 8191, converts the result
; to float and stores it, scaled by 2^-13 (i.e. 1/8192), through %yfl.
; Unlike the C reduction, this version does not store the new state back
; through %iy.
define void @Uniform11(ptr %iy, ptr %yfl) {
entry:
  %0 = load i32, ptr %iy, align 4
  %mul = mul i32 %0, 4855
  %add = add i32 %mul, 1731
  %and = and i32 %add, 8191
  ; The masked value is non-negative, so it is converted as unsigned.
  %conv = uitofp i32 %and to float
  ; 0x3F20000000000000 is the hexadecimal encoding of 2^-13.
  %div = fmul float %conv, 0x3F20000000000000
  store float %div, ptr %yfl, align 4
  ret void
}

; Reduced kernel. Keeps all loop state in the @Fft_* globals, writes the first
; float of elements of %w from values read out of @z, copies elements of %w
; back into @z, and finally scales the first float of elements of @z by
; %sqrinv.
;
;   %n      - inclusive upper bound for the copy-back loop, the outer loop and
;             the final scaling loop; @Fft_m is set to %n / 2.
;   %w      - array of %struct.complex written by do.body2 and read by
;             do.body37.
;   %sqrinv - factor applied in for.body.
;
; NOTE(review): the `icmp sle ... %n` loops let the index reach %n itself.
; @z and @w are [256 x %struct.complex] and @main passes 256 and @w, so index
; 256 is one element past the end of both arrays -- confirm the reduction did
; not introduce out-of-bounds accesses.
define void @Fft(i32 %n, ptr %w, float %sqrinv) {
entry:
  %div = sdiv i32 %n, 2
  store i32 %div, ptr @Fft_m, align 4
  store i32 1, ptr @Fft_l, align 4
  br label %do.body

; Outer loop header: resets Fft_k, Fft_j and Fft_i to 0.
do.body:                                          ; preds = %do.end45, %entry
  store i32 0, ptr @Fft_k, align 4
  store i32 0, ptr @Fft_j, align 4
  store i32 0, ptr @Fft_i, align 4
  br label %do.body1

; Middle loop header; it only forwards to the inner loop.
do.body1:                                         ; preds = %do.end, %do.body
  br label %do.body2

; Inner loop body. Stores
;   z[Fft_i] + z[Fft_m + Fft_i]            into w[Fft_i + Fft_k]
;   z[Fft_i] - z[Fft_i + Fft_m] - e_1_1    into w[Fft_i + Fft_j]
; (first float of each element), then increments Fft_i and repeats while
; Fft_i <= Fft_j (signed compare).
do.body2:                                         ; preds = %do.body2, %do.body1
  %0 = load i32, ptr @Fft_i, align 4
  %idxprom = sext i32 %0 to i64
  %arrayidx = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom
  %1 = load float, ptr %arrayidx, align 4
  %2 = load i32, ptr @Fft_m, align 4, !tbaa !0
  %3 = load i32, ptr @Fft_i, align 4
  %add = add i32 %2, %3
  %idxprom3 = sext i32 %add to i64
  %arrayidx4 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom3
  %4 = load float, ptr %arrayidx4, align 4
  %add6 = fadd float %1, %4
  %5 = load i32, ptr @Fft_i, align 4
  %6 = load i32, ptr @Fft_k, align 4, !tbaa !0
  %add7 = add i32 %5, %6
  %idxprom8 = sext i32 %add7 to i64
  %arrayidx9 = getelementptr %struct.complex, ptr %w, i64 %idxprom8
  store float %add6, ptr %arrayidx9, align 4, !tbaa !4
  %7 = load i32, ptr @Fft_i, align 4, !tbaa !0
  %idxprom11 = sext i32 %7 to i64
  %arrayidx12 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom11
  %8 = load float, ptr %arrayidx12, align 4
  %9 = load i32, ptr @Fft_i, align 4
  %10 = load i32, ptr @Fft_m, align 4, !tbaa !0
  %add14 = add i32 %9, %10
  %idxprom15 = sext i32 %add14 to i64
  %arrayidx16 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom15
  %11 = load float, ptr %arrayidx16, align 4
  %sub = fsub float %8, %11
  %12 = load float, ptr @e_1_1, align 4
  %sub18 = fsub float %sub, %12
  %13 = load i32, ptr @Fft_i, align 4
  %14 = load i32, ptr @Fft_j, align 4, !tbaa !0
  %add19 = add i32 %13, %14
  %idxprom20 = sext i32 %add19 to i64
  %arrayidx21 = getelementptr %struct.complex, ptr %w, i64 %idxprom20
  store float %sub18, ptr %arrayidx21, align 4, !tbaa !4
  %15 = load i32, ptr @Fft_i, align 4, !tbaa !0
  %add32 = add i32 %15, 1
  store i32 %add32, ptr @Fft_i, align 4
  %16 = load i32, ptr @Fft_i, align 4
  %17 = load i32, ptr @Fft_j, align 4, !tbaa !0
  %cmp = icmp sle i32 %16, %17
  br i1 %cmp, label %do.body2, label %do.end

; Middle loop latch: Fft_k = Fft_j; Fft_j += Fft_l; repeat while
; Fft_j <= Fft_m.
do.end:                                           ; preds = %do.body2
  %18 = load i32, ptr @Fft_j, align 4
  store i32 %18, ptr @Fft_k, align 4
  %19 = load i32, ptr @Fft_j, align 4
  %20 = load i32, ptr @Fft_l, align 4
  %add33 = add i32 %19, %20
  store i32 %add33, ptr @Fft_j, align 4
  %21 = load i32, ptr @Fft_j, align 4
  %22 = load i32, ptr @Fft_m, align 4, !tbaa !0
  %cmp35 = icmp sle i32 %21, %22
  br i1 %cmp35, label %do.body1, label %do.end36

; Copy-back loop preheader: Fft_index starts at 0.
do.end36:                                         ; preds = %do.end
  store i32 0, ptr @Fft_index, align 4
  br label %do.body37

; Copy-back loop: copies one 8-byte element from w[Fft_index] to
; z[Fft_index], increments Fft_index and repeats while Fft_index <= %n.
do.body37:                                        ; preds = %do.body37, %do.end36
  %23 = load i32, ptr @Fft_index, align 4
  %idxprom38 = sext i32 %23 to i64
  %arrayidx39 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom38
  %24 = load i32, ptr @Fft_index, align 4
  %idxprom40 = sext i32 %24 to i64
  %arrayidx41 = getelementptr %struct.complex, ptr %w, i64 %idxprom40
  call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx39, ptr %arrayidx41, i64 8, i1 false)
  %25 = load i32, ptr @Fft_index, align 4
  %add42 = add i32 %25, 1
  store i32 %add42, ptr @Fft_index, align 4
  %26 = load i32, ptr @Fft_index, align 4
  %cmp44 = icmp sle i32 %26, %n
  br i1 %cmp44, label %do.body37, label %do.end45

; Outer loop latch: doubles Fft_l and repeats while Fft_l <= %n.
do.end45:                                         ; preds = %do.body37
  %27 = load i32, ptr @Fft_l, align 4
  %28 = load i32, ptr @Fft_l, align 4
  %add46 = add i32 %27, %28
  store i32 %add46, ptr @Fft_l, align 4
  %29 = load i32, ptr @Fft_l, align 4
  %cmp48 = icmp sle i32 %29, %n
  br i1 %cmp48, label %do.body, label %for.cond

; Final loop: continues from whatever value Fft_i holds on exit from the outer
; loop and runs while Fft_i <= %n.
for.cond:                                         ; preds = %for.body, %do.end45
  %30 = load i32, ptr @Fft_i, align 4
  %cmp50 = icmp sle i32 %30, %n
  br i1 %cmp50, label %for.body, label %for.end

; Multiplies the first float of z[Fft_i] by %sqrinv in place, then increments
; Fft_i.
for.body:                                         ; preds = %for.cond
  %31 = load i32, ptr @Fft_i, align 4
  %idxprom51 = sext i32 %31 to i64
  %arrayidx52 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom51
  %32 = load float, ptr %arrayidx52, align 4
  %mul54 = fmul float %sqrinv, %32
  %33 = load i32, ptr @Fft_i, align 4
  %idxprom55 = sext i32 %33 to i64
  %arrayidx56 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom55
  store float %mul54, ptr %arrayidx56, align 4
  %34 = load i32, ptr @Fft_i, align 4
  %inc = add i32 %34, 1
  store i32 %inc, ptr @Fft_i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; Intrinsic called once from @main as fmuladd(%0, 20.0, -10.0).
; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.fmuladd.f32(float, float, float) #0

; Intrinsic called from @Fft's do.body37 with a constant length of 8 bytes.
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) #1

; Driver for the reduced reproducer: seeds one element of @z, calls @Fft
; repeatedly, then prints the float at byte offset 8 of @z.
;
; NOTE(review): @main takes (i64 %idxprom, ptr %zr) instead of the usual
; (int, char **) -- presumably an artifact of the reduction. When run as the
; program entry point these hold whatever the C runtime passes as its first
; two arguments; %zr is written through and %idxprom selects the @z element
; that gets seeded.
define i32 @main(i64 %idxprom, ptr %zr) {
entry:
  store i32 1, ptr @Oscar_i, align 4
  call void @Uniform11(ptr @Oscar_s, ptr %zr)
  %0 = load float, ptr %zr, align 4
  ; Seed value: %0 * 20.0 + (-10.0).
  %1 = call float @llvm.fmuladd.f32(float %0, float 2.000000e+01, float -1.000000e+01)
  %arrayidx = getelementptr %struct.complex, ptr @z, i64 %idxprom
  store float %1, ptr %arrayidx, align 4
  br label %for.cond

; Loop header: %2 is 0 on entry and afterwards the incremented @Oscar_i value;
; the loop runs while %2 < 21.
for.cond:                                         ; preds = %for.body, %entry
  %2 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %cmp = icmp slt i32 %2, 21
  br i1 %cmp, label %for.body, label %for.end

; Loop body: one @Fft call over @w with n = 256 and sqrinv = 0.0625, then
; Oscar_i is incremented in memory.
for.body:                                         ; preds = %for.cond
  call void @Fft(i32 256, ptr @w, float 6.250000e-02)
  %3 = load i32, ptr @Oscar_i, align 4
  %inc = add i32 %3, 1
  store i32 %inc, ptr @Oscar_i, align 4
  br label %for.cond

; Byte offset 8 into @z is the first float of element 1 (each element is two
; floats).
; NOTE(review): @.str contains two %15.3f conversions but only one double is
; passed to printf.
for.end:                                          ; preds = %for.cond
  %4 = load float, ptr getelementptr inbounds nuw (i8, ptr @z, i64 8), align 4
  %conv = fpext float %4 to double
  call void (ptr, ...) @printf(ptr @.str, double %conv)
  ret i32 0
}

; Declared here with a void return type; called once from @main.
declare void @printf(ptr, ...)

; Attribute groups: #0 is attached to @llvm.fmuladd.f32 and #1 to
; @llvm.memcpy.p0.p0.i64.
attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

; TBAA metadata. !0 is the access tag on some of the i32 loads in @Fft ("int");
; !4 is the access tag on the two float stores through %w ("complex", offset 0).
!0 = !{!1, !1, i64 0}
!1 = !{!"int", !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}
!4 = !{!5, !6, i64 0}
!5 = !{!"complex", !6, i64 0, !6, i64 4}
!6 = !{!"float", !2, i64 0}

```

The above .ll can be compiled with:
```sh
./tc.good/bin/clang \
  --target=riscv64-linux-gnu \
  --sysroot=$HOME/rvsysroot \
  -march=rva23u64 \
  -O3 \
  reduced.ll \
  -o Oscar.good
./tc.bad/bin/clang \
  --target=riscv64-linux-gnu \
  --sysroot=$HOME/rvsysroot \
  -march=rva23u64 \
  -O3 \
  reduced.ll \
  -o Oscar.bad
 ./Oscar.good # output follows

           -9.365          0.000
 ./Oscar.bad # output follows

  -70923976704.000          0.000
```

https://github.com/llvm/llvm-project/pull/149042