[llvm] [RISCV] Enable rematerialization for scalar loads (PR #166774)

Thu Nov 6 20:21:37 PST 2025

================
@@ -201,3 +201,95 @@ for.end:                                          ; preds = %for.inc, %entry
 }
 
 declare i32 @foo(i32, i32, i32, i32, i32, i32)
+
+define i32 @remat_ld(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %stack0, i32 %stack1, ptr %p, ptr %q) {
+; RV32I-LABEL: remat_ld:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    .cfi_def_cfa_offset 64
+; RV32I-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    .cfi_offset s2, -16
+; RV32I-NEXT:    .cfi_offset s3, -20
+; RV32I-NEXT:    .cfi_offset s4, -24
+; RV32I-NEXT:    .cfi_offset s5, -28
+; RV32I-NEXT:    .cfi_offset s6, -32
+; RV32I-NEXT:    .cfi_offset s7, -36
+; RV32I-NEXT:    .cfi_offset s8, -40
+; RV32I-NEXT:    .cfi_offset s9, -44
+; RV32I-NEXT:    .cfi_offset s10, -48
+; RV32I-NEXT:    .cfi_offset s11, -52
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lw a2, 68(sp)
+; RV32I-NEXT:    lw a3, 64(sp)
+; RV32I-NEXT:    lw a0, 72(sp)
+; RV32I-NEXT:    lw a1, 76(sp)
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a1)
+; RV32I-NEXT:    #APP
+; RV32I-NEXT:    #NO_APP
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    beqz a0, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %falsebb
+; RV32I-NEXT:    li a0, 0
+; RV32I-NEXT:    j .LBB1_3
+; RV32I-NEXT:  .LBB1_2: # %truebb
+; RV32I-NEXT:    lw a0, 68(sp)
+; RV32I-NEXT:    lw a1, 64(sp)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:  .LBB1_3: # %falsebb
+; RV32I-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    .cfi_restore s2
+; RV32I-NEXT:    .cfi_restore s3
+; RV32I-NEXT:    .cfi_restore s4
+; RV32I-NEXT:    .cfi_restore s5
+; RV32I-NEXT:    .cfi_restore s6
+; RV32I-NEXT:    .cfi_restore s7
+; RV32I-NEXT:    .cfi_restore s8
+; RV32I-NEXT:    .cfi_restore s9
+; RV32I-NEXT:    .cfi_restore s10
+; RV32I-NEXT:    .cfi_restore s11
+; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+entry:
+  store i32 %stack0, ptr %p
+  store i32 %stack1, ptr %q
+  tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
+  %a = icmp eq i32 %0, 0
+  br i1 %a, label %truebb, label %falsebb
----------------
lukel97 wrote:

Yup, was able to simplify it to just a volatile store after the asm without branching. For some reason, the float stack arguments require a use before the asm in order to force the initial load off the stack

https://github.com/llvm/llvm-project/pull/166774