[llvm-branch-commits] [llvm] [CodeGen][StaticDataSplitter]Support constant pool partitioning (PR #129781)

Tue Mar 25 13:48:41 PDT 2025

================
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=aarch64 -enable-split-machine-functions \
+; RUN:     -partition-static-data-sections=true -function-sections=true \
+; RUN:     -unique-section-names=false \
+; RUN:     %s -o - 2>&1 | FileCheck %s --dump-input=always
+
+; Repeat the RUN command above for big-endian systems.
+; RUN: llc -mtriple=aarch64_be -enable-split-machine-functions \
+; RUN:     -partition-static-data-sections=true -function-sections=true \
+; RUN:     -unique-section-names=false \
+; RUN:     %s -o - 2>&1 | FileCheck %s --dump-input=always
+
+; Tests that constant pool hotness is aggregated across the module. The
+; static-data-splitter processes data from cold_func first, unprofiled_func
+; second, and hot_func last. Specifically, tests that
+; - If a constant is accessed by hot functions, all constant pools for this
+;   constant (e.g., from an unprofiled function, or cold function) should have
+;   `.hot` suffix.
+; - Similarly, if a constant is accessed by both a cold function and an
+;   unprofiled function, constant pools for this constant should not have `.unlikely` suffix.
+
+; CHECK:     .section	.rodata.cst8.hot,"aM",@progbits,8
+; CHECK: .LCPI0_0:
+; CHECK:	   .xword	0x3fe5c28f5c28f5c3              // double 0.68000000000000005
+; CHECK:     .section	.rodata.cst8.unlikely,"aM",@progbits,8
+; CHECK: .LCPI0_1:
+; CHECK:     .xword 0x3fe5eb851eb851ec              // double 0.68500000000000005
+; CHECK:	   .section	.rodata.cst8,"aM",@progbits,8
+; CHECK: .LCPI0_2:
+; CHECK:     .byte   0                               // 0x0
+; CHECK:     .byte   4                               // 0x4
+; CHECK:     .byte   8                               // 0x8
+; CHECK:     .byte   12                              // 0xc
+; CHECK:     .byte   255                             // 0xff
+; CHECK:     .byte   255                             // 0xff
+; CHECK:     .byte   255                             // 0xff
+; CHECK:     .byte   255                             // 0xff
+
+; CHECK:	   .section	.rodata.cst8,"aM",@progbits,8
+; CHECK: .LCPI1_0:
+; CHECK:     .byte   0                               // 0x0
+; CHECK:     .byte   4                               // 0x4
+; CHECK:     .byte   8                               // 0x8
+; CHECK:     .byte   12                              // 0xc
+; CHECK:     .byte   255                             // 0xff
+; CHECK:     .byte   255                             // 0xff
+; CHECK:     .byte   255                             // 0xff
+; CHECK:     .byte   255                             // 0xff
+; CHECK:      .section        .rodata.cst16.hot,"aM",@progbits,16
+; CHECK: .LCPI1_1:
+; CHECK:      .word   442                             // 0x1ba
+; CHECK:      .word   100                             // 0x64
+; CHECK:      .word   0                               // 0x0
+; CHECK:      .word   0                               // 0x0
+
+; CHECK:      .section        .rodata.cst8.hot,"aM",@progbits,8
+; CHECK: .LCPI2_0:
+; CHECK:      .xword  0x3fe5c28f5c28f5c3              // double 0.68000000000000005
+; CHECK:      .section        .rodata.cst16.hot,"aM",@progbits,16
+; CHECK: .LCPI2_1:
+; CHECK:      .word   442                             // 0x1ba
+; CHECK:      .word   100                             // 0x64
+; CHECK:      .word   0                               // 0x0
+; CHECK:      .word   0                               // 0x0
+
+; CHECK:    .section	.rodata.cst32,"aM",@progbits,32
+; CHECK:    .globl	val
+
+define i32 @cold_func(double %x, <16 x i8> %a, <16 x i8> %b) !prof !16 {
+  %2 = tail call i32 (...) @func_taking_arbitrary_param(double 6.800000e-01)
+  %num = tail call i32 (...) @func_taking_arbitrary_param(double 6.8500000e-01)
+  %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = bitcast <8 x i8> %t1 to <2 x i32>
+  %3 = extractelement <2 x i32> %t2, i32 1
+  %sum = add i32 %2, %3
+  %ret = add i32 %sum, %num
+  ret i32 %ret
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>)
+declare i32 @func_taking_arbitrary_param(...)
+
+define <4 x i1> @unprofiled_func(<16 x i8> %a, <16 x i8> %b) {
+  %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
+  %t2 = bitcast <8 x i8> %t1 to <4 x i16>
+  %t3 = zext <4 x i16> %t2 to <4 x i32>
+  %cmp = icmp ule <4 x i32> <i32 442, i32 100, i32 0, i32 0>, %t3
----------------
snehasish wrote:

Can we use different values for the constant in the unprofiled func and the hot func? Using <442, 100, 0, 0> for both seems like it could lead to false positives.

https://github.com/llvm/llvm-project/pull/129781