[llvm-dev] suboptimal type isomorphy handling involving opaque structs
Jann Horn via llvm-dev
llvm-dev at lists.llvm.org
Wed May 9 18:21:43 PDT 2018
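In the following example, LLVM's logic for merging isomorphic types
causes two functions in different compilation units with identical
function signatures in the source code to have different signatures in
the resulting bitcode (the second parameter of return_arg_1 becomes
%struct.foo*, while that of return_arg_2 stays %struct.bar*):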
===========================
$ cat demo-struct1.c
struct foo { int x; };
struct bar { int x; };
struct foo *return_arg_1(struct foo *a, struct bar *b) { return a; }
$ cat demo-struct2.c
struct foo;
struct bar;
struct foo *return_arg_2(struct foo *a, struct bar *b) { return a; }
$ ~/git/foreign/llvm-build/bin/clang -c -emit-llvm demo-struct1.c &&
~/git/foreign/llvm-build/bin/clang -c -emit-llvm demo-struct2.c &&
~/git/foreign/llvm-build/bin/llvm-link demo-struct1.bc demo-struct2.bc
-o demo-struct-linked.bc && ~/git/foreign/llvm-build/bin/llvm-dis
demo-struct-linked.bc && cat demo-struct-linked.ll
; ModuleID = 'demo-struct-linked.bc'
source_filename = "llvm-link"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.foo = type { i32 }
%struct.bar = type opaque
; Function Attrs: noinline nounwind optnone uwtable
define dso_local %struct.foo* @return_arg_1(%struct.foo* %a,
%struct.foo* %b) #0 {
entry:
%a.addr = alloca %struct.foo*, align 8
%b.addr = alloca %struct.foo*, align 8
store %struct.foo* %a, %struct.foo** %a.addr, align 8
store %struct.foo* %b, %struct.foo** %b.addr, align 8
%0 = load %struct.foo*, %struct.foo** %a.addr, align 8
ret %struct.foo* %0
}
; Function Attrs: noinline nounwind optnone uwtable
define dso_local %struct.foo* @return_arg_2(%struct.foo* %a,
%struct.bar* %b) #0 {
entry:
%a.addr = alloca %struct.foo*, align 8
%b.addr = alloca %struct.bar*, align 8
store %struct.foo* %a, %struct.foo** %a.addr, align 8
store %struct.bar* %b, %struct.bar** %b.addr, align 8
%0 = load %struct.foo*, %struct.foo** %a.addr, align 8
ret %struct.foo* %0
}
attributes #0 = { noinline nounwind optnone uwtable
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false" "less-precise-fpmad"="false"
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false" "no-jump-tables"="false"
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
"no-trapping-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0, !0}
!llvm.module.flags = !{!1}
!0 = !{!"clang version 7.0.0 (https://git.llvm.org/git/clang.git/
d315e4713bf345fb30968dc5f1da42b641a3ca08)
(https://git.llvm.org/git/llvm.git/
3491ea57525ea6afbc4e4171fdda4a8cc2b3aef2)"}
!1 = !{i32 1, !"wchar_size", i32 4}
===========================
In practice, when merging bitcode files from Linux kernel compilation
units, this effect causes lots of duplicated types because it
propagates upwards through the type hierarchy. In a Linux kernel build
with 2256 compilation units:
$ egrep '^%struct\.[0-9a-z_]+[. ]' llvm_bitcode_linked.ll | egrep -v
'^%struct\.anon\.' | wc -l
59984
$ egrep '^%struct\.[0-9a-z_]+ ' llvm_bitcode_linked.ll | egrep -v
'^%struct\.anon\.' | wc -l
4463
So there are 4463 unique struct names, but 59984 named struct types.
Many structs are duplicated a few hundred times (these aren't all,
just the most common ones):
$ egrep '^%struct\.[0-9a-z_]+\.' llvm_bitcode_linked.ll | egrep -v
'^%struct\.anon\.' | cut -d. -f2 | sort | uniq -c | sort -n | tail
-n30
462 file_lock
462 file_lock_operations
462 file_operations
462 file_system_type
462 iattr
462 inode
462 inode_operations
462 kiocb
462 lock_manager_operations
462 mem_dqinfo
462 mm_struct
462 page
462 path
462 quotactl_ops
462 quota_format_ops
462 quota_format_type
462 quota_info
462 rw_semaphore
462 super_block
462 super_operations
462 task_struct
462 vm_area_struct
471 bus_type
471 class
471 device
471 device_driver
471 device_type
471 dev_pm_domain
471 dev_pm_info
471 dev_pm_ops
As for why the struct types are duplicated *that* often: I skimmed
IRLinker::computeTypeMapping(), and it looks to me like it only tries
to merge a new struct type against one of the existing structs with
that name. Is that correct?
Does this qualify as a bug (because it causes functions that have the
same signature in source code to have different signatures in the
bitcode coming out of llvm-link), or should I not be doing anything
with LLVM bitcode that relies on a particular struct type in
source code always corresponding to a single struct type in bitcode
(possibly shared with other struct types that are isomorphic)?
More information about the llvm-dev mailing list