[llvm-dev] suboptimal type isomorphy handling involving opaque structs

Wed May 9 18:21:43 PDT 2018

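In the following example, LLVM's logic for merging isomorphic types
causes two functions in different compilation units with identical
function signatures in the source code to have different signatures in
the resulting bitcode (compare the type of the second parameter, %b,
in the two function definitions: %struct.foo* in return_arg_1, but a
pointer to the opaque %struct.bar in return_arg_2):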

===========================
$ cat demo-struct1.c
struct foo { int x; };
struct bar { int x; };
struct foo *return_arg_1(struct foo *a, struct bar *b) { return a; }
$ cat demo-struct2.c
struct foo;
struct bar;
struct foo *return_arg_2(struct foo *a, struct bar *b) { return a; }
$ ~/git/foreign/llvm-build/bin/clang -c -emit-llvm demo-struct1.c &&
~/git/foreign/llvm-build/bin/clang -c -emit-llvm demo-struct2.c &&
~/git/foreign/llvm-build/bin/llvm-link demo-struct1.bc demo-struct2.bc \
-o demo-struct-linked.bc && ~/git/foreign/llvm-build/bin/llvm-dis \
demo-struct-linked.bc && cat demo-struct-linked.ll
; ModuleID = 'demo-struct-linked.bc'
source_filename = "llvm-link"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.foo = type { i32 }
%struct.bar = type opaque

; Function Attrs: noinline nounwind optnone uwtable
define dso_local %struct.foo* @return_arg_1(%struct.foo* %a,
%struct.foo* %b) #0 {
entry:
  %a.addr = alloca %struct.foo*, align 8
  %b.addr = alloca %struct.foo*, align 8
  store %struct.foo* %a, %struct.foo** %a.addr, align 8
  store %struct.foo* %b, %struct.foo** %b.addr, align 8
  %0 = load %struct.foo*, %struct.foo** %a.addr, align 8
  ret %struct.foo* %0
}

; Function Attrs: noinline nounwind optnone uwtable
define dso_local %struct.foo* @return_arg_2(%struct.foo* %a,
%struct.bar* %b) #0 {
entry:
  %a.addr = alloca %struct.foo*, align 8
  %b.addr = alloca %struct.bar*, align 8
  store %struct.foo* %a, %struct.foo** %a.addr, align 8
  store %struct.bar* %b, %struct.bar** %b.addr, align 8
  %0 = load %struct.foo*, %struct.foo** %a.addr, align 8
  ret %struct.foo* %0
}

attributes #0 = { noinline nounwind optnone uwtable
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false" "less-precise-fpmad"="false"
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false" "no-jump-tables"="false"
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
"no-trapping-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0, !0}
!llvm.module.flags = !{!1}

!0 = !{!"clang version 7.0.0 (https://git.llvm.org/git/clang.git/
d315e4713bf345fb30968dc5f1da42b641a3ca08)
(https://git.llvm.org/git/llvm.git/
3491ea57525ea6afbc4e4171fdda4a8cc2b3aef2)"}
!1 = !{i32 1, !"wchar_size", i32 4}
===========================

In practice, when merging bitcode files from Linux kernel compilation
units, this effect causes lots of duplicated types because it
propagates upwards through the type hierarchy. In a Linux kernel build
with 2256 compilation units:

$ egrep '^%struct\.[0-9a-z_]+[. ]' llvm_bitcode_linked.ll | egrep -v \
'^%struct\.anon\.' | wc -l
59984
$ egrep '^%struct\.[0-9a-z_]+ ' llvm_bitcode_linked.ll | egrep -v \
'^%struct\.anon\.' | wc -l
4463

So there are 4463 unique struct names, but 59984 named struct types.
Many structs are duplicated a few hundred times (the list below is not
exhaustive; it shows only the most frequently duplicated ones):

$ egrep '^%struct\.[0-9a-z_]+\.' llvm_bitcode_linked.ll | egrep -v \
'^%struct\.anon\.' | cut -d. -f2 | sort | uniq -c | sort -n | tail \
-n30
    462 file_lock
    462 file_lock_operations
    462 file_operations
    462 file_system_type
    462 iattr
    462 inode
    462 inode_operations
    462 kiocb
    462 lock_manager_operations
    462 mem_dqinfo
    462 mm_struct
    462 page
    462 path
    462 quotactl_ops
    462 quota_format_ops
    462 quota_format_type
    462 quota_info
    462 rw_semaphore
    462 super_block
    462 super_operations
    462 task_struct
    462 vm_area_struct
    471 bus_type
    471 class
    471 device
    471 device_driver
    471 device_type
    471 dev_pm_domain
    471 dev_pm_info
    471 dev_pm_ops

As for why the struct types are duplicated *that* often: I skimmed
IRLinker::computeTypeMapping(), and it looks to me like it only tries
to merge each new struct type against one of the existing struct types
with that name, rather than against all of them. Is that correct?

Does this qualify as a bug (because it causes functions that have the
same signature in source code to have different signatures in the
bitcode coming out of llvm-link), or should I not be doing anything
with LLVM bitcode that relies on a particular struct type in
source code always corresponding to a single struct type in bitcode
(possibly shared with other struct types that are isomorphic)?