[llvm-dev] windows ABI problem with i128?
Anton Korobeynikov via llvm-dev
llvm-dev at lists.llvm.org
Thu Apr 26 00:44:39 PDT 2018
Most probably you need to properly specify the calling convention the
backend is using for calling the runtime functions, or implement a
stub for __udivti3 that performs the necessary argument lifting.
I guess there is no standard ABI document describing the intended
calling convention for this case, so I'd just do what mingw64 does and
make everything compatible with it.
On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev
<llvm-dev at lists.llvm.org> wrote:
> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
> command from the compiler-rt project:
>
> [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S
> -emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows
> -DCRT_HAS_128BIT
>
> The resulting LLVM IR is:
> =================================================================
>
> ; ModuleID = 'lib/builtins/udivti3.c'
> source_filename = "lib/builtins/udivti3.c"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64--windows-msvc19.11.0"
>
> ; Function Attrs: noinline nounwind optnone uwtable
> define i128 @__udivti3(i128, i128) #0 {
> %3 = alloca i128, align 16
> %4 = alloca i128, align 16
> store i128 %1, i128* %3, align 16
> store i128 %0, i128* %4, align 16
> %5 = load i128, i128* %3, align 16
> %6 = load i128, i128* %4, align 16
> %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
> ret i128 %7
> }
>
> declare i128 @__udivmodti4(i128, i128, i128*) #1
>
> attributes #0 = { noinline nounwind optnone uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
> "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
> "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
> "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "less-precise-fpmad"="false"
> "no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0, !1}
> !llvm.ident = !{!2}
>
> !0 = !{i32 1, !"wchar_size", i32 2}
> !1 = !{i32 7, !"PIC Level", i32 2}
> !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
>
>
> =================================================================
> However I think this results in a different ABI than LLVM will use when you
> do i128 division. For example, here is my test case (in zig code):
> =================================================================
>
> pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;
>
> export fn WinMainCRTStartup() noreturn {
> @setAlignStack(16);
> @setRuntimeSafety(false);
>
> var a: u128 = 152313999999999991610955792383;
> var b: u128 = 10000000000000000000;
> var c = a / b; // this generates a call to __udivti3
>
> if (c != b) {
> @breakpoint();
> }
> ExitProcess(0);
> }
>
> export fn __udivti3(a: u128, b: u128) u128 {
> @setRuntimeSafety(false);
> return b;
> }
>
>
> =================================================================
> This results in this LLVM IR:
> =================================================================
>
> ; ModuleID = 'test'
> source_filename = "test"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-windows-msvc"
>
> %"[]u8" = type { i8*, i64 }
> %StackTrace = type { i64, %"[]usize" }
> %"[]usize" = type { i64*, i64 }
>
> ; Function Attrs: nounwind readnone speculatable
> declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
>
> ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
> alignstack(16)
> define void @WinMainCRTStartup() #2 !dbg !41 {
> Entry:
> %a = alloca i128, align 8
> %b = alloca i128, align 8
> %c = alloca i128, align 8
> store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
> call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
> !DIExpression()), !dbg !52
> store i128 10000000000000000000, i128* %b, align 8, !dbg !53
> call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
> !DIExpression()), !dbg !53
> %0 = load i128, i128* %a, align 8, !dbg !54
> %1 = load i128, i128* %b, align 8, !dbg !55
> %2 = udiv i128 %0, %1, !dbg !56
> store i128 %2, i128* %c, align 8, !dbg !57
> call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
> !DIExpression()), !dbg !57
> %3 = load i128, i128* %c, align 8, !dbg !58
> %4 = load i128, i128* %b, align 8, !dbg !60
> %5 = icmp ne i128 %3, %4, !dbg !61
> br i1 %5, label %Then, label %Else, !dbg !61
>
> Then: ; preds = %Entry
> call void @llvm.debugtrap(), !dbg !62
> br label %EndIf, !dbg !64
>
> Else: ; preds = %Entry
> br label %EndIf, !dbg !64
>
> EndIf: ; preds = %Else, %Then
> call void @ExitProcess(i32 0), !dbg !65
> unreachable, !dbg !65
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.debugtrap() #3
>
> ; Function Attrs: nobuiltin noreturn nounwind uwtable
> declare void @ExitProcess(i32) #0
>
> ; Function Attrs: nobuiltin nounwind uwtable
> define i128 @__udivti3(i128, i128) #4 !dbg !66 {
> Entry:
> %a = alloca i128, align 8
> %b = alloca i128, align 8
> store i128 %0, i128* %a, align 8
> call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
> !DIExpression()), !dbg !73
> store i128 %1, i128* %b, align 8
> call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
> !DIExpression()), !dbg !74
> %2 = load i128, i128* %b, align 8, !dbg !75
> ret i128 %2, !dbg !78
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.stackprotector(i8*, i8**) #3
>
> attributes #0 = { nobuiltin noreturn nounwind uwtable
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #1 = { nounwind readnone speculatable }
> attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #3 = { nounwind }
> attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true"
> "no-frame-pointer-elim-non-leaf" }
>
> !llvm.module.flags = !{!0}
> !llvm.dbg.cu = !{!1}
>
> =================================================================
>
> When I link this (with link.exe or LLD, it does not matter):
> link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console
> kernel32.lib /nologo
>
> And run it, it triggers the breakpoint.
>
> Meanwhile on linux, this test passes.
>
> I suspect it may be a calling convention issue. Here is the assembly for the
> linux x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
> 10: 55 push %rbp
> 11: 48 89 e5 mov %rsp,%rbp
> 14: 48 83 ec 40 sub $0x40,%rsp
> 18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax
> 1f: 00 00 00
> 22: 48 89 45 f8 mov %rax,-0x8(%rbp)
> 26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax
> 2d: 77 73 ff
> 30: 48 89 45 f0 mov %rax,-0x10(%rbp)
> 34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax
> 3b: 23 c7 8a
> 3e: 48 89 45 e0 mov %rax,-0x20(%rbp)
> 42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp)
> 49: 00
> 4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi
> 4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi
> 52: 48 8b 55 e0 mov -0x20(%rbp),%rdx
> 56: 48 8b 4d e8 mov -0x18(%rbp),%rcx
> 5a: e8 00 00 00 00 callq 5f <_start+0x4f>
> 5f: 48 89 55 d8 mov %rdx,-0x28(%rbp)
> 63: 48 89 45 d0 mov %rax,-0x30(%rbp)
> 67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0
> 6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1
> 71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0
> 75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d
> 79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d
> 80: 44 89 45 cc mov %r8d,-0x34(%rbp)
> 84: 74 06 je 8c <_start+0x7c>
> 86: eb 00 jmp 88 <_start+0x78>
> 88: eb 00 jmp 8a <_start+0x7a>
> 8a: eb fe jmp 8a <_start+0x7a>
> 8c: eb 00 jmp 8e <_start+0x7e>
> 8e: 48 83 c4 40 add $0x40,%rsp
> 92: 5d pop %rbp
> 93: c3 retq
> 94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1)
> 9b: 00 00 00 00 00
>
> 00000000000000a0 <__udivti3>:
> a0: 55 push %rbp
> a1: 48 89 e5 mov %rsp,%rbp
> a4: 48 89 7d f0 mov %rdi,-0x10(%rbp)
> a8: 48 89 75 f8 mov %rsi,-0x8(%rbp)
> ac: 48 89 4d e8 mov %rcx,-0x18(%rbp)
> b0: 48 89 55 e0 mov %rdx,-0x20(%rbp)
> b4: 48 8b 45 e0 mov -0x20(%rbp),%rax
> b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx
> bc: 5d pop %rbp
> bd: c3 retq
>
>
> =================================================================
>
> And here is the assembly for the windows x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
> 10: 55 push %rbp
> 11: 48 81 ec 80 00 00 00 sub $0x80,%rsp
> 18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp
> 1f: 00
> 20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax
> 27: 00 00 00
> 2a: 48 89 45 f8 mov %rax,-0x8(%rbp)
> 2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax
> 35: 77 73 ff
> 38: 48 89 45 f0 mov %rax,-0x10(%rbp)
> 3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax
> 43: 23 c7 8a
> 46: 48 89 45 e0 mov %rax,-0x20(%rbp)
> 4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp)
> 51: 00
> 52: 48 8b 45 f0 mov -0x10(%rbp),%rax
> 56: 48 8b 4d f8 mov -0x8(%rbp),%rcx
> 5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx
> 5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8
> 62: 48 89 4d c8 mov %rcx,-0x38(%rbp)
> 66: 48 89 45 c0 mov %rax,-0x40(%rbp)
> 6a: 4c 89 45 b8 mov %r8,-0x48(%rbp)
> 6e: 48 89 55 b0 mov %rdx,-0x50(%rbp)
> 72: 48 8d 4d c0 lea -0x40(%rbp),%rcx
> 76: 48 8d 55 b0 lea -0x50(%rbp),%rdx
> 7a: e8 41 00 00 00 callq c0 <__udivti3>
> 7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1
> 84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp)
> 89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp)
> 8e: 0f 10 45 d0 movups -0x30(%rbp),%xmm0
> 92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1
> 96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0
> 9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d
> 9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d
> a6: 44 89 4d ac mov %r9d,-0x54(%rbp)
> aa: 74 06 je b2 <_start+0xa2>
> ac: eb 00 jmp ae <_start+0x9e>
> ae: eb 00 jmp b0 <_start+0xa0>
> b0: eb fe jmp b0 <_start+0xa0>
> b2: eb 00 jmp b4 <_start+0xa4>
> b4: 48 81 c4 80 00 00 00 add $0x80,%rsp
> bb: 5d pop %rbp
> bc: c3 retq
> bd: 90 nop
> be: 90 nop
> bf: 90 nop
>
> 00000000000000c0 <__udivti3>:
> c0: 55 push %rbp
> c1: 48 83 ec 20 sub $0x20,%rsp
> c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp
> ca: 48 89 4d f0 mov %rcx,-0x10(%rbp)
> ce: 48 89 55 f8 mov %rdx,-0x8(%rbp)
> d2: 4c 89 4d e8 mov %r9,-0x18(%rbp)
> d6: 4c 89 45 e0 mov %r8,-0x20(%rbp)
> da: 48 8b 45 e0 mov -0x20(%rbp),%rax
> de: 48 8b 55 e8 mov -0x18(%rbp),%rdx
> e2: 48 83 c4 20 add $0x20,%rsp
> e6: 5d pop %rbp
> e7: c3 retq
>
> =================================================================
>
>
> Finally, my question:
>
> What is the correct LLVM IR to represent i128 values so that it will be
> compatible with the compiler-rt calls that LLVM generates? For example, what
> should be the LLVM IR definition of __udivti3?
>
> I ask because, even though the clang/compiler-rt project generates `define
> i128 @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on
> Windows.
>
> Thanks,
> Andrew
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
--
With best regards, Anton Korobeynikov
Department of Statistical Modelling, Saint Petersburg State University
More information about the llvm-dev
mailing list