[llvm-dev] windows ABI problem with i128?

Thu Apr 26 00:44:39 PDT 2018

Most probably you need to properly specify the calling convention that the
backend uses when calling the runtime functions, or implement a stub for
__udivti3 that performs the necessary argument lifting.

I guess there is no standard ABI document describing the intended
calling convention for i128 arguments on Windows, so I'd just do what
mingw64 does and make everything compatible with it.

On Thu, Apr 26, 2018 at 4:44 AM, Andrew Kelley via llvm-dev
<llvm-dev at lists.llvm.org> wrote:
> I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
> command from the compiler-rt project:
>
> [nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib  -S
> -emit-llvm lib/builtins/udivti3.c  -g -target x86_64-windows
> -DCRT_HAS_128BIT
>
> The resulting LLVM IR is:
> =================================================================
>
> ; ModuleID = 'lib/builtins/udivti3.c'
> source_filename = "lib/builtins/udivti3.c"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64--windows-msvc19.11.0"
>
> ; Function Attrs: noinline nounwind optnone uwtable
> define i128 @__udivti3(i128, i128) #0 {
>   %3 = alloca i128, align 16
>   %4 = alloca i128, align 16
>   store i128 %1, i128* %3, align 16
>   store i128 %0, i128* %4, align 16
>   %5 = load i128, i128* %3, align 16
>   %6 = load i128, i128* %4, align 16
>   %7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
>   ret i128 %7
> }
>
> declare i128 @__udivmodti4(i128, i128, i128*) #1
>
> attributes #0 = { noinline nounwind optnone uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
> "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
> "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
> "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "less-precise-fpmad"="false"
> "no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0, !1}
> !llvm.ident = !{!2}
>
> !0 = !{i32 1, !"wchar_size", i32 2}
> !1 = !{i32 7, !"PIC Level", i32 2}
> !2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
>
>
> =================================================================
> However, I think this results in a different ABI than the one LLVM uses when
> you do an i128 division. For example, here is my test case (in zig code):
> =================================================================
>
> pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;
>
> export fn WinMainCRTStartup() noreturn {
>     @setAlignStack(16);
>     @setRuntimeSafety(false);
>
>     var a: u128 = 152313999999999991610955792383;
>     var b: u128 = 10000000000000000000;
>     var c = a / b; // this generates a call to __udivti3
>
>     if (c != b) {
>         @breakpoint();
>     }
>     ExitProcess(0);
> }
>
> export fn __udivti3(a: u128, b: u128) u128 {
>     @setRuntimeSafety(false);
>     return b;
> }
>
>
> =================================================================
> This results in this LLVM IR:
> =================================================================
>
> ; ModuleID = 'test'
> source_filename = "test"
> target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-windows-msvc"
>
> %"[]u8" = type { i8*, i64 }
> %StackTrace = type { i64, %"[]usize" }
> %"[]usize" = type { i64*, i64 }
>
> ; Function Attrs: nounwind readnone speculatable
> declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
>
> ; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
> alignstack(16)
> define void @WinMainCRTStartup() #2 !dbg !41 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   %c = alloca i128, align 8
>   store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
> !DIExpression()), !dbg !52
>   store i128 10000000000000000000, i128* %b, align 8, !dbg !53
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
> !DIExpression()), !dbg !53
>   %0 = load i128, i128* %a, align 8, !dbg !54
>   %1 = load i128, i128* %b, align 8, !dbg !55
>   %2 = udiv i128 %0, %1, !dbg !56
>   store i128 %2, i128* %c, align 8, !dbg !57
>   call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
> !DIExpression()), !dbg !57
>   %3 = load i128, i128* %c, align 8, !dbg !58
>   %4 = load i128, i128* %b, align 8, !dbg !60
>   %5 = icmp ne i128 %3, %4, !dbg !61
>   br i1 %5, label %Then, label %Else, !dbg !61
>
> Then:                                             ; preds = %Entry
>   call void @llvm.debugtrap(), !dbg !62
>   br label %EndIf, !dbg !64
>
> Else:                                             ; preds = %Entry
>   br label %EndIf, !dbg !64
>
> EndIf:                                            ; preds = %Else, %Then
>   call void @ExitProcess(i32 0), !dbg !65
>   unreachable, !dbg !65
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.debugtrap() #3
>
> ; Function Attrs: nobuiltin noreturn nounwind uwtable
> declare void @ExitProcess(i32) #0
>
> ; Function Attrs: nobuiltin nounwind uwtable
> define i128 @__udivti3(i128, i128) #4 !dbg !66 {
> Entry:
>   %a = alloca i128, align 8
>   %b = alloca i128, align 8
>   store i128 %0, i128* %a, align 8
>   call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
> !DIExpression()), !dbg !73
>   store i128 %1, i128* %b, align 8
>   call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
> !DIExpression()), !dbg !74
>   %2 = load i128, i128* %b, align 8, !dbg !75
>   ret i128 %2, !dbg !78
> }
>
> ; Function Attrs: nounwind
> declare void @llvm.stackprotector(i8*, i8**) #3
>
> attributes #0 = { nobuiltin noreturn nounwind uwtable
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #1 = { nounwind readnone speculatable }
> attributes #2 = { nobuiltin noinline noreturn nounwind uwtable alignstack=16
> "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
> attributes #3 = { nounwind }
> attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true"
> "no-frame-pointer-elim-non-leaf" }
>
> !llvm.module.flags = !{!0}
> !llvm.dbg.cu = !{!1}
>
> =================================================================
>
> When I link this (with link.exe or LLD, it does not matter):
> link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console
> kernel32.lib /nologo
>
> And run it, it triggers the breakpoint.
>
> Meanwhile on linux, this test passes.
>
> I suspect it may be a calling convention issue. Here is the assembly for the
> linux x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 89 e5                 mov    %rsp,%rbp
>   14:    48 83 ec 40              sub    $0x40,%rsp
>   18:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   1f:    00 00 00
>   22:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   26:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   2d:    77 73 ff
>   30:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   34:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   3b:    23 c7 8a
>   3e:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   42:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   49:    00
>   4a:    48 8b 7d f0              mov    -0x10(%rbp),%rdi
>   4e:    48 8b 75 f8              mov    -0x8(%rbp),%rsi
>   52:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   56:    48 8b 4d e8              mov    -0x18(%rbp),%rcx
>   5a:    e8 00 00 00 00           callq  5f <_start+0x4f>
>   5f:    48 89 55 d8              mov    %rdx,-0x28(%rbp)
>   63:    48 89 45 d0              mov    %rax,-0x30(%rbp)
>   67:    c5 fa 6f 45 d0           vmovdqu -0x30(%rbp),%xmm0
>   6c:    c5 fa 6f 4d e0           vmovdqu -0x20(%rbp),%xmm1
>   71:    c5 f9 74 c1              vpcmpeqb %xmm1,%xmm0,%xmm0
>   75:    c5 79 d7 c0              vpmovmskb %xmm0,%r8d
>   79:    41 81 e8 ff ff 00 00     sub    $0xffff,%r8d
>   80:    44 89 45 cc              mov    %r8d,-0x34(%rbp)
>   84:    74 06                    je     8c <_start+0x7c>
>   86:    eb 00                    jmp    88 <_start+0x78>
>   88:    eb 00                    jmp    8a <_start+0x7a>
>   8a:    eb fe                    jmp    8a <_start+0x7a>
>   8c:    eb 00                    jmp    8e <_start+0x7e>
>   8e:    48 83 c4 40              add    $0x40,%rsp
>   92:    5d                       pop    %rbp
>   93:    c3                       retq
>   94:    66 66 66 2e 0f 1f 84     data16 data16 nopw %cs:0x0(%rax,%rax,1)
>   9b:    00 00 00 00 00
>
> 00000000000000a0 <__udivti3>:
>   a0:    55                       push   %rbp
>   a1:    48 89 e5                 mov    %rsp,%rbp
>   a4:    48 89 7d f0              mov    %rdi,-0x10(%rbp)
>   a8:    48 89 75 f8              mov    %rsi,-0x8(%rbp)
>   ac:    48 89 4d e8              mov    %rcx,-0x18(%rbp)
>   b0:    48 89 55 e0              mov    %rdx,-0x20(%rbp)
>   b4:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   b8:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   bc:    5d                       pop    %rbp
>   bd:    c3                       retq
>
>
> =================================================================
>
> And here is the assembly for the windows x86_64 version:
>
>
> =================================================================
> 0000000000000010 <_start>:
>   10:    55                       push   %rbp
>   11:    48 81 ec 80 00 00 00     sub    $0x80,%rsp
>   18:    48 8d ac 24 80 00 00     lea    0x80(%rsp),%rbp
>   1f:    00
>   20:    48 b8 14 30 27 ec 01     movabs $0x1ec273014,%rax
>   27:    00 00 00
>   2a:    48 89 45 f8              mov    %rax,-0x8(%rbp)
>   2e:    48 b8 ff ff ff ff ff     movabs $0xff7377ffffffffff,%rax
>   35:    77 73 ff
>   38:    48 89 45 f0              mov    %rax,-0x10(%rbp)
>   3c:    48 b8 00 00 e8 89 04     movabs $0x8ac7230489e80000,%rax
>   43:    23 c7 8a
>   46:    48 89 45 e0              mov    %rax,-0x20(%rbp)
>   4a:    48 c7 45 e8 00 00 00     movq   $0x0,-0x18(%rbp)
>   51:    00
>   52:    48 8b 45 f0              mov    -0x10(%rbp),%rax
>   56:    48 8b 4d f8              mov    -0x8(%rbp),%rcx
>   5a:    48 8b 55 e0              mov    -0x20(%rbp),%rdx
>   5e:    4c 8b 45 e8              mov    -0x18(%rbp),%r8
>   62:    48 89 4d c8              mov    %rcx,-0x38(%rbp)
>   66:    48 89 45 c0              mov    %rax,-0x40(%rbp)
>   6a:    4c 89 45 b8              mov    %r8,-0x48(%rbp)
>   6e:    48 89 55 b0              mov    %rdx,-0x50(%rbp)
>   72:    48 8d 4d c0              lea    -0x40(%rbp),%rcx
>   76:    48 8d 55 b0              lea    -0x50(%rbp),%rdx
>   7a:    e8 41 00 00 00           callq  c0 <__udivti3>
>   7f:    66 0f 70 c8 4e           pshufd $0x4e,%xmm0,%xmm1
>   84:    66 0f d6 45 d0           movq   %xmm0,-0x30(%rbp)
>   89:    66 0f d6 4d d8           movq   %xmm1,-0x28(%rbp)
>   8e:    0f 10 45 d0              movups -0x30(%rbp),%xmm0
>   92:    0f 10 4d e0              movups -0x20(%rbp),%xmm1
>   96:    66 0f 74 c1              pcmpeqb %xmm1,%xmm0
>   9a:    66 44 0f d7 c8           pmovmskb %xmm0,%r9d
>   9f:    41 81 e9 ff ff 00 00     sub    $0xffff,%r9d
>   a6:    44 89 4d ac              mov    %r9d,-0x54(%rbp)
>   aa:    74 06                    je     b2 <_start+0xa2>
>   ac:    eb 00                    jmp    ae <_start+0x9e>
>   ae:    eb 00                    jmp    b0 <_start+0xa0>
>   b0:    eb fe                    jmp    b0 <_start+0xa0>
>   b2:    eb 00                    jmp    b4 <_start+0xa4>
>   b4:    48 81 c4 80 00 00 00     add    $0x80,%rsp
>   bb:    5d                       pop    %rbp
>   bc:    c3                       retq
>   bd:    90                       nop
>   be:    90                       nop
>   bf:    90                       nop
>
> 00000000000000c0 <__udivti3>:
>   c0:    55                       push   %rbp
>   c1:    48 83 ec 20              sub    $0x20,%rsp
>   c5:    48 8d 6c 24 20           lea    0x20(%rsp),%rbp
>   ca:    48 89 4d f0              mov    %rcx,-0x10(%rbp)
>   ce:    48 89 55 f8              mov    %rdx,-0x8(%rbp)
>   d2:    4c 89 4d e8              mov    %r9,-0x18(%rbp)
>   d6:    4c 89 45 e0              mov    %r8,-0x20(%rbp)
>   da:    48 8b 45 e0              mov    -0x20(%rbp),%rax
>   de:    48 8b 55 e8              mov    -0x18(%rbp),%rdx
>   e2:    48 83 c4 20              add    $0x20,%rsp
>   e6:    5d                       pop    %rbp
>   e7:    c3                       retq
>
> =================================================================
>
>
> Finally, my question:
>
> What is the correct LLVM IR to represent i128 values so that it will be
> compatible with the compiler-rt calls that LLVM generates? For example, what
> should be the LLVM IR definition of  __udivti3?
>
> Even though the clang/compiler-rt project generates `define i128
> @__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on
> Windows.
>
> Thanks,
> Andrew
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>

-- 
With best regards, Anton Korobeynikov
Department of Statistical Modelling, Saint Petersburg State University