[llvm-dev] windows ABI problem with i128?
Andrew Kelley via llvm-dev
llvm-dev at lists.llvm.org
Wed Apr 25 20:44:32 PDT 2018
I'm trying to use LLVM to create compiler-rt.o on Windows. I use this
command from the compiler-rt project:
[nix-shell:~/downloads/llvm-project/compiler-rt]$ clang -nostdlib -S
-emit-llvm lib/builtins/udivti3.c -g -target x86_64-windows
-DCRT_HAS_128BIT
The resulting LLVM IR is:
=================================================================
; ModuleID = 'lib/builtins/udivti3.c'
source_filename = "lib/builtins/udivti3.c"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--windows-msvc19.11.0"
; Function Attrs: noinline nounwind optnone uwtable
define i128 @__udivti3(i128, i128) #0 {
%3 = alloca i128, align 16
%4 = alloca i128, align 16
store i128 %1, i128* %3, align 16
store i128 %0, i128* %4, align 16
%5 = load i128, i128* %3, align 16
%6 = load i128, i128* %4, align 16
%7 = call i128 @__udivmodti4(i128 %6, i128 %5, i128* null)
ret i128 %7
}
declare i128 @__udivmodti4(i128, i128, i128*) #1
attributes #0 = { noinline nounwind optnone uwtable
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false" "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
"no-jump-tables"="false" "no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
"stack-protector-buffer-size"="8" "target-cpu"="x86-64"
"target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false"
"use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false" "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
"no-trapping-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 6.0.0 (tags/RELEASE_600/final)"}
=================================================================
However, I think this results in a different ABI than the one LLVM uses when
it lowers an i128 division. For example, here is my test case (in Zig code):
=================================================================
pub extern "kernel32" stdcallcc fn ExitProcess(exit_code: c_uint) noreturn;
export fn WinMainCRTStartup() noreturn {
@setAlignStack(16);
@setRuntimeSafety(false);
var a: u128 = 152313999999999991610955792383;
var b: u128 = 10000000000000000000;
var c = a / b; // this generates a call to __udivti3
if (c != b) {
@breakpoint();
}
ExitProcess(0);
}
export fn __udivti3(a: u128, b: u128) u128 {
@setRuntimeSafety(false);
return b;
}
=================================================================
This results in this LLVM IR:
=================================================================
; ModuleID = 'test'
source_filename = "test"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
%"[]u8" = type { i8*, i64 }
%StackTrace = type { i64, %"[]usize" }
%"[]usize" = type { i64*, i64 }
; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
; Function Attrs: nobuiltin noinline noreturn nounwind uwtable
alignstack(16)
define void @WinMainCRTStartup() #2 !dbg !41 {
Entry:
%a = alloca i128, align 8
%b = alloca i128, align 8
%c = alloca i128, align 8
store i128 152313999999999991610955792383, i128* %a, align 8, !dbg !52
call void @llvm.dbg.declare(metadata i128* %a, metadata !45, metadata
!DIExpression()), !dbg !52
store i128 10000000000000000000, i128* %b, align 8, !dbg !53
call void @llvm.dbg.declare(metadata i128* %b, metadata !48, metadata
!DIExpression()), !dbg !53
%0 = load i128, i128* %a, align 8, !dbg !54
%1 = load i128, i128* %b, align 8, !dbg !55
%2 = udiv i128 %0, %1, !dbg !56
store i128 %2, i128* %c, align 8, !dbg !57
call void @llvm.dbg.declare(metadata i128* %c, metadata !50, metadata
!DIExpression()), !dbg !57
%3 = load i128, i128* %c, align 8, !dbg !58
%4 = load i128, i128* %b, align 8, !dbg !60
%5 = icmp ne i128 %3, %4, !dbg !61
br i1 %5, label %Then, label %Else, !dbg !61
Then: ; preds = %Entry
call void @llvm.debugtrap(), !dbg !62
br label %EndIf, !dbg !64
Else: ; preds = %Entry
br label %EndIf, !dbg !64
EndIf: ; preds = %Else, %Then
call void @ExitProcess(i32 0), !dbg !65
unreachable, !dbg !65
}
; Function Attrs: nounwind
declare void @llvm.debugtrap() #3
; Function Attrs: nobuiltin noreturn nounwind uwtable
declare void @ExitProcess(i32) #0
; Function Attrs: nobuiltin nounwind uwtable
define i128 @__udivti3(i128, i128) #4 !dbg !66 {
Entry:
%a = alloca i128, align 8
%b = alloca i128, align 8
store i128 %0, i128* %a, align 8
call void @llvm.dbg.declare(metadata i128* %a, metadata !70, metadata
!DIExpression()), !dbg !73
store i128 %1, i128* %b, align 8
call void @llvm.dbg.declare(metadata i128* %b, metadata !71, metadata
!DIExpression()), !dbg !74
%2 = load i128, i128* %b, align 8, !dbg !75
ret i128 %2, !dbg !78
}
; Function Attrs: nounwind
declare void @llvm.stackprotector(i8*, i8**) #3
attributes #0 = { nobuiltin noreturn nounwind uwtable
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nobuiltin noinline noreturn nounwind uwtable
alignstack=16 "no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf" }
attributes #3 = { nounwind }
attributes #4 = { nobuiltin nounwind uwtable "no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf" }
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
=================================================================
I link this with the following command (link.exe or LLD, it does not matter):
link.exe /OUT:test.exe /ENTRY:WinMainCRTStartup test.obj /subsystem:console
kernel32.lib /nologo
When I run the resulting executable, it triggers the breakpoint.
Meanwhile, on Linux, this test passes.
I suspect it may be a calling convention issue. Here is the assembly for
the Linux x86_64 version:
=================================================================
0000000000000010 <_start>:
10: 55 push %rbp
11: 48 89 e5 mov %rsp,%rbp
14: 48 83 ec 40 sub $0x40,%rsp
18: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax
1f: 00 00 00
22: 48 89 45 f8 mov %rax,-0x8(%rbp)
26: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax
2d: 77 73 ff
30: 48 89 45 f0 mov %rax,-0x10(%rbp)
34: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax
3b: 23 c7 8a
3e: 48 89 45 e0 mov %rax,-0x20(%rbp)
42: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp)
49: 00
4a: 48 8b 7d f0 mov -0x10(%rbp),%rdi
4e: 48 8b 75 f8 mov -0x8(%rbp),%rsi
52: 48 8b 55 e0 mov -0x20(%rbp),%rdx
56: 48 8b 4d e8 mov -0x18(%rbp),%rcx
5a: e8 00 00 00 00 callq 5f <_start+0x4f>
5f: 48 89 55 d8 mov %rdx,-0x28(%rbp)
63: 48 89 45 d0 mov %rax,-0x30(%rbp)
67: c5 fa 6f 45 d0 vmovdqu -0x30(%rbp),%xmm0
6c: c5 fa 6f 4d e0 vmovdqu -0x20(%rbp),%xmm1
71: c5 f9 74 c1 vpcmpeqb %xmm1,%xmm0,%xmm0
75: c5 79 d7 c0 vpmovmskb %xmm0,%r8d
79: 41 81 e8 ff ff 00 00 sub $0xffff,%r8d
80: 44 89 45 cc mov %r8d,-0x34(%rbp)
84: 74 06 je 8c <_start+0x7c>
86: eb 00 jmp 88 <_start+0x78>
88: eb 00 jmp 8a <_start+0x7a>
8a: eb fe jmp 8a <_start+0x7a>
8c: eb 00 jmp 8e <_start+0x7e>
8e: 48 83 c4 40 add $0x40,%rsp
92: 5d pop %rbp
93: c3 retq
94: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1)
9b: 00 00 00 00 00
00000000000000a0 <__udivti3>:
a0: 55 push %rbp
a1: 48 89 e5 mov %rsp,%rbp
a4: 48 89 7d f0 mov %rdi,-0x10(%rbp)
a8: 48 89 75 f8 mov %rsi,-0x8(%rbp)
ac: 48 89 4d e8 mov %rcx,-0x18(%rbp)
b0: 48 89 55 e0 mov %rdx,-0x20(%rbp)
b4: 48 8b 45 e0 mov -0x20(%rbp),%rax
b8: 48 8b 55 e8 mov -0x18(%rbp),%rdx
bc: 5d pop %rbp
bd: c3 retq
=================================================================
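And here is the assembly for the Windows x86_64 version. Note that the caller
passes both operands by pointer (in %rcx and %rdx) and reads the result from
%xmm0, whereas the __udivti3 definition reads its operands by value from
%rcx:%rdx and %r8:%r9 and returns the result in %rax:%rdx: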
=================================================================
0000000000000010 <_start>:
10: 55 push %rbp
11: 48 81 ec 80 00 00 00 sub $0x80,%rsp
18: 48 8d ac 24 80 00 00 lea 0x80(%rsp),%rbp
1f: 00
20: 48 b8 14 30 27 ec 01 movabs $0x1ec273014,%rax
27: 00 00 00
2a: 48 89 45 f8 mov %rax,-0x8(%rbp)
2e: 48 b8 ff ff ff ff ff movabs $0xff7377ffffffffff,%rax
35: 77 73 ff
38: 48 89 45 f0 mov %rax,-0x10(%rbp)
3c: 48 b8 00 00 e8 89 04 movabs $0x8ac7230489e80000,%rax
43: 23 c7 8a
46: 48 89 45 e0 mov %rax,-0x20(%rbp)
4a: 48 c7 45 e8 00 00 00 movq $0x0,-0x18(%rbp)
51: 00
52: 48 8b 45 f0 mov -0x10(%rbp),%rax
56: 48 8b 4d f8 mov -0x8(%rbp),%rcx
5a: 48 8b 55 e0 mov -0x20(%rbp),%rdx
5e: 4c 8b 45 e8 mov -0x18(%rbp),%r8
62: 48 89 4d c8 mov %rcx,-0x38(%rbp)
66: 48 89 45 c0 mov %rax,-0x40(%rbp)
6a: 4c 89 45 b8 mov %r8,-0x48(%rbp)
6e: 48 89 55 b0 mov %rdx,-0x50(%rbp)
72: 48 8d 4d c0 lea -0x40(%rbp),%rcx
76: 48 8d 55 b0 lea -0x50(%rbp),%rdx
7a: e8 41 00 00 00 callq c0 <__udivti3>
7f: 66 0f 70 c8 4e pshufd $0x4e,%xmm0,%xmm1
84: 66 0f d6 45 d0 movq %xmm0,-0x30(%rbp)
89: 66 0f d6 4d d8 movq %xmm1,-0x28(%rbp)
8e: 0f 10 45 d0 movups -0x30(%rbp),%xmm0
92: 0f 10 4d e0 movups -0x20(%rbp),%xmm1
96: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0
9a: 66 44 0f d7 c8 pmovmskb %xmm0,%r9d
9f: 41 81 e9 ff ff 00 00 sub $0xffff,%r9d
a6: 44 89 4d ac mov %r9d,-0x54(%rbp)
aa: 74 06 je b2 <_start+0xa2>
ac: eb 00 jmp ae <_start+0x9e>
ae: eb 00 jmp b0 <_start+0xa0>
b0: eb fe jmp b0 <_start+0xa0>
b2: eb 00 jmp b4 <_start+0xa4>
b4: 48 81 c4 80 00 00 00 add $0x80,%rsp
bb: 5d pop %rbp
bc: c3 retq
bd: 90 nop
be: 90 nop
bf: 90 nop
00000000000000c0 <__udivti3>:
c0: 55 push %rbp
c1: 48 83 ec 20 sub $0x20,%rsp
c5: 48 8d 6c 24 20 lea 0x20(%rsp),%rbp
ca: 48 89 4d f0 mov %rcx,-0x10(%rbp)
ce: 48 89 55 f8 mov %rdx,-0x8(%rbp)
d2: 4c 89 4d e8 mov %r9,-0x18(%rbp)
d6: 4c 89 45 e0 mov %r8,-0x20(%rbp)
da: 48 8b 45 e0 mov -0x20(%rbp),%rax
de: 48 8b 55 e8 mov -0x18(%rbp),%rdx
e2: 48 83 c4 20 add $0x20,%rsp
e6: 5d pop %rbp
e7: c3 retq
=================================================================
Finally, my question:
What is the correct LLVM IR to represent i128 values so that it will be
compatible with the compiler-rt calls that LLVM generates? For example,
what should be the LLVM IR definition of __udivti3?
Even though the clang/compiler-rt project generates `define i128
@__udivti3(i128, i128) #0 {`, this appears to be incorrect when run on
Windows.
Thanks,
Andrew
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180425/e8b7cbf8/attachment.html>
More information about the llvm-dev
mailing list