[llvm] c2da0cd - [X86] Add Win32/64 mulo test coverage

Fri Apr 23 06:52:02 PDT 2021

Author: Simon Pilgrim
Date: 2021-04-23T14:51:42+01:00
New Revision: c2da0cdff5683550b0ceb8739c862dc0e4e9b204

URL: https://github.com/llvm/llvm-project/commit/c2da0cdff5683550b0ceb8739c862dc0e4e9b204
DIFF: https://github.com/llvm/llvm-project/commit/c2da0cdff5683550b0ceb8739c862dc0e4e9b204.diff

LOG: [X86] Add Win32/64 mulo test coverage

Part of an investigation to solve the Windows regressions caused by rG13ec913bdf50.

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index f6c36f5afd99a..46bd004b867a8 100644

--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -1,34 +1,57 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -disable-peephole -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefixes=CHECK,SDAG
-; RUN: llc -disable-peephole -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefixes=CHECK,FAST
-; RUN: llc -disable-peephole -mtriple=x86_64-darwin-unknown -mcpu=knl < %s | FileCheck %s --check-prefixes=CHECK,SDAG
+; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown < %s | FileCheck %s --check-prefixes=CHECK,LINUX,SDAG
+; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefixes=CHECK,LINUX,FAST
+; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown -mcpu=knl < %s | FileCheck %s --check-prefixes=CHECK,LINUX,SDAG
+; RUN: llc -disable-peephole -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefixes=CHECK,WIN64
+; RUN: llc -disable-peephole -mtriple=i386-pc-win32 < %s | FileCheck %s --check-prefix=WIN32
 
 define {i64, i1} @t1() nounwind {
 ; CHECK-LABEL: t1:
-; CHECK:       ## %bb.0:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl $72, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    retq
+;
+; WIN32-LABEL: t1:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl $72, %eax
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    xorl %ecx, %ecx
+; WIN32-NEXT:    retl
   %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
   ret {i64, i1} %1
 }
 
 define {i64, i1} @t2() nounwind {
 ; CHECK-LABEL: t2:
-; CHECK:       ## %bb.0:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    retq
+;
+; WIN32-LABEL: t2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    xorl %ecx, %ecx
+; WIN32-NEXT:    retl
   %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
   ret {i64, i1} %1
 }
 
 define {i64, i1} @t3() nounwind {
 ; CHECK-LABEL: t3:
-; CHECK:       ## %bb.0:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq $-9, %rax
 ; CHECK-NEXT:    movb $1, %dl
 ; CHECK-NEXT:    retq
+;
+; WIN32-LABEL: t3:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl $-9, %eax
+; WIN32-NEXT:    movl $-1, %edx
+; WIN32-NEXT:    movb $1, %cl
+; WIN32-NEXT:    retl
   %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
   ret {i64, i1} %1
 }
@@ -36,9 +59,9 @@ define {i64, i1} @t3() nounwind {
 ; SMULO
 define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
 ; SDAG-LABEL: smuloi8:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb %sil
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -46,15 +69,34 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi8:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $al killed $al killed $eax
+; FAST-NEXT:    # kill: def $al killed $al killed $eax
 ; FAST-NEXT:    imulb %sil
 ; FAST-NEXT:    seto %cl
 ; FAST-NEXT:    movb %al, (%rdx)
 ; FAST-NEXT:    andb $1, %cl
 ; FAST-NEXT:    movzbl %cl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi8:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    imulb %dl
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movb %al, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi8:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movb %al, (%edx)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -64,20 +106,36 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
 
 define zeroext i1 @smuloi16(i16 %v1, i16 %v2, i16* %res) {
 ; SDAG-LABEL: smuloi16:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulw %si, %di
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movw %di, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi16:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulw %si, %di
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movw %di, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi16:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulw %dx, %cx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movw %cx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi16:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %dx
+; WIN32-NEXT:    seto %al
+; WIN32-NEXT:    movw %dx, (%ecx)
+; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -87,20 +145,36 @@ define zeroext i1 @smuloi16(i16 %v1, i16 %v2, i16* %res) {
 
 define zeroext i1 @smuloi32(i32 %v1, i32 %v2, i32* %res) {
 ; SDAG-LABEL: smuloi32:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imull %esi, %edi
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movl %edi, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi32:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imull %esi, %edi
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movl %edi, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi32:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imull %edx, %ecx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movl %ecx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi32:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    seto %al
+; WIN32-NEXT:    movl %edx, (%ecx)
+; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -110,20 +184,58 @@ define zeroext i1 @smuloi32(i32 %v1, i32 %v2, i32* %res) {
 
 define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; SDAG-LABEL: smuloi64:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulq %rsi, %rdi
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movq %rdi, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi64:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulq %rsi, %rdi
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movq %rdi, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi64:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulq %rdx, %rcx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movq %rcx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi64:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl $0, (%esp)
+; WIN32-NEXT:    movl %esp, %ebx
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %edx
+; WIN32-NEXT:    pushl %ecx
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    calll ___mulodi4
+; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    cmpl $0, (%esp)
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    movl %edx, 4(%esi)
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    retl
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -134,9 +246,9 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; UMULO
 define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
 ; SDAG-LABEL: umuloi8:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb %sil
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -144,15 +256,34 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi8:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $al killed $al killed $eax
+; FAST-NEXT:    # kill: def $al killed $al killed $eax
 ; FAST-NEXT:    mulb %sil
 ; FAST-NEXT:    seto %cl
 ; FAST-NEXT:    movb %al, (%rdx)
 ; FAST-NEXT:    andb $1, %cl
 ; FAST-NEXT:    movzbl %cl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi8:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mulb %dl
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movb %al, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi8:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movb %al, (%edx)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -162,10 +293,10 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
 
 define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
 ; SDAG-LABEL: umuloi16:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
+; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw %si
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
@@ -173,16 +304,37 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi16:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $ax killed $ax killed $eax
+; FAST-NEXT:    # kill: def $ax killed $ax killed $eax
 ; FAST-NEXT:    mulw %si
 ; FAST-NEXT:    seto %dl
 ; FAST-NEXT:    movw %ax, (%rcx)
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi16:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mulw %dx
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movw %ax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi16:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movw %ax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -192,7 +344,7 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
 
 define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
 ; SDAG-LABEL: umuloi32:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    mull %esi
@@ -202,7 +354,7 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi32:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movl %edi, %eax
 ; FAST-NEXT:    mull %esi
@@ -211,6 +363,27 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi32:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mull %edx
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movl %eax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi32:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -220,7 +393,7 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
 
 define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; SDAG-LABEL: umuloi64:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rdi, %rax
 ; SDAG-NEXT:    mulq %rsi
@@ -230,7 +403,7 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi64:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movq %rdi, %rax
 ; FAST-NEXT:    mulq %rsi
@@ -239,6 +412,54 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi64:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movq %rax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi64:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    testl %eax, %eax
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    orb %ch, %cl
+; WIN32-NEXT:    orb %bl, %cl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %edx, 4(%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -250,13 +471,33 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; Check the use of the overflow bit in combination with a select instruction.
 ;
 define i32 @smuloselecti32(i32 %v1, i32 %v2) {
-; CHECK-LABEL: smuloselecti32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    imull %esi, %ecx
-; CHECK-NEXT:    cmovol %edi, %eax
-; CHECK-NEXT:    retq
+; LINUX-LABEL: smuloselecti32:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movl %esi, %eax
+; LINUX-NEXT:    movl %edi, %ecx
+; LINUX-NEXT:    imull %esi, %ecx
+; LINUX-NEXT:    cmovol %edi, %eax
+; LINUX-NEXT:    retq
+;
+; WIN64-LABEL: smuloselecti32:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %eax
+; WIN64-NEXT:    movl %ecx, %edx
+; WIN64-NEXT:    imull %eax, %edx
+; WIN64-NEXT:    cmovol %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloselecti32:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, %edx
+; WIN32-NEXT:    imull %ecx, %edx
+; WIN32-NEXT:    jo LBB11_2
+; WIN32-NEXT:  # %bb.1:
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:  LBB11_2:
+; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -264,13 +505,56 @@ define i32 @smuloselecti32(i32 %v1, i32 %v2) {
 }
 
 define i64 @smuloselecti64(i64 %v1, i64 %v2) {
-; CHECK-LABEL: smuloselecti64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    imulq %rsi, %rcx
-; CHECK-NEXT:    cmovoq %rdi, %rax
-; CHECK-NEXT:    retq
+; LINUX-LABEL: smuloselecti64:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movq %rsi, %rax
+; LINUX-NEXT:    movq %rdi, %rcx
+; LINUX-NEXT:    imulq %rsi, %rcx
+; LINUX-NEXT:    cmovoq %rdi, %rax
+; LINUX-NEXT:    retq
+;
+; WIN64-LABEL: smuloselecti64:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    movq %rcx, %rdx
+; WIN64-NEXT:    imulq %rax, %rdx
+; WIN64-NEXT:    cmovoq %rcx, %rax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloselecti64:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl $0, (%esp)
+; WIN32-NEXT:    movl %esp, %eax
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    calll ___mulodi4
+; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    cmpl $0, (%esp)
+; WIN32-NEXT:    jne LBB12_2
+; WIN32-NEXT:  # %bb.1:
+; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:  LBB12_2:
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -278,13 +562,37 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 }
 
 define i32 @umuloselecti32(i32 %v1, i32 %v2) {
-; CHECK-LABEL: umuloselecti32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    mull %esi
-; CHECK-NEXT:    cmovol %edi, %esi
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    retq
+; LINUX-LABEL: umuloselecti32:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movl %edi, %eax
+; LINUX-NEXT:    mull %esi
+; LINUX-NEXT:    cmovol %edi, %esi
+; LINUX-NEXT:    movl %esi, %eax
+; LINUX-NEXT:    retq
+;
+; WIN64-LABEL: umuloselecti32:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %r8d
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mull %edx
+; WIN64-NEXT:    cmovol %ecx, %r8d
+; WIN64-NEXT:    movl %r8d, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloselecti32:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    jo LBB13_2
+; WIN32-NEXT:  # %bb.1:
+; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:  LBB13_2:
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -292,13 +600,69 @@ define i32 @umuloselecti32(i32 %v1, i32 %v2) {
 }
 
 define i64 @umuloselecti64(i64 %v1, i64 %v2) {
-; CHECK-LABEL: umuloselecti64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rsi
-; CHECK-NEXT:    cmovoq %rdi, %rsi
-; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    retq
+; LINUX-LABEL: umuloselecti64:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movq %rdi, %rax
+; LINUX-NEXT:    mulq %rsi
+; LINUX-NEXT:    cmovoq %rdi, %rsi
+; LINUX-NEXT:    movq %rsi, %rax
+; LINUX-NEXT:    retq
+;
+; WIN64-LABEL: umuloselecti64:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    cmovoq %rcx, %r8
+; WIN64-NEXT:    movq %r8, %rax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloselecti64:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %al, %bl
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT:    addl %edi, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    addl %ebp, %edx
+; WIN32-NEXT:    setb %al
+; WIN32-NEXT:    orb %bh, %al
+; WIN32-NEXT:    orb %bl, %al
+; WIN32-NEXT:    testb %al, %al
+; WIN32-NEXT:    jne LBB14_2
+; WIN32-NEXT:  # %bb.1:
+; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:  LBB14_2:
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %esi, %edx
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -310,36 +674,60 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ;
 define zeroext i1 @smulobri8(i8 %v1, i8 %v2) {
 ; SDAG-LABEL: smulobri8:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb %sil
-; SDAG-NEXT:    jo LBB15_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB15_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB15_1: ## %overflow
+; SDAG-NEXT:  .LBB15_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smulobri8:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $al killed $al killed $eax
+; FAST-NEXT:    # kill: def $al killed $al killed $eax
 ; FAST-NEXT:    imulb %sil
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    testb $1, %al
-; FAST-NEXT:    jne LBB15_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jne .LBB15_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB15_1: ## %overflow
+; FAST-NEXT:  .LBB15_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smulobri8:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    imulb %dl
+; WIN64-NEXT:    jo .LBB15_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB15_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smulobri8:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
+; WIN32-NEXT:    jo LBB15_1
+; WIN32-NEXT:  # %bb.2: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB15_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -354,32 +742,55 @@ continue:
 
 define zeroext i1 @smulobri16(i16 %v1, i16 %v2) {
 ; SDAG-LABEL: smulobri16:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulw %si, %di
-; SDAG-NEXT:    jo LBB16_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB16_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB16_1: ## %overflow
+; SDAG-NEXT:  .LBB16_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smulobri16:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulw %si, %di
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    testb $1, %al
-; FAST-NEXT:    jne LBB16_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jne .LBB16_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB16_1: ## %overflow
+; FAST-NEXT:  .LBB16_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smulobri16:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulw %dx, %cx
+; WIN64-NEXT:    jo .LBB16_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB16_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smulobri16:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %ax
+; WIN32-NEXT:    jo LBB16_1
+; WIN32-NEXT:  # %bb.2: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB16_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -394,30 +805,53 @@ continue:
 
 define zeroext i1 @smulobri32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: smulobri32:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imull %esi, %edi
-; SDAG-NEXT:    jo LBB17_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB17_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB17_1: ## %overflow
+; SDAG-NEXT:  .LBB17_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smulobri32:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imull %esi, %edi
-; FAST-NEXT:    jo LBB17_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jo .LBB17_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB17_1: ## %overflow
+; FAST-NEXT:  .LBB17_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smulobri32:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imull %edx, %ecx
+; WIN64-NEXT:    jo .LBB17_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB17_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smulobri32:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    jo LBB17_1
+; WIN32-NEXT:  # %bb.2: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB17_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -432,30 +866,72 @@ continue:
 
 define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: smulobri64:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulq %rsi, %rdi
-; SDAG-NEXT:    jo LBB18_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB18_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB18_1: ## %overflow
+; SDAG-NEXT:  .LBB18_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smulobri64:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulq %rsi, %rdi
-; FAST-NEXT:    jo LBB18_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jo .LBB18_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB18_1: ## %overflow
+; FAST-NEXT:  .LBB18_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smulobri64:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulq %rdx, %rcx
+; WIN64-NEXT:    jo .LBB18_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB18_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smulobri64:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl $0, (%esp)
+; WIN32-NEXT:    movl %esp, %edi
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %edx
+; WIN32-NEXT:    pushl %ecx
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    calll ___mulodi4
+; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    cmpl $0, (%esp)
+; WIN32-NEXT:    jne LBB18_1
+; WIN32-NEXT:  # %bb.3: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:  LBB18_2: # %overflow
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB18_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    jmp LBB18_2
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -470,36 +946,60 @@ continue:
 
 define zeroext i1 @umulobri8(i8 %v1, i8 %v2) {
 ; SDAG-LABEL: umulobri8:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb %sil
-; SDAG-NEXT:    jo LBB19_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB19_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB19_1: ## %overflow
+; SDAG-NEXT:  .LBB19_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umulobri8:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $al killed $al killed $eax
+; FAST-NEXT:    # kill: def $al killed $al killed $eax
 ; FAST-NEXT:    mulb %sil
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    testb $1, %al
-; FAST-NEXT:    jne LBB19_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jne .LBB19_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB19_1: ## %overflow
+; FAST-NEXT:  .LBB19_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umulobri8:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mulb %dl
+; WIN64-NEXT:    jo .LBB19_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB19_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umulobri8:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
+; WIN32-NEXT:    jo LBB19_1
+; WIN32-NEXT:  # %bb.2: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB19_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -514,36 +1014,60 @@ continue:
 
 define zeroext i1 @umulobri16(i16 %v1, i16 %v2) {
 ; SDAG-LABEL: umulobri16:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
+; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw %si
-; SDAG-NEXT:    jo LBB20_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB20_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB20_1: ## %overflow
+; SDAG-NEXT:  .LBB20_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umulobri16:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $ax killed $ax killed $eax
+; FAST-NEXT:    # kill: def $ax killed $ax killed $eax
 ; FAST-NEXT:    mulw %si
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    testb $1, %al
-; FAST-NEXT:    jne LBB20_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jne .LBB20_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB20_1: ## %overflow
+; FAST-NEXT:  .LBB20_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umulobri16:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mulw %dx
+; WIN64-NEXT:    jo .LBB20_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB20_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umulobri16:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
+; WIN32-NEXT:    jo LBB20_1
+; WIN32-NEXT:  # %bb.2: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB20_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -558,32 +1082,56 @@ continue:
 
 define zeroext i1 @umulobri32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: umulobri32:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    mull %esi
-; SDAG-NEXT:    jo LBB21_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB21_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB21_1: ## %overflow
+; SDAG-NEXT:  .LBB21_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umulobri32:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
 ; FAST-NEXT:    mull %esi
-; FAST-NEXT:    jo LBB21_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jo .LBB21_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB21_1: ## %overflow
+; FAST-NEXT:  .LBB21_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umulobri32:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mull %edx
+; WIN64-NEXT:    jo .LBB21_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB21_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umulobri32:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    jo LBB21_1
+; WIN32-NEXT:  # %bb.2: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB21_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -598,32 +1146,87 @@ continue:
 
 define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: umulobri64:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdi, %rax
 ; SDAG-NEXT:    mulq %rsi
-; SDAG-NEXT:    jo LBB22_1
-; SDAG-NEXT:  ## %bb.2: ## %continue
+; SDAG-NEXT:    jo .LBB22_1
+; SDAG-NEXT:  # %bb.2: # %continue
 ; SDAG-NEXT:    movb $1, %al
 ; SDAG-NEXT:    retq
-; SDAG-NEXT:  LBB22_1: ## %overflow
+; SDAG-NEXT:  .LBB22_1: # %overflow
 ; SDAG-NEXT:    xorl %eax, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umulobri64:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdi, %rax
 ; FAST-NEXT:    mulq %rsi
-; FAST-NEXT:    jo LBB22_1
-; FAST-NEXT:  ## %bb.2: ## %continue
+; FAST-NEXT:    jo .LBB22_1
+; FAST-NEXT:  # %bb.2: # %continue
 ; FAST-NEXT:    movb $1, %al
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
-; FAST-NEXT:  LBB22_1: ## %overflow
+; FAST-NEXT:  .LBB22_1: # %overflow
 ; FAST-NEXT:    xorl %eax, %eax
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umulobri64:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    jo .LBB22_1
+; WIN64-NEXT:  # %bb.2: # %continue
+; WIN64-NEXT:    movb $1, %al
+; WIN64-NEXT:    retq
+; WIN64-NEXT:  .LBB22_1: # %overflow
+; WIN64-NEXT:    xorl %eax, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umulobri64:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    testl %eax, %eax
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    orb %bh, %cl
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    setb %al
+; WIN32-NEXT:    orb %cl, %al
+; WIN32-NEXT:    orb %bl, %al
+; WIN32-NEXT:    subb $1, %al
+; WIN32-NEXT:    je LBB22_1
+; WIN32-NEXT:  # %bb.3: # %continue
+; WIN32-NEXT:    movb $1, %al
+; WIN32-NEXT:  LBB22_2: # %overflow
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB22_1: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    jmp LBB22_2
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -637,14 +1240,40 @@ continue:
 }
 
 define i1 @bug27873(i64 %c1, i1 %c2) {
-; CHECK-LABEL: bug27873:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movl $160, %ecx
-; CHECK-NEXT:    mulq %rcx
-; CHECK-NEXT:    seto %al
-; CHECK-NEXT:    orb %sil, %al
-; CHECK-NEXT:    retq
+; LINUX-LABEL: bug27873:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movq %rdi, %rax
+; LINUX-NEXT:    movl $160, %ecx
+; LINUX-NEXT:    mulq %rcx
+; LINUX-NEXT:    seto %al
+; LINUX-NEXT:    orb %sil, %al
+; LINUX-NEXT:    retq
+;
+; WIN64-LABEL: bug27873:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %r8d
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    movl $160, %ecx
+; WIN64-NEXT:    mulq %rcx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    orb %r8b, %al
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: bug27873:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    movl $160, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    movl $160, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    addl %ecx, %edx
+; WIN32-NEXT:    setb %al
+; WIN32-NEXT:    orb %bl, %al
+; WIN32-NEXT:    orb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    retl
   %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
   %mul.overflow = extractvalue { i64, i1 } %mul, 1
   %x1 = or i1 %c2, %mul.overflow
@@ -653,9 +1282,9 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 
 define zeroext i1 @smuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; SDAG-LABEL: smuloi8_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %esi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb (%rdi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -663,7 +1292,7 @@ define zeroext i1 @smuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi8_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movb (%rdi), %al
 ; FAST-NEXT:    imulb %sil
 ; FAST-NEXT:    seto %cl
@@ -671,6 +1300,26 @@ define zeroext i1 @smuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; FAST-NEXT:    andb $1, %cl
 ; FAST-NEXT:    movzbl %cl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi8_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %eax
+; WIN64-NEXT:    imulb (%rcx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movb %al, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi8_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movb (%eax), %al
+; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movb %al, (%edx)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    retl
   %v1 = load i8, i8* %ptr1
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -681,9 +1330,9 @@ define zeroext i1 @smuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 
 define zeroext i1 @smuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 ; SDAG-LABEL: smuloi8_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb (%rsi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -691,15 +1340,35 @@ define zeroext i1 @smuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi8_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $al killed $al killed $eax
+; FAST-NEXT:    # kill: def $al killed $al killed $eax
 ; FAST-NEXT:    imulb (%rsi)
 ; FAST-NEXT:    seto %cl
 ; FAST-NEXT:    movb %al, (%rdx)
 ; FAST-NEXT:    andb $1, %cl
 ; FAST-NEXT:    movzbl %cl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi8_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    imulb (%rdx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movb %al, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi8_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imulb (%ecx)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movb %al, (%edx)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    retl
   %v2 = load i8, i8* %ptr2
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -710,20 +1379,37 @@ define zeroext i1 @smuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 
 define zeroext i1 @smuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 ; SDAG-LABEL: smuloi16_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulw (%rdi), %si
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movw %si, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi16_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulw (%rdi), %si
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movw %si, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi16_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulw (%rcx), %dx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movw %dx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi16_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movzwl (%eax), %edx
+; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %dx
+; WIN32-NEXT:    seto %al
+; WIN32-NEXT:    movw %dx, (%ecx)
+; WIN32-NEXT:    retl
   %v1 = load i16, i16* %ptr1
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
@@ -734,20 +1420,37 @@ define zeroext i1 @smuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 
 define zeroext i1 @smuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
 ; SDAG-LABEL: smuloi16_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulw (%rsi), %di
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movw %di, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi16_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulw (%rsi), %di
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movw %di, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi16_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulw (%rdx), %cx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movw %cx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi16_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imulw (%eax), %dx
+; WIN32-NEXT:    seto %al
+; WIN32-NEXT:    movw %dx, (%ecx)
+; WIN32-NEXT:    retl
   %v2 = load i16, i16* %ptr2
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
@@ -758,20 +1461,37 @@ define zeroext i1 @smuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
 
 define zeroext i1 @smuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 ; SDAG-LABEL: smuloi32_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imull (%rdi), %esi
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movl %esi, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi32_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imull (%rdi), %esi
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movl %esi, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi32_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imull (%rcx), %edx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movl %edx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi32_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl (%eax), %edx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    seto %al
+; WIN32-NEXT:    movl %edx, (%ecx)
+; WIN32-NEXT:    retl
   %v1 = load i32, i32* %ptr1
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -782,20 +1502,37 @@ define zeroext i1 @smuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 
 define zeroext i1 @smuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 ; SDAG-LABEL: smuloi32_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imull (%rsi), %edi
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movl %edi, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi32_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imull (%rsi), %edi
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movl %edi, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi32_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imull (%rdx), %ecx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movl %ecx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi32_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull (%eax), %edx
+; WIN32-NEXT:    seto %al
+; WIN32-NEXT:    movl %edx, (%ecx)
+; WIN32-NEXT:    retl
   %v2 = load i32, i32* %ptr2
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -806,20 +1543,59 @@ define zeroext i1 @smuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 
 define zeroext i1 @smuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; SDAG-LABEL: smuloi64_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulq (%rdi), %rsi
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movq %rsi, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi64_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulq (%rdi), %rsi
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movq %rsi, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi64_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulq (%rcx), %rdx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movq %rdx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi64_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl (%edx), %edi
+; WIN32-NEXT:    movl 4(%edx), %edx
+; WIN32-NEXT:    movl $0, (%esp)
+; WIN32-NEXT:    movl %esp, %ebx
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %ecx
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    pushl %edx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    calll ___mulodi4
+; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    cmpl $0, (%esp)
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %edx, 4(%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    retl
   %v1 = load i64, i64* %ptr1
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
@@ -830,20 +1606,59 @@ define zeroext i1 @smuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 
 define zeroext i1 @smuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; SDAG-LABEL: smuloi64_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    imulq (%rsi), %rdi
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    movq %rdi, (%rdx)
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloi64_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    imulq (%rsi), %rdi
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    movq %rdi, (%rdx)
 ; FAST-NEXT:    andb $1, %al
 ; FAST-NEXT:    movzbl %al, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: smuloi64_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    imulq (%rdx), %rcx
+; WIN64-NEXT:    seto %al
+; WIN64-NEXT:    movq %rcx, (%r8)
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: smuloi64_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl (%edx), %edi
+; WIN32-NEXT:    movl 4(%edx), %edx
+; WIN32-NEXT:    movl $0, (%esp)
+; WIN32-NEXT:    movl %esp, %ebx
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %ecx
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    calll ___mulodi4
+; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    cmpl $0, (%esp)
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %edx, 4(%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    retl
   %v2 = load i64, i64* %ptr2
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
@@ -854,9 +1669,9 @@ define zeroext i1 @smuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 
 define zeroext i1 @umuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; SDAG-LABEL: umuloi8_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %esi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb (%rdi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -864,7 +1679,7 @@ define zeroext i1 @umuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi8_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movb (%rdi), %al
 ; FAST-NEXT:    mulb %sil
 ; FAST-NEXT:    seto %cl
@@ -872,6 +1687,26 @@ define zeroext i1 @umuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; FAST-NEXT:    andb $1, %cl
 ; FAST-NEXT:    movzbl %cl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi8_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %eax
+; WIN64-NEXT:    mulb (%rcx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movb %al, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi8_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movb (%eax), %al
+; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movb %al, (%edx)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    retl
   %v1 = load i8, i8* %ptr1
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -882,9 +1717,9 @@ define zeroext i1 @umuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 
 define zeroext i1 @umuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 ; SDAG-LABEL: umuloi8_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb (%rsi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -892,15 +1727,35 @@ define zeroext i1 @umuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi8_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $al killed $al killed $eax
+; FAST-NEXT:    # kill: def $al killed $al killed $eax
 ; FAST-NEXT:    mulb (%rsi)
 ; FAST-NEXT:    seto %cl
 ; FAST-NEXT:    movb %al, (%rdx)
 ; FAST-NEXT:    andb $1, %cl
 ; FAST-NEXT:    movzbl %cl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi8_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mulb (%rdx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movb %al, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi8_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mulb (%ecx)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movb %al, (%edx)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    retl
   %v2 = load i8, i8* %ptr2
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -911,10 +1766,10 @@ define zeroext i1 @umuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 
 define zeroext i1 @umuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 ; SDAG-LABEL: umuloi16_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %esi, %eax
-; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
+; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw (%rdi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
@@ -922,7 +1777,7 @@ define zeroext i1 @umuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi16_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movzwl (%rdi), %eax
 ; FAST-NEXT:    mulw %si
@@ -931,6 +1786,28 @@ define zeroext i1 @umuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi16_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %eax
+; WIN64-NEXT:    mulw (%rcx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movw %ax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi16_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movzwl (%eax), %eax
+; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movw %ax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %v1 = load i16, i16* %ptr1
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
@@ -941,10 +1818,10 @@ define zeroext i1 @umuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 
 define zeroext i1 @umuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
 ; SDAG-LABEL: umuloi16_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
+; SDAG-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw (%rsi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
@@ -952,16 +1829,38 @@ define zeroext i1 @umuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi16_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    ## kill: def $ax killed $ax killed $eax
+; FAST-NEXT:    # kill: def $ax killed $ax killed $eax
 ; FAST-NEXT:    mulw (%rsi)
 ; FAST-NEXT:    seto %dl
 ; FAST-NEXT:    movw %ax, (%rcx)
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi16_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mulw (%rdx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movw %ax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi16_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mulw (%ecx)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movw %ax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %v2 = load i16, i16* %ptr2
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
@@ -972,7 +1871,7 @@ define zeroext i1 @umuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
 
 define zeroext i1 @umuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 ; SDAG-LABEL: umuloi32_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %esi, %eax
 ; SDAG-NEXT:    mull (%rdi)
@@ -982,7 +1881,7 @@ define zeroext i1 @umuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi32_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movl (%rdi), %eax
 ; FAST-NEXT:    mull %esi
@@ -991,6 +1890,28 @@ define zeroext i1 @umuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi32_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %edx, %eax
+; WIN64-NEXT:    mull (%rcx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movl %eax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi32_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl (%eax), %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %v1 = load i32, i32* %ptr1
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -1001,7 +1922,7 @@ define zeroext i1 @umuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 
 define zeroext i1 @umuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 ; SDAG-LABEL: umuloi32_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
 ; SDAG-NEXT:    mull (%rsi)
@@ -1011,7 +1932,7 @@ define zeroext i1 @umuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi32_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movl %edi, %eax
 ; FAST-NEXT:    mull (%rsi)
@@ -1020,6 +1941,28 @@ define zeroext i1 @umuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi32_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    mull (%rdx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movl %eax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi32_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull (%ecx)
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    retl
   %v2 = load i32, i32* %ptr2
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -1030,7 +1973,7 @@ define zeroext i1 @umuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 
 define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; SDAG-LABEL: umuloi64_load:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rsi, %rax
 ; SDAG-NEXT:    mulq (%rdi)
@@ -1040,7 +1983,7 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi64_load:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movq (%rdi), %rax
 ; FAST-NEXT:    mulq %rsi
@@ -1049,6 +1992,58 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi64_load:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    mulq (%rcx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movq %rax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi64_load:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl (%eax), %esi
+; WIN32-NEXT:    movl 4(%eax), %eax
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    testl %eax, %eax
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %ecx, %edx
+; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    orb %bh, %cl
+; WIN32-NEXT:    orb %bl, %cl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %edx, 4(%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
   %v1 = load i64, i64* %ptr1
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
@@ -1059,7 +2054,7 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 
 define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; SDAG-LABEL: umuloi64_load2:
-; SDAG:       ## %bb.0:
+; SDAG:       # %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rdi, %rax
 ; SDAG-NEXT:    mulq (%rsi)
@@ -1069,7 +2064,7 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: umuloi64_load2:
-; FAST:       ## %bb.0:
+; FAST:       # %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movq %rdi, %rax
 ; FAST-NEXT:    mulq (%rsi)
@@ -1078,6 +2073,55 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; FAST-NEXT:    andb $1, %dl
 ; FAST-NEXT:    movzbl %dl, %eax
 ; FAST-NEXT:    retq
+;
+; WIN64-LABEL: umuloi64_load2:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq (%rdx)
+; WIN64-NEXT:    seto %cl
+; WIN64-NEXT:    movq %rax, (%r8)
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    retq
+;
+; WIN32-LABEL: umuloi64_load2:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl (%edx), %ebp
+; WIN32-NEXT:    movl 4(%edx), %esi
+; WIN32-NEXT:    testl %eax, %eax
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    orb %ch, %cl
+; WIN32-NEXT:    orb %bl, %cl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %eax, (%esi)
+; WIN32-NEXT:    movl %edx, 4(%esi)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
   %v2 = load i64, i64* %ptr2
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0