[llvm] r333841 - [X86][BMI1] Test i32 intrinsics on 32/64 bits + branch off i64 tests

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 3 07:11:35 PDT 2018


Author: rksimon
Date: Sun Jun  3 07:11:34 2018
New Revision: 333841

URL: http://llvm.org/viewvc/llvm-project?rev=333841&view=rev
Log:
[X86][BMI1] Test i32 intrinsics on 32/64 bits + branch off i64 tests

Further refactoring will wait until D47452 has landed.

Part of ongoing work to ensure we run all intrinsic-style tests on both 32-bit and 64-bit targets where possible.
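
For reference, the "magic" bextr immediates in the checks below (0xC04, 0x2102, etc.) pack the extract parameters into a single control word: bits 7:0 hold the start bit and bits 15:8 hold the field length. A minimal C sketch of that encoding (function names are illustrative, not part of the patch; start/len are assumed in range):

    #include <stdint.h>

    /* BEXTR control word: start bit in bits 7:0, field length in bits 15:8. */
    static uint32_t bextr_control(uint32_t start, uint32_t len) {
        return (len << 8) | start;
    }

    /* Reference semantics: extract `len` bits beginning at bit `start`. */
    static uint64_t bextr64_ref(uint64_t x, uint32_t start, uint32_t len) {
        uint64_t mask = (len >= 64) ? ~0ULL : ((1ULL << len) - 1);
        return (x >> start) & mask;  /* assumes start < 64 */
    }

    /* bextr_control(4, 12) == 0xC04  -> (x >> 4) & 4095        (bextr32b/bextr64b)
       bextr_control(2, 33) == 0x2102 -> (x >> 2) & 8589934591  (bextr64d)         */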

Added:
    llvm/trunk/test/CodeGen/X86/bmi-x86_64.ll
Modified:
    llvm/trunk/test/CodeGen/X86/bmi.ll
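
The tests touched here all exercise simple bit-manipulation identities that the BMI instructions implement directly; a rough C summary of the patterns being matched (names are illustrative, not from the test files):

    #include <stdint.h>

    static uint32_t andn32(uint32_t x, uint32_t y) { return ~x & y; }  /* andnl */
    static uint32_t blsi32(uint32_t x)   { return x & -x; }       /* blsil: isolate lowest set bit      */
    static uint32_t blsmsk32(uint32_t x) { return x ^ (x - 1); }  /* blsmskl: mask up to lowest set bit */
    static uint32_t blsr32(uint32_t x)   { return x & (x - 1); }  /* blsrl: clear lowest set bit        */
    static uint32_t bzhi32(uint32_t x, uint32_t n) {              /* bzhil (BMI2): zero bits n and up   */
        return (n >= 32) ? x : (x & ((1U << n) - 1));
    }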

Added: llvm/trunk/test/CodeGen/X86/bmi-x86_64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bmi-x86_64.ll?rev=333841&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bmi-x86_64.ll (added)
+++ llvm/trunk/test/CodeGen/X86/bmi-x86_64.ll Sun Jun  3 07:11:34 2018
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,BMI1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,BMI2
+
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
+
+define i64 @bextr64(i64 %x, i64 %y)   {
+; CHECK-LABEL: bextr64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bextrq %rsi, %rdi, %rax
+; CHECK-NEXT:    retq
+  %tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
+  ret i64 %tmp
+}
+
+define i64 @bextr64b(i64 %x)  uwtable  ssp {
+; CHECK-LABEL: bextr64b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT:    bextrl %eax, %edi, %eax
+; CHECK-NEXT:    retq
+  %1 = lshr i64 %x, 4
+  %2 = and i64 %1, 4095
+  ret i64 %2
+}
+
+; Make sure we still use the AH subreg trick to extract 15:8
+define i64 @bextr64_subreg(i64 %x)  uwtable  ssp {
+; CHECK-LABEL: bextr64_subreg:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movzbl %ah, %eax
+; CHECK-NEXT:    retq
+  %1 = lshr i64 %x, 8
+  %2 = and i64 %1, 255
+  ret i64 %2
+}
+
+define i64 @bextr64b_load(i64* %x) {
+; CHECK-LABEL: bextr64b_load:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT:    bextrl %eax, (%rdi), %eax
+; CHECK-NEXT:    retq
+  %1 = load i64, i64* %x, align 8
+  %2 = lshr i64 %1, 4
+  %3 = and i64 %2, 4095
+  ret i64 %3
+}
+
+; PR34042
+define i64 @bextr64c(i64 %x, i32 %y) {
+; CHECK-LABEL: bextr64c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movslq %esi, %rax
+; CHECK-NEXT:    bextrq %rax, %rdi, %rax
+; CHECK-NEXT:    retq
+  %tmp0 = sext i32 %y to i64
+  %tmp1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %tmp0)
+  ret i64 %tmp1
+}
+
+define i64 @bextr64d(i64 %a) {
+; CHECK-LABEL: bextr64d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl $8450, %eax # imm = 0x2102
+; CHECK-NEXT:    bextrq %rax, %rdi, %rax
+; CHECK-NEXT:    retq
+entry:
+  %shr = lshr i64 %a, 2
+  %and = and i64 %shr, 8589934591
+  ret i64 %and
+}
+
+define i64 @non_bextr64(i64 %x) {
+; CHECK-LABEL: non_bextr64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    shrq $2, %rdi
+; CHECK-NEXT:    movabsq $8589934590, %rax # imm = 0x1FFFFFFFE
+; CHECK-NEXT:    andq %rdi, %rax
+; CHECK-NEXT:    retq
+entry:
+  %shr = lshr i64 %x, 2
+  %and = and i64 %shr, 8589934590
+  ret i64 %and
+}

Modified: llvm/trunk/test/CodeGen/X86/bmi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bmi.ll?rev=333841&r1=333840&r2=333841&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bmi.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bmi.ll Sun Jun  3 07:11:34 2018
@@ -1,22 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=BMI1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=BMI2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X86,BMI1,X86-BMI1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X86,BMI2,X86-BMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X64,BMI1,X64-BMI1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X64,BMI2,X64-BMI2
 
 define i32 @andn32(i32 %x, i32 %y)   {
-; CHECK-LABEL: andn32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl %esi, %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: andn32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andn32:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl %esi, %edi, %eax
+; X64-NEXT:    retq
   %tmp1 = xor i32 %x, -1
   %tmp2 = and i32 %y, %tmp1
   ret i32 %tmp2
 }
 
 define i32 @andn32_load(i32 %x, i32* %y)   {
-; CHECK-LABEL: andn32_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl (%rsi), %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: andn32_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andnl (%eax), %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: andn32_load:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl (%rsi), %edi, %eax
+; X64-NEXT:    retq
   %y1 = load i32, i32* %y
   %tmp1 = xor i32 %x, -1
   %tmp2 = and i32 %y1, %tmp1
@@ -24,10 +39,18 @@ define i32 @andn32_load(i32 %x, i32* %y)
 }
 
 define i64 @andn64(i64 %x, i64 %y)   {
-; CHECK-LABEL: andn64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnq %rsi, %rdi, %rax
-; CHECK-NEXT:    retq
+; X86-LABEL: andn64:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: andn64:
+; X64:       # %bb.0:
+; X64-NEXT:    andnq %rsi, %rdi, %rax
+; X64-NEXT:    retq
   %tmp1 = xor i64 %x, -1
   %tmp2 = and i64 %tmp1, %y
   ret i64 %tmp2
@@ -35,11 +58,18 @@ define i64 @andn64(i64 %x, i64 %y)   {
 
 ; Don't choose a 'test' if an 'andn' can be used.
 define i1 @andn_cmp(i32 %x, i32 %y) {
-; CHECK-LABEL: andn_cmp:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl %esi, %edi, %eax
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; X86-LABEL: andn_cmp:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: andn_cmp:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl %esi, %edi, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %notx = xor i32 %x, -1
   %and = and i32 %notx, %y
   %cmp = icmp eq i32 %and, 0
@@ -48,44 +78,72 @@ define i1 @andn_cmp(i32 %x, i32 %y) {
 
 ; Recognize a disguised andn in the following 4 tests.
 define i1 @and_cmp1(i32 %x, i32 %y) {
-; CHECK-LABEL: and_cmp1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl %esi, %edi, %eax
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp1:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp1:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl %esi, %edi, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %and = and i32 %x, %y
   %cmp = icmp eq i32 %and, %y
   ret i1 %cmp
 }
 
 define i1 @and_cmp2(i32 %x, i32 %y) {
-; CHECK-LABEL: and_cmp2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl %esi, %edi, %eax
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp2:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl %esi, %edi, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
   %and = and i32 %y, %x
   %cmp = icmp ne i32 %and, %y
   ret i1 %cmp
 }
 
 define i1 @and_cmp3(i32 %x, i32 %y) {
-; CHECK-LABEL: and_cmp3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl %esi, %edi, %eax
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp3:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp3:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl %esi, %edi, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %and = and i32 %x, %y
   %cmp = icmp eq i32 %y, %and
   ret i1 %cmp
 }
 
 define i1 @and_cmp4(i32 %x, i32 %y) {
-; CHECK-LABEL: and_cmp4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnl %esi, %edi, %eax
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp4:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp4:
+; X64:       # %bb.0:
+; X64-NEXT:    andnl %esi, %edi, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
   %and = and i32 %y, %x
   %cmp = icmp ne i32 %y, %and
   ret i1 %cmp
@@ -94,12 +152,20 @@ define i1 @and_cmp4(i32 %x, i32 %y) {
 ; A mask and compare against constant is ok for an 'andn' too
 ; even though the BMI instruction doesn't have an immediate form.
 define i1 @and_cmp_const(i32 %x) {
-; CHECK-LABEL: and_cmp_const:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    notl %edi
-; CHECK-NEXT:    andl $43, %edi
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    andl $43, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp_const:
+; X64:       # %bb.0:
+; X64-NEXT:    notl %edi
+; X64-NEXT:    andl $43, %edi
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %and = and i32 %x, 43
   %cmp = icmp eq i32 %and, 43
   ret i1 %cmp
@@ -107,11 +173,19 @@ define i1 @and_cmp_const(i32 %x) {
 
 ; But don't use 'andn' if the mask is a power-of-two.
 define i1 @and_cmp_const_power_of_two(i32 %x, i32 %y) {
-; CHECK-LABEL: and_cmp_const_power_of_two:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    btl %esi, %edi
-; CHECK-NEXT:    setae %al
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp_const_power_of_two:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp_const_power_of_two:
+; X64:       # %bb.0:
+; X64-NEXT:    btl %esi, %edi
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
   %shl = shl i32 1, %y
   %and = and i32 %x, %shl
   %cmp = icmp ne i32 %and, %shl
@@ -120,14 +194,24 @@ define i1 @and_cmp_const_power_of_two(i3
 
 ; Don't transform to 'andn' if there's another use of the 'and'.
 define i32 @and_cmp_not_one_use(i32 %x) {
-; CHECK-LABEL: and_cmp_not_one_use:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $37, %edi
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    cmpl $37, %edi
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    addl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: and_cmp_not_one_use:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $37, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $37, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_cmp_not_one_use:
+; X64:       # %bb.0:
+; X64-NEXT:    andl $37, %edi
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $37, %edi
+; X64-NEXT:    sete %al
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    retq
   %and = and i32 %x, 37
   %cmp = icmp eq i32 %and, 37
   %ext = zext i1 %cmp to i32
@@ -137,24 +221,42 @@ define i32 @and_cmp_not_one_use(i32 %x)
 
 ; Verify that we're not transforming invalid comparison predicates.
 define i1 @not_an_andn1(i32 %x, i32 %y) {
-; CHECK-LABEL: not_an_andn1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    cmpl %edi, %esi
-; CHECK-NEXT:    setg %al
-; CHECK-NEXT:    retq
+; X86-LABEL: not_an_andn1:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    setg %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_an_andn1:
+; X64:       # %bb.0:
+; X64-NEXT:    andl %esi, %edi
+; X64-NEXT:    cmpl %edi, %esi
+; X64-NEXT:    setg %al
+; X64-NEXT:    retq
   %and = and i32 %x, %y
   %cmp = icmp sgt i32 %y, %and
   ret i1 %cmp
 }
 
 define i1 @not_an_andn2(i32 %x, i32 %y) {
-; CHECK-LABEL: not_an_andn2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    cmpl %edi, %esi
-; CHECK-NEXT:    setbe %al
-; CHECK-NEXT:    retq
+; X86-LABEL: not_an_andn2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    setbe %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_an_andn2:
+; X64:       # %bb.0:
+; X64-NEXT:    andl %esi, %edi
+; X64-NEXT:    cmpl %edi, %esi
+; X64-NEXT:    setbe %al
+; X64-NEXT:    retq
   %and = and i32 %y, %x
   %cmp = icmp ule i32 %y, %and
   ret i1 %cmp
@@ -162,11 +264,21 @@ define i1 @not_an_andn2(i32 %x, i32 %y)
 
 ; Don't choose a 'test' if an 'andn' can be used.
 define i1 @andn_cmp_swap_ops(i64 %x, i64 %y) {
-; CHECK-LABEL: andn_cmp_swap_ops:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andnq %rsi, %rdi, %rax
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; X86-LABEL: andn_cmp_swap_ops:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
+; X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: andn_cmp_swap_ops:
+; X64:       # %bb.0:
+; X64-NEXT:    andnq %rsi, %rdi, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %notx = xor i64 %x, -1
   %and = and i64 %y, %notx
   %cmp = icmp eq i64 %and, 0
@@ -175,45 +287,72 @@ define i1 @andn_cmp_swap_ops(i64 %x, i64
 
 ; Use a 'test' (not an 'and') because 'andn' only works for i32/i64.
 define i1 @andn_cmp_i8(i8 %x, i8 %y) {
-; CHECK-LABEL: andn_cmp_i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    notb %sil
-; CHECK-NEXT:    testb %sil, %dil
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; X86-LABEL: andn_cmp_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    notb %al
+; X86-NEXT:    testb %al, {{[0-9]+}}(%esp)
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: andn_cmp_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    notb %sil
+; X64-NEXT:    testb %sil, %dil
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %noty = xor i8 %y, -1
   %and = and i8 %x, %noty
   %cmp = icmp eq i8 %and, 0
   ret i1 %cmp
 }
 
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
+
 define i32 @bextr32(i32 %x, i32 %y)   {
-; CHECK-LABEL: bextr32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    bextrl %esi, %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: bextr32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bextr32:
+; X64:       # %bb.0:
+; X64-NEXT:    bextrl %esi, %edi, %eax
+; X64-NEXT:    retq
   %tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %y)
   ret i32 %tmp
 }
 
 define i32 @bextr32_load(i32* %x, i32 %y)   {
-; CHECK-LABEL: bextr32_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    bextrl %esi, (%rdi), %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: bextr32_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bextrl %eax, (%ecx), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bextr32_load:
+; X64:       # %bb.0:
+; X64-NEXT:    bextrl %esi, (%rdi), %eax
+; X64-NEXT:    retq
   %x1 = load i32, i32* %x
   %tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x1, i32 %y)
   ret i32 %tmp
 }
 
-declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
-
 define i32 @bextr32b(i32 %x)  uwtable  ssp {
-; CHECK-LABEL: bextr32b:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $3076, %eax # imm = 0xC04
-; CHECK-NEXT:    bextrl %eax, %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: bextr32b:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $3076, %eax # imm = 0xC04
+; X86-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bextr32b:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $3076, %eax # imm = 0xC04
+; X64-NEXT:    bextrl %eax, %edi, %eax
+; X64-NEXT:    retq
   %1 = lshr i32 %x, 4
   %2 = and i32 %1, 4095
   ret i32 %2
@@ -221,22 +360,34 @@ define i32 @bextr32b(i32 %x)  uwtable  s
 
 ; Make sure we still use AH subreg trick to extract 15:8
 define i32 @bextr32_subreg(i32 %x)  uwtable  ssp {
-; CHECK-LABEL: bextr32_subreg:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movzbl %ah, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: bextr32_subreg:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bextr32_subreg:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movzbl %ah, %eax
+; X64-NEXT:    retq
   %1 = lshr i32 %x, 8
   %2 = and i32 %1, 255
   ret i32 %2
 }
 
 define i32 @bextr32b_load(i32* %x)  uwtable  ssp {
-; CHECK-LABEL: bextr32b_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $3076, %eax # imm = 0xC04
-; CHECK-NEXT:    bextrl %eax, (%rdi), %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: bextr32b_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $3076, %ecx # imm = 0xC04
+; X86-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bextr32b_load:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $3076, %eax # imm = 0xC04
+; X64-NEXT:    bextrl %eax, (%rdi), %eax
+; X64-NEXT:    retq
   %1 = load i32, i32* %x
   %2 = lshr i32 %1, 4
   %3 = and i32 %2, 4095
@@ -245,126 +396,71 @@ define i32 @bextr32b_load(i32* %x)  uwta
 
 ; PR34042
 define i32 @bextr32c(i32 %x, i16 zeroext %y) {
-; CHECK-LABEL: bextr32c:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movswl %si, %eax
-; CHECK-NEXT:    bextrl %eax, %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: bextr32c:
+; X86:       # %bb.0:
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bextr32c:
+; X64:       # %bb.0:
+; X64-NEXT:    movswl %si, %eax
+; X64-NEXT:    bextrl %eax, %edi, %eax
+; X64-NEXT:    retq
   %tmp0 = sext i16 %y to i32
   %tmp1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %tmp0)
   ret i32 %tmp1
 }
 
-define i64 @bextr64(i64 %x, i64 %y)   {
-; CHECK-LABEL: bextr64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    bextrq %rsi, %rdi, %rax
-; CHECK-NEXT:    retq
-  %tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
-  ret i64 %tmp
-}
-
-declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
-
-define i64 @bextr64b(i64 %x)  uwtable  ssp {
-; CHECK-LABEL: bextr64b:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $3076, %eax # imm = 0xC04
-; CHECK-NEXT:    bextrl %eax, %edi, %eax
-; CHECK-NEXT:    retq
-  %1 = lshr i64 %x, 4
-  %2 = and i64 %1, 4095
-  ret i64 %2
-}
-
-; Make sure we still use the AH subreg trick to extract 15:8
-define i64 @bextr64_subreg(i64 %x)  uwtable  ssp {
-; CHECK-LABEL: bextr64_subreg:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movzbl %ah, %eax
-; CHECK-NEXT:    retq
-  %1 = lshr i64 %x, 8
-  %2 = and i64 %1, 255
-  ret i64 %2
-}
-
-define i64 @bextr64b_load(i64* %x) {
-; CHECK-LABEL: bextr64b_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $3076, %eax # imm = 0xC04
-; CHECK-NEXT:    bextrl %eax, (%rdi), %eax
-; CHECK-NEXT:    retq
-  %1 = load i64, i64* %x, align 8
-  %2 = lshr i64 %1, 4
-  %3 = and i64 %2, 4095
-  ret i64 %3
-}
-
-; PR34042
-define i64 @bextr64c(i64 %x, i32 %y) {
-; CHECK-LABEL: bextr64c:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movslq %esi, %rax
-; CHECK-NEXT:    bextrq %rax, %rdi, %rax
-; CHECK-NEXT:    retq
-  %tmp0 = sext i32 %y to i64
-  %tmp1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %tmp0)
-  ret i64 %tmp1
-}
-
-define i64 @bextr64d(i64 %a) {
-; CHECK-LABEL: bextr64d:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl $8450, %eax # imm = 0x2102
-; CHECK-NEXT:    bextrq %rax, %rdi, %rax
-; CHECK-NEXT:    retq
-entry:
-  %shr = lshr i64 %a, 2
-  %and = and i64 %shr, 8589934591
-  ret i64 %and
-}
-
 define i32 @non_bextr32(i32 %x) {
-; CHECK-LABEL: non_bextr32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shrl $2, %edi
-; CHECK-NEXT:    andl $111, %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: non_bextr32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $111, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: non_bextr32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    andl $111, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
 entry:
   %shr = lshr i32 %x, 2
   %and = and i32 %shr, 111
   ret i32 %and
 }
 
-define i64 @non_bextr64(i64 %x) {
-; CHECK-LABEL: non_bextr64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shrq $2, %rdi
-; CHECK-NEXT:    movabsq $8589934590, %rax # imm = 0x1FFFFFFFE
-; CHECK-NEXT:    andq %rdi, %rax
-; CHECK-NEXT:    retq
-entry:
-  %shr = lshr i64 %x, 2
-  %and = and i64 %shr, 8589934590
-  ret i64 %and
-}
-
 define i32 @bzhi32b(i32 %x, i8 zeroext %index) {
-; BMI1-LABEL: bzhi32b:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $1, %eax
-; BMI1-NEXT:    movl %esi, %ecx
-; BMI1-NEXT:    shll %cl, %eax
-; BMI1-NEXT:    decl %eax
-; BMI1-NEXT:    andl %edi, %eax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi32b:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhil %esi, %edi, %eax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi32b:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    shll %cl, %eax
+; X86-BMI1-NEXT:    decl %eax
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi32b:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi32b:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $1, %eax
+; X64-BMI1-NEXT:    movl %esi, %ecx
+; X64-BMI1-NEXT:    shll %cl, %eax
+; X64-BMI1-NEXT:    decl %eax
+; X64-BMI1-NEXT:    andl %edi, %eax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi32b:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI2-NEXT:    retq
 entry:
   %conv = zext i8 %index to i32
   %shl = shl i32 1, %conv
@@ -374,19 +470,36 @@ entry:
 }
 
 define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) {
-; BMI1-LABEL: bzhi32b_load:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $1, %eax
-; BMI1-NEXT:    movl %esi, %ecx
-; BMI1-NEXT:    shll %cl, %eax
-; BMI1-NEXT:    decl %eax
-; BMI1-NEXT:    andl (%rdi), %eax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi32b_load:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhil %esi, (%rdi), %eax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi32b_load:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    shll %cl, %eax
+; X86-BMI1-NEXT:    decl %eax
+; X86-BMI1-NEXT:    andl (%edx), %eax
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi32b_load:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi32b_load:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $1, %eax
+; X64-BMI1-NEXT:    movl %esi, %ecx
+; X64-BMI1-NEXT:    shll %cl, %eax
+; X64-BMI1-NEXT:    decl %eax
+; X64-BMI1-NEXT:    andl (%rdi), %eax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi32b_load:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhil %esi, (%rdi), %eax
+; X64-BMI2-NEXT:    retq
 entry:
   %x = load i32, i32* %w
   %conv = zext i8 %index to i32
@@ -397,19 +510,34 @@ entry:
 }
 
 define i32 @bzhi32c(i32 %x, i8 zeroext %index) {
-; BMI1-LABEL: bzhi32c:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $1, %eax
-; BMI1-NEXT:    movl %esi, %ecx
-; BMI1-NEXT:    shll %cl, %eax
-; BMI1-NEXT:    decl %eax
-; BMI1-NEXT:    andl %edi, %eax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi32c:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhil %esi, %edi, %eax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi32c:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    shll %cl, %eax
+; X86-BMI1-NEXT:    decl %eax
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi32c:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi32c:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $1, %eax
+; X64-BMI1-NEXT:    movl %esi, %ecx
+; X64-BMI1-NEXT:    shll %cl, %eax
+; X64-BMI1-NEXT:    decl %eax
+; X64-BMI1-NEXT:    andl %edi, %eax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi32c:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI2-NEXT:    retq
 entry:
   %conv = zext i8 %index to i32
   %shl = shl i32 1, %conv
@@ -419,20 +547,36 @@ entry:
 }
 
 define i32 @bzhi32d(i32 %a, i32 %b) {
-; BMI1-LABEL: bzhi32d:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $32, %ecx
-; BMI1-NEXT:    subl %esi, %ecx
-; BMI1-NEXT:    movl $-1, %eax
-; BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BMI1-NEXT:    shrl %cl, %eax
-; BMI1-NEXT:    andl %edi, %eax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi32d:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhil %esi, %edi, %eax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi32d:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movl $32, %ecx
+; X86-BMI1-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi32d:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi32d:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $32, %ecx
+; X64-BMI1-NEXT:    subl %esi, %ecx
+; X64-BMI1-NEXT:    movl $-1, %eax
+; X64-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1-NEXT:    shrl %cl, %eax
+; X64-BMI1-NEXT:    andl %edi, %eax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi32d:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI2-NEXT:    retq
 entry:
   %sub = sub i32 32, %b
   %shr = lshr i32 -1, %sub
@@ -441,20 +585,36 @@ entry:
 }
 
 define i32 @bzhi32e(i32 %a, i32 %b) {
-; BMI1-LABEL: bzhi32e:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $32, %ecx
-; BMI1-NEXT:    subl %esi, %ecx
-; BMI1-NEXT:    shll %cl, %edi
-; BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BMI1-NEXT:    shrl %cl, %edi
-; BMI1-NEXT:    movl %edi, %eax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi32e:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhil %esi, %edi, %eax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi32e:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl $32, %ecx
+; X86-BMI1-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll %cl, %eax
+; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi32e:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi32e:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $32, %ecx
+; X64-BMI1-NEXT:    subl %esi, %ecx
+; X64-BMI1-NEXT:    shll %cl, %edi
+; X64-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1-NEXT:    shrl %cl, %edi
+; X64-BMI1-NEXT:    movl %edi, %eax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi32e:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI2-NEXT:    retq
 entry:
   %sub = sub i32 32, %b
   %shl = shl i32 %a, %sub
@@ -463,20 +623,58 @@ entry:
 }
 
 define i64 @bzhi64b(i64 %x, i8 zeroext %index) {
-; BMI1-LABEL: bzhi64b:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $1, %eax
-; BMI1-NEXT:    movl %esi, %ecx
-; BMI1-NEXT:    shlq %cl, %rax
-; BMI1-NEXT:    decq %rax
-; BMI1-NEXT:    andq %rdi, %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64b:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi64b:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1-NEXT:    shll %cl, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB27_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl %eax, %edx
+; X86-BMI1-NEXT:    xorl %eax, %eax
+; X86-BMI1-NEXT:  .LBB27_2: # %entry
+; X86-BMI1-NEXT:    addl $-1, %eax
+; X86-BMI1-NEXT:    adcl $-1, %edx
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi64b:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl $1, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB27_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    xorl %eax, %eax
+; X86-BMI2-NEXT:  .LBB27_2: # %entry
+; X86-BMI2-NEXT:    addl $-1, %eax
+; X86-BMI2-NEXT:    adcl $-1, %edx
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64b:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $1, %eax
+; X64-BMI1-NEXT:    movl %esi, %ecx
+; X64-BMI1-NEXT:    shlq %cl, %rax
+; X64-BMI1-NEXT:    decq %rax
+; X64-BMI1-NEXT:    andq %rdi, %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64b:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %conv = zext i8 %index to i64
   %shl = shl i64 1, %conv
@@ -486,20 +684,55 @@ entry:
 }
 
 define i64 @bzhi64c(i64 %a, i64 %b) {
-; BMI1-LABEL: bzhi64c:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $64, %ecx
-; BMI1-NEXT:    subl %esi, %ecx
-; BMI1-NEXT:    movq $-1, %rax
-; BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BMI1-NEXT:    shrq %cl, %rax
-; BMI1-NEXT:    andq %rdi, %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64c:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi64c:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movl $64, %ecx
+; X86-BMI1-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    movl $-1, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB28_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB28_2: # %entry
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi64c:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movl $64, %ecx
+; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB28_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB28_2: # %entry
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64c:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $64, %ecx
+; X64-BMI1-NEXT:    subl %esi, %ecx
+; X64-BMI1-NEXT:    movq $-1, %rax
+; X64-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1-NEXT:    shrq %cl, %rax
+; X64-BMI1-NEXT:    andq %rdi, %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64c:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %sub = sub i64 64, %b
   %shr = lshr i64 -1, %sub
@@ -508,21 +741,56 @@ entry:
 }
 
 define i64 @bzhi64d(i64 %a, i32 %b) {
-; BMI1-LABEL: bzhi64d:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $64, %ecx
-; BMI1-NEXT:    subl %esi, %ecx
-; BMI1-NEXT:    movq $-1, %rax
-; BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BMI1-NEXT:    shrq %cl, %rax
-; BMI1-NEXT:    andq %rdi, %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64d:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi64d:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    movl $64, %ecx
+; X86-BMI1-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    movl $-1, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB29_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB29_2: # %entry
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi64d:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    movl $64, %ecx
+; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB29_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB29_2: # %entry
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64d:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $64, %ecx
+; X64-BMI1-NEXT:    subl %esi, %ecx
+; X64-BMI1-NEXT:    movq $-1, %rax
+; X64-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1-NEXT:    shrq %cl, %rax
+; X64-BMI1-NEXT:    andq %rdi, %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64d:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %sub = sub i32 64, %b
   %sh_prom = zext i32 %sub to i64
@@ -532,20 +800,106 @@ entry:
 }
 
 define i64 @bzhi64e(i64 %a, i64 %b) {
-; BMI1-LABEL: bzhi64e:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $64, %ecx
-; BMI1-NEXT:    subl %esi, %ecx
-; BMI1-NEXT:    shlq %cl, %rdi
-; BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BMI1-NEXT:    shrq %cl, %rdi
-; BMI1-NEXT:    movq %rdi, %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64e:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi64e:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    pushl %ebx
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI1-NEXT:    pushl %edi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 12
+; X86-BMI1-NEXT:    pushl %esi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 16
+; X86-BMI1-NEXT:    .cfi_offset %esi, -16
+; X86-BMI1-NEXT:    .cfi_offset %edi, -12
+; X86-BMI1-NEXT:    .cfi_offset %ebx, -8
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl $64, %ecx
+; X86-BMI1-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
+; X86-BMI1-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    jne .LBB30_2
+; X86-BMI1-NEXT:  # %bb.1: # %entry
+; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:  .LBB30_2: # %entry
+; X86-BMI1-NEXT:    movl %edi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    movl $0, %edx
+; X86-BMI1-NEXT:    jne .LBB30_4
+; X86-BMI1-NEXT:  # %bb.3: # %entry
+; X86-BMI1-NEXT:    movl %esi, %ebx
+; X86-BMI1-NEXT:    movl %eax, %edx
+; X86-BMI1-NEXT:  .LBB30_4: # %entry
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    jne .LBB30_6
+; X86-BMI1-NEXT:  # %bb.5: # %entry
+; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:  .LBB30_6: # %entry
+; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 12
+; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI1-NEXT:    popl %ebx
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 4
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi64e:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    pushl %edi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI2-NEXT:    pushl %esi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 12
+; X86-BMI2-NEXT:    .cfi_offset %esi, -12
+; X86-BMI2-NEXT:    .cfi_offset %edi, -8
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl $64, %ecx
+; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB30_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:  .LBB30_2: # %entry
+; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    jne .LBB30_4
+; X86-BMI2-NEXT:  # %bb.3: # %entry
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:  .LBB30_4: # %entry
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    jne .LBB30_6
+; X86-BMI2-NEXT:  # %bb.5: # %entry
+; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:  .LBB30_6: # %entry
+; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 4
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64e:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $64, %ecx
+; X64-BMI1-NEXT:    subl %esi, %ecx
+; X64-BMI1-NEXT:    shlq %cl, %rdi
+; X64-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1-NEXT:    shrq %cl, %rdi
+; X64-BMI1-NEXT:    movq %rdi, %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64e:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %sub = sub i64 64, %b
   %shl = shl i64 %a, %sub
@@ -554,21 +908,107 @@ entry:
 }
 
 define i64 @bzhi64f(i64 %a, i32 %b) {
-; BMI1-LABEL: bzhi64f:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $64, %ecx
-; BMI1-NEXT:    subl %esi, %ecx
-; BMI1-NEXT:    shlq %cl, %rdi
-; BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BMI1-NEXT:    shrq %cl, %rdi
-; BMI1-NEXT:    movq %rdi, %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64f:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
-; BMI2-NEXT:    retq
+; X86-BMI1-LABEL: bzhi64f:
+; X86-BMI1:       # %bb.0: # %entry
+; X86-BMI1-NEXT:    pushl %ebx
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI1-NEXT:    pushl %edi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 12
+; X86-BMI1-NEXT:    pushl %esi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 16
+; X86-BMI1-NEXT:    .cfi_offset %esi, -16
+; X86-BMI1-NEXT:    .cfi_offset %edi, -12
+; X86-BMI1-NEXT:    .cfi_offset %ebx, -8
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl $64, %ecx
+; X86-BMI1-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
+; X86-BMI1-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    jne .LBB31_2
+; X86-BMI1-NEXT:  # %bb.1: # %entry
+; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:  .LBB31_2: # %entry
+; X86-BMI1-NEXT:    movl %edi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    movl $0, %edx
+; X86-BMI1-NEXT:    jne .LBB31_4
+; X86-BMI1-NEXT:  # %bb.3: # %entry
+; X86-BMI1-NEXT:    movl %esi, %ebx
+; X86-BMI1-NEXT:    movl %eax, %edx
+; X86-BMI1-NEXT:  .LBB31_4: # %entry
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    jne .LBB31_6
+; X86-BMI1-NEXT:  # %bb.5: # %entry
+; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:  .LBB31_6: # %entry
+; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 12
+; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI1-NEXT:    popl %ebx
+; X86-BMI1-NEXT:    .cfi_def_cfa_offset 4
+; X86-BMI1-NEXT:    retl
+;
+; X86-BMI2-LABEL: bzhi64f:
+; X86-BMI2:       # %bb.0: # %entry
+; X86-BMI2-NEXT:    pushl %edi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI2-NEXT:    pushl %esi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 12
+; X86-BMI2-NEXT:    .cfi_offset %esi, -12
+; X86-BMI2-NEXT:    .cfi_offset %edi, -8
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl $64, %ecx
+; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB31_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:  .LBB31_2: # %entry
+; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    jne .LBB31_4
+; X86-BMI2-NEXT:  # %bb.3: # %entry
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:  .LBB31_4: # %entry
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    jne .LBB31_6
+; X86-BMI2-NEXT:  # %bb.5: # %entry
+; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:  .LBB31_6: # %entry
+; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 8
+; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    .cfi_def_cfa_offset 4
+; X86-BMI2-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64f:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $64, %ecx
+; X64-BMI1-NEXT:    subl %esi, %ecx
+; X64-BMI1-NEXT:    shlq %cl, %rdi
+; X64-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1-NEXT:    shrq %cl, %rdi
+; X64-BMI1-NEXT:    movq %rdi, %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64f:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %sub = sub i32 64, %b
   %sh_prom = zext i32 %sub to i64
@@ -578,34 +1018,49 @@ entry:
 }
 
 define i64 @bzhi64_constant_mask(i64 %x) {
-; BMI1-LABEL: bzhi64_constant_mask:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $15872, %eax # imm = 0x3E00
-; BMI1-NEXT:    bextrq %rax, %rdi, %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64_constant_mask:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    movb $62, %al
-; BMI2-NEXT:    bzhiq %rax, %rdi, %rax
-; BMI2-NEXT:    retq
+; X86-LABEL: bzhi64_constant_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $1073741823, %edx # imm = 0x3FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64_constant_mask:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $15872, %eax # imm = 0x3E00
+; X64-BMI1-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64_constant_mask:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    movb $62, %al
+; X64-BMI2-NEXT:    bzhiq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %and = and i64 %x, 4611686018427387903
   ret i64 %and
 }
 
 define i64 @bzhi64_constant_mask_load(i64* %x) {
-; BMI1-LABEL: bzhi64_constant_mask_load:
-; BMI1:       # %bb.0: # %entry
-; BMI1-NEXT:    movl $15872, %eax # imm = 0x3E00
-; BMI1-NEXT:    bextrq %rax, (%rdi), %rax
-; BMI1-NEXT:    retq
-;
-; BMI2-LABEL: bzhi64_constant_mask_load:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    movb $62, %al
-; BMI2-NEXT:    bzhiq %rax, (%rdi), %rax
-; BMI2-NEXT:    retq
+; X86-LABEL: bzhi64_constant_mask_load:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl $1073741823, %edx # imm = 0x3FFFFFFF
+; X86-NEXT:    andl 4(%ecx), %edx
+; X86-NEXT:    retl
+;
+; X64-BMI1-LABEL: bzhi64_constant_mask_load:
+; X64-BMI1:       # %bb.0: # %entry
+; X64-BMI1-NEXT:    movl $15872, %eax # imm = 0x3E00
+; X64-BMI1-NEXT:    bextrq %rax, (%rdi), %rax
+; X64-BMI1-NEXT:    retq
+;
+; X64-BMI2-LABEL: bzhi64_constant_mask_load:
+; X64-BMI2:       # %bb.0: # %entry
+; X64-BMI2-NEXT:    movb $62, %al
+; X64-BMI2-NEXT:    bzhiq %rax, (%rdi), %rax
+; X64-BMI2-NEXT:    retq
 entry:
   %x1 = load i64, i64* %x
   %and = and i64 %x1, 4611686018427387903
@@ -613,31 +1068,49 @@ entry:
 }
 
 define i64 @bzhi64_small_constant_mask(i64 %x) {
-; CHECK-LABEL: bzhi64_small_constant_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    retq
+; X86-LABEL: bzhi64_small_constant_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: bzhi64_small_constant_mask:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    retq
 entry:
   %and = and i64 %x, 2147483647
   ret i64 %and
 }
 
 define i32 @blsi32(i32 %x)   {
-; CHECK-LABEL: blsi32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsil %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsi32:
+; X86:       # %bb.0:
+; X86-NEXT:    blsil {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi32:
+; X64:       # %bb.0:
+; X64-NEXT:    blsil %edi, %eax
+; X64-NEXT:    retq
   %tmp = sub i32 0, %x
   %tmp2 = and i32 %x, %tmp
   ret i32 %tmp2
 }
 
 define i32 @blsi32_load(i32* %x)   {
-; CHECK-LABEL: blsi32_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsil (%rdi), %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsi32_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    blsil (%eax), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi32_load:
+; X64:       # %bb.0:
+; X64-NEXT:    blsil (%rdi), %eax
+; X64-NEXT:    retq
   %x1 = load i32, i32* %x
   %tmp = sub i32 0, %x1
   %tmp2 = and i32 %x1, %tmp
@@ -645,30 +1118,58 @@ define i32 @blsi32_load(i32* %x)   {
 }
 
 define i64 @blsi64(i64 %x)   {
-; CHECK-LABEL: blsi64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsiq %rdi, %rax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsi64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsi64:
+; X64:       # %bb.0:
+; X64-NEXT:    blsiq %rdi, %rax
+; X64-NEXT:    retq
   %tmp = sub i64 0, %x
   %tmp2 = and i64 %tmp, %x
   ret i64 %tmp2
 }
 
 define i32 @blsmsk32(i32 %x)   {
-; CHECK-LABEL: blsmsk32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsmskl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsmsk32:
+; X86:       # %bb.0:
+; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk32:
+; X64:       # %bb.0:
+; X64-NEXT:    blsmskl %edi, %eax
+; X64-NEXT:    retq
   %tmp = sub i32 %x, 1
   %tmp2 = xor i32 %x, %tmp
   ret i32 %tmp2
 }
 
 define i32 @blsmsk32_load(i32* %x)   {
-; CHECK-LABEL: blsmsk32_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsmskl (%rdi), %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsmsk32_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    blsmskl (%eax), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk32_load:
+; X64:       # %bb.0:
+; X64-NEXT:    blsmskl (%rdi), %eax
+; X64-NEXT:    retq
   %x1 = load i32, i32* %x
   %tmp = sub i32 %x1, 1
   %tmp2 = xor i32 %x1, %tmp
@@ -676,30 +1177,58 @@ define i32 @blsmsk32_load(i32* %x)   {
 }
 
 define i64 @blsmsk64(i64 %x)   {
-; CHECK-LABEL: blsmsk64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsmskq %rdi, %rax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsmsk64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsmsk64:
+; X64:       # %bb.0:
+; X64-NEXT:    blsmskq %rdi, %rax
+; X64-NEXT:    retq
   %tmp = sub i64 %x, 1
   %tmp2 = xor i64 %tmp, %x
   ret i64 %tmp2
 }
 
 define i32 @blsr32(i32 %x)   {
-; CHECK-LABEL: blsr32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsrl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsr32:
+; X86:       # %bb.0:
+; X86-NEXT:    blsrl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr32:
+; X64:       # %bb.0:
+; X64-NEXT:    blsrl %edi, %eax
+; X64-NEXT:    retq
   %tmp = sub i32 %x, 1
   %tmp2 = and i32 %x, %tmp
   ret i32 %tmp2
 }
 
 define i32 @blsr32_load(i32* %x)   {
-; CHECK-LABEL: blsr32_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsrl (%rdi), %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsr32_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    blsrl (%eax), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr32_load:
+; X64:       # %bb.0:
+; X64-NEXT:    blsrl (%rdi), %eax
+; X64-NEXT:    retq
   %x1 = load i32, i32* %x
   %tmp = sub i32 %x1, 1
   %tmp2 = and i32 %x1, %tmp
@@ -707,10 +1236,27 @@ define i32 @blsr32_load(i32* %x)   {
 }
 
 define i64 @blsr64(i64 %x)   {
-; CHECK-LABEL: blsr64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsrq %rdi, %rax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsr64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr64:
+; X64:       # %bb.0:
+; X64-NEXT:    blsrq %rdi, %rax
+; X64-NEXT:    retq
   %tmp = sub i64 %x, 1
   %tmp2 = and i64 %tmp, %x
   ret i64 %tmp2
@@ -719,11 +1265,18 @@ define i64 @blsr64(i64 %x)   {
 ; PR35792 - https://bugs.llvm.org/show_bug.cgi?id=35792
 
 define i64 @blsr_disguised_constant(i64 %x) {
-; CHECK-LABEL: blsr_disguised_constant:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blsrl %edi, %eax
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsr_disguised_constant:
+; X86:       # %bb.0:
+; X86-NEXT:    blsrl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr_disguised_constant:
+; X64:       # %bb.0:
+; X64-NEXT:    blsrl %edi, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
   %a1 = and i64 %x, 65535
   %a2 = add i64 %x, 65535
   %r = and i64 %a1, %a2
@@ -733,11 +1286,19 @@ define i64 @blsr_disguised_constant(i64
 ; The add here used to get shrunk, but the and did not thus hiding the blsr pattern.
 ; We now use the knowledge that upper bits of the shift guarantee the and result has 0s in the upper bits to reduce it too.
 define i64 @blsr_disguised_shrunk_add(i64 %x) {
-; CHECK-LABEL: blsr_disguised_shrunk_add:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $48, %rdi
-; CHECK-NEXT:    blsrl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: blsr_disguised_shrunk_add:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $16, %eax
+; X86-NEXT:    blsrl %eax, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: blsr_disguised_shrunk_add:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $48, %rdi
+; X64-NEXT:    blsrl %edi, %eax
+; X64-NEXT:    retq
   %a = lshr i64 %x, 48
   %b = add i64 %a, -1
   %c = and i64 %b, %a



