[llvm] 7f648d2 - Reland "[X86][MC] Always emit `rep` prefix for `bsf`"
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 4 19:26:33 PDT 2022
Author: Phoebe Wang
Date: 2022-08-05T10:22:48+08:00
New Revision: 7f648d27a85a98fa077f0968dea081821627d477
URL: https://github.com/llvm/llvm-project/commit/7f648d27a85a98fa077f0968dea081821627d477
DIFF: https://github.com/llvm/llvm-project/commit/7f648d27a85a98fa077f0968dea081821627d477.diff
LOG: Reland "[X86][MC] Always emit `rep` prefix for `bsf`"
`BMI` new instruction `tzcnt` has better performance than `bsf` on new
processors. Its encoding has a mandatory prefix '0xf3' compared to
`bsf`. If we force emit `rep` prefix for `bsf`, we will gain better
performance when the same code runs on new processors.
GCC has already done it this way: https://c.godbolt.org/z/6xere6fs1
Fixes #34191
Reviewed By: craig.topper, skan
Differential Revision: https://reviews.llvm.org/D130956
Added:
Modified:
llvm/lib/Target/X86/X86MCInstLower.cpp
llvm/test/CodeGen/X86/clz.ll
llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
llvm/test/CodeGen/X86/stack-folding-x86_64.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 76ddb3d9cf4ee..bfbded8c5d64c 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -982,6 +982,15 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
}
+ // Add an REP prefix to BSF instructions so that new processors can
+ // recognize as TZCNT, which has better performance than BSF.
+ if (X86::isBSF(OutMI.getOpcode()) && !MF.getFunction().hasOptSize()) {
+  // BSF and TZCNT have different interpretations on ZF bit. So make sure
+ // it won't be used later.
+ const MachineOperand *FlagDef = MI->findRegisterDefOperand(X86::EFLAGS);
+ if (FlagDef && FlagDef->isDead())
+ OutMI.setFlags(X86::IP_HAS_REPEAT);
+ }
break;
}
}
diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll
index 0a9f276fcc768..aeb14820f00ed 100644
--- a/llvm/test/CodeGen/X86/clz.ll
+++ b/llvm/test/CodeGen/X86/clz.ll
@@ -18,13 +18,13 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i8 @cttz_i8(i8 %x) {
; X86-LABEL: cttz_i8:
; X86: # %bb.0:
-; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i8:
; X64: # %bb.0:
-; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
@@ -46,13 +46,13 @@ define i8 @cttz_i8(i8 %x) {
define i16 @cttz_i16(i16 %x) {
; X86-LABEL: cttz_i16:
; X86: # %bb.0:
-; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i16:
; X64: # %bb.0:
-; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
@@ -74,12 +74,12 @@ define i16 @cttz_i16(i16 %x) {
define i32 @cttz_i32(i32 %x) {
; X86-LABEL: cttz_i32:
; X86: # %bb.0:
-; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i32:
; X64: # %bb.0:
-; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i32:
@@ -102,20 +102,20 @@ define i64 @cttz_i64(i64 %x) {
; X86-NOCMOV-NEXT: testl %eax, %eax
; X86-NOCMOV-NEXT: jne .LBB3_1
; X86-NOCMOV-NEXT: # %bb.2:
-; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: addl $32, %eax
; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB3_1:
-; X86-NOCMOV-NEXT: bsfl %eax, %eax
+; X86-NOCMOV-NEXT: rep bsfl %eax, %eax
; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: cttz_i64:
; X86-CMOV: # %bb.0:
; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT: bsfl %ecx, %edx
-; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT: rep bsfl %ecx, %edx
+; X86-CMOV-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax
; X86-CMOV-NEXT: addl $32, %eax
; X86-CMOV-NEXT: testl %ecx, %ecx
; X86-CMOV-NEXT: cmovnel %edx, %eax
@@ -124,7 +124,7 @@ define i64 @cttz_i64(i64 %x) {
;
; X64-LABEL: cttz_i64:
; X64: # %bb.0:
-; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: rep bsfq %rdi, %rax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i64:
@@ -519,7 +519,7 @@ define i8 @cttz_i8_zero_test(i8 %n) {
; X86-NEXT: je .LBB12_1
; X86-NEXT: # %bb.2: # %cond.false
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: bsfl %eax, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
; X86-NEXT: .LBB12_1:
@@ -533,7 +533,7 @@ define i8 @cttz_i8_zero_test(i8 %n) {
; X64-NEXT: je .LBB12_1
; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsfl %eax, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
; X64-NEXT: .LBB12_1:
@@ -567,7 +567,7 @@ define i16 @cttz_i16_zero_test(i16 %n) {
; X86-NEXT: testw %ax, %ax
; X86-NEXT: je .LBB13_1
; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsfl %eax, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
; X86-NEXT: .LBB13_1:
@@ -580,7 +580,7 @@ define i16 @cttz_i16_zero_test(i16 %n) {
; X64-NEXT: testw %di, %di
; X64-NEXT: je .LBB13_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
; X64-NEXT: .LBB13_1:
@@ -614,7 +614,7 @@ define i32 @cttz_i32_zero_test(i32 %n) {
; X86-NEXT: testl %eax, %eax
; X86-NEXT: je .LBB14_1
; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsfl %eax, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
; X86-NEXT: .LBB14_1:
; X86-NEXT: movl $32, %eax
@@ -625,7 +625,7 @@ define i32 @cttz_i32_zero_test(i32 %n) {
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB14_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB14_1:
; X64-NEXT: movl $32, %eax
@@ -649,6 +649,7 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; X86-NOCMOV-LABEL: cttz_i64_zero_test:
; X86-NOCMOV: # %bb.0:
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOCMOV-NOT: rep
; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %edx
; X86-NOCMOV-NEXT: movl $32, %eax
; X86-NOCMOV-NEXT: je .LBB15_2
@@ -662,17 +663,19 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB15_3:
-; X86-NOCMOV-NEXT: bsfl %ecx, %eax
+; X86-NOCMOV-NEXT: rep bsfl %ecx, %eax
; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: cttz_i64_zero_test:
; X86-CMOV: # %bb.0:
; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NOT: rep
; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx
; X86-CMOV-NEXT: movl $32, %edx
; X86-CMOV-NEXT: cmovnel %ecx, %edx
; X86-CMOV-NEXT: addl $32, %edx
+; X86-CMOV-NOT: rep
; X86-CMOV-NEXT: bsfl %eax, %eax
; X86-CMOV-NEXT: cmovel %edx, %eax
; X86-CMOV-NEXT: xorl %edx, %edx
@@ -683,7 +686,7 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: je .LBB15_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: rep bsfq %rdi, %rax
; X64-NEXT: retq
; X64-NEXT: .LBB15_1:
; X64-NEXT: movl $64, %eax
@@ -833,7 +836,7 @@ define i8 @cttz_i8_knownbits(i8 %x) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: orb $2, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: bsfl %eax, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
@@ -841,7 +844,7 @@ define i8 @cttz_i8_knownbits(i8 %x) {
; X64: # %bb.0:
; X64-NEXT: orb $2, %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsfl %eax, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
@@ -994,12 +997,12 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) {
; X86-NOCMOV-NEXT: # %bb.2:
; X86-NOCMOV-NEXT: movl $-2147483648, %eax # imm = 0x80000000
; X86-NOCMOV-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NOCMOV-NEXT: bsfl %eax, %eax
+; X86-NOCMOV-NEXT: rep bsfl %eax, %eax
; X86-NOCMOV-NEXT: orl $32, %eax
; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB22_1:
-; X86-NOCMOV-NEXT: bsfl %eax, %eax
+; X86-NOCMOV-NEXT: rep bsfl %eax, %eax
; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
;
@@ -1008,8 +1011,8 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) {
; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-CMOV-NEXT: movl $-2147483648, %eax # imm = 0x80000000
; X86-CMOV-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT: bsfl %ecx, %edx
-; X86-CMOV-NEXT: bsfl %eax, %eax
+; X86-CMOV-NEXT: rep bsfl %ecx, %edx
+; X86-CMOV-NEXT: rep bsfl %eax, %eax
; X86-CMOV-NEXT: orl $32, %eax
; X86-CMOV-NEXT: testl %ecx, %ecx
; X86-CMOV-NEXT: cmovnel %edx, %eax
@@ -1020,7 +1023,7 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) {
; X64: # %bb.0:
; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: bsfq %rax, %rax
+; X64-NEXT: rep bsfq %rax, %rax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i64_zero_test_knownneverzero:
@@ -1121,3 +1124,55 @@ define i32 @PR47603_zext(i32 %a0, ptr %a1) {
%sext = sext i8 %load to i32
ret i32 %sext
}
+
+define i32 @cttz_i32_osize(i32 %x) optsize {
+; X86-LABEL: cttz_i32_osize:
+; X86: # %bb.0:
+; X86-NOT: rep
+; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cttz_i32_osize:
+; X64: # %bb.0:
+; X64-NOT: rep
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: retq
+;
+; X86-CLZ-LABEL: cttz_i32_osize:
+; X86-CLZ: # %bb.0:
+; X86-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i32_osize:
+; X64-CLZ: # %bb.0:
+; X64-CLZ-NEXT: tzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
+ %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true)
+ ret i32 %tmp
+}
+
+define i32 @cttz_i32_msize(i32 %x) minsize {
+; X86-LABEL: cttz_i32_msize:
+; X86: # %bb.0:
+; X86-NOT: rep
+; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cttz_i32_msize:
+; X64: # %bb.0:
+; X64-NOT: rep
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: retq
+;
+; X86-CLZ-LABEL: cttz_i32_msize:
+; X86-CLZ: # %bb.0:
+; X86-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i32_msize:
+; X64-CLZ: # %bb.0:
+; X64-CLZ-NEXT: tzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
+ %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true)
+ ret i32 %tmp
+}
diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index f3d4b6221d085..9069688c8037c 100644
--- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -353,6 +353,7 @@ define i1 @asm_clobbering_flags(ptr %mem) nounwind {
; CHECK32-NEXT: testl %edx, %edx
; CHECK32-NEXT: setg %al
; CHECK32-NEXT: #APP
+; CHECK32-NOT: rep
; CHECK32-NEXT: bsfl %edx, %edx
; CHECK32-NEXT: #NO_APP
; CHECK32-NEXT: movl %edx, (%ecx)
@@ -364,6 +365,7 @@ define i1 @asm_clobbering_flags(ptr %mem) nounwind {
; CHECK64-NEXT: testl %ecx, %ecx
; CHECK64-NEXT: setg %al
; CHECK64-NEXT: #APP
+; CHECK64-NOT: rep
; CHECK64-NEXT: bsfl %ecx, %ecx
; CHECK64-NEXT: #NO_APP
; CHECK64-NEXT: movl %ecx, (%rdi)
diff --git a/llvm/test/CodeGen/X86/stack-folding-x86_64.ll b/llvm/test/CodeGen/X86/stack-folding-x86_64.ll
index e6fbec1ace3d0..c505f4ba8f77c 100644
--- a/llvm/test/CodeGen/X86/stack-folding-x86_64.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-x86_64.ll
@@ -37,7 +37,7 @@ define i32 @stack_fold_bsf_i32(i32 %a0) {
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bsfl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT: rep bsfl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
@@ -82,7 +82,7 @@ define i64 @stack_fold_bsf_i64(i64 %a0) {
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bsfq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; CHECK-NEXT: rep bsfq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
More information about the llvm-commits
mailing list