[llvm] 515924f - [X86] bittest-big-integer.ll - add BLSR style pattern test (#168356)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 04:38:55 PST 2025
Author: Simon Pilgrim
Date: 2025-11-17T12:38:51Z
New Revision: 515924f765407565efb65a70709b3f7d169366d0
URL: https://github.com/llvm/llvm-project/commit/515924f765407565efb65a70709b3f7d169366d0
DIFF: https://github.com/llvm/llvm-project/commit/515924f765407565efb65a70709b3f7d169366d0.diff
LOG: [X86] bittest-big-integer.ll - add BLSR style pattern test (#168356)
Test using CTTZ to determine the lowest set bit, clear it, and return the
index.
Shows failure to use the RMW pattern on the load-btr-store due to additional
(but non-interfering) uses of the load.
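For reference, a rough C analogue of the pattern the new test exercises
(illustrative only; the function name, 64-bit limb layout, and use of
__builtin_ctzll are assumptions on my part, the actual test is written
directly as i512 LLVM IR, shown at the end of the diff below):

    #include <stdint.h>

    /* Find the lowest set bit of a 512-bit value stored as eight 64-bit
     * limbs, clear it in place (BLSR-style), and return its bit index;
     * return 512 if no bit is set. */
    unsigned blsr_u512(uint64_t word[8]) {
        for (unsigned i = 0; i < 8; ++i) {
            if (word[i]) {
                unsigned bit = (unsigned)__builtin_ctzll(word[i]);
                word[i] &= word[i] - 1;   /* clear lowest set bit */
                return i * 64 + bit;
            }
        }
        return 512;                       /* input was all zero */
    }

The interesting case is the store back to %word: only a single 32-bit
chunk actually changes, so ideally it would lower to a load-btr-store
RMW sequence, but the extra (non-interfering) uses of the load by the
CTTZ currently block that.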
Added:
Modified:
llvm/test/CodeGen/X86/bittest-big-integer.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 9b7569ff8b29f..b85a20b9d6b6e 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1488,3 +1488,618 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
store i128 %res2, ptr %word
ret i1 %cmp1
}
+
+define i32 @blsr_u512(ptr %word) nounwind {
+; X86-LABEL: blsr_u512:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $240, %esp
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl 12(%ebx), %esi
+; X86-NEXT: movl 28(%ebx), %eax
+; X86-NEXT: movl 60(%ebx), %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 44(%ebx), %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 20(%ebx), %edx
+; X86-NEXT: movl 52(%ebx), %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl 4(%ebx), %edi
+; X86-NEXT: movl 36(%ebx), %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 24(%ebx), %edx
+; X86-NEXT: movl 56(%ebx), %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl 8(%ebx), %ecx
+; X86-NEXT: movl 40(%ebx), %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl 16(%ebx), %edx
+; X86-NEXT: movl 48(%ebx), %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl (%ebx), %esi
+; X86-NEXT: movl 32(%ebx), %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB26_1
+; X86-NEXT: # %bb.2: # %cond.false
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB26_3
+; X86-NEXT: # %bb.4: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl $32, %eax
+; X86-NEXT: jmp .LBB26_5
+; X86-NEXT: .LBB26_1:
+; X86-NEXT: movl $512, %ecx # imm = 0x200
+; X86-NEXT: jmp .LBB26_41
+; X86-NEXT: .LBB26_3:
+; X86-NEXT: rep bsfl %ebx, %eax
+; X86-NEXT: .LBB26_5: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB26_6
+; X86-NEXT: # %bb.7: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: jmp .LBB26_8
+; X86-NEXT: .LBB26_6:
+; X86-NEXT: rep bsfl %ecx, %ecx
+; X86-NEXT: .LBB26_8: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB26_10
+; X86-NEXT: # %bb.9: # %cond.false
+; X86-NEXT: addl $64, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB26_10: # %cond.false
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB26_11
+; X86-NEXT: # %bb.12: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: je .LBB26_15
+; X86-NEXT: .LBB26_14:
+; X86-NEXT: rep bsfl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: je .LBB26_17
+; X86-NEXT: jmp .LBB26_18
+; X86-NEXT: .LBB26_11:
+; X86-NEXT: rep bsfl %esi, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB26_14
+; X86-NEXT: .LBB26_15: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB26_18
+; X86-NEXT: .LBB26_17: # %cond.false
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: .LBB26_18: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: jne .LBB26_20
+; X86-NEXT: # %bb.19: # %cond.false
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB26_20: # %cond.false
+; X86-NEXT: addl $256, %eax # imm = 0x100
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jne .LBB26_21
+; X86-NEXT: # %bb.22: # %cond.false
+; X86-NEXT: rep bsfl %edi, %ebx
+; X86-NEXT: addl $32, %ebx
+; X86-NEXT: jmp .LBB26_23
+; X86-NEXT: .LBB26_21:
+; X86-NEXT: rep bsfl %edx, %ebx
+; X86-NEXT: .LBB26_23: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB26_24
+; X86-NEXT: # %bb.25: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: je .LBB26_27
+; X86-NEXT: jmp .LBB26_28
+; X86-NEXT: .LBB26_24:
+; X86-NEXT: rep bsfl %ecx, %ecx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: jne .LBB26_28
+; X86-NEXT: .LBB26_27: # %cond.false
+; X86-NEXT: addl $64, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: .LBB26_28: # %cond.false
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB26_29
+; X86-NEXT: # %bb.30: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: je .LBB26_33
+; X86-NEXT: .LBB26_32:
+; X86-NEXT: rep bsfl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: je .LBB26_35
+; X86-NEXT: jmp .LBB26_36
+; X86-NEXT: .LBB26_29:
+; X86-NEXT: rep bsfl %esi, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB26_32
+; X86-NEXT: .LBB26_33: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB26_36
+; X86-NEXT: .LBB26_35: # %cond.false
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: .LBB26_36: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: jne .LBB26_38
+; X86-NEXT: # %bb.37: # %cond.false
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: .LBB26_38: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: jne .LBB26_40
+; X86-NEXT: # %bb.39: # %cond.false
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: .LBB26_40: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: .LBB26_41: # %cond.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl 56(%edx), %edi
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: andl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebx), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebx), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 208(%esp,%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl (%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 24(%ecx)
+; X86-NEXT: movl %esi, 20(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, 16(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, 8(%ecx)
+; X86-NEXT: movl %edi, 4(%ecx)
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 44(%ecx)
+; X86-NEXT: movl %edx, 48(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 52(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 56(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 60(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: blsr_u512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq 48(%rdi), %rdx
+; SSE-NEXT: movq 40(%rdi), %rsi
+; SSE-NEXT: movq 32(%rdi), %r11
+; SSE-NEXT: movq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %r9
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %r10
+; SSE-NEXT: rep bsfq %rax, %rbx
+; SSE-NEXT: rep bsfq %r10, %r14
+; SSE-NEXT: addq $64, %r14
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovneq %rbx, %r14
+; SSE-NEXT: rep bsfq %r9, %r15
+; SSE-NEXT: rep bsfq %r8, %rbx
+; SSE-NEXT: addq $64, %rbx
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovneq %r15, %rbx
+; SSE-NEXT: subq $-128, %rbx
+; SSE-NEXT: movq %rax, %r15
+; SSE-NEXT: movq %rax, %r12
+; SSE-NEXT: orq %r10, %r12
+; SSE-NEXT: cmovneq %r14, %rbx
+; SSE-NEXT: rep bsfq %r11, %r12
+; SSE-NEXT: rep bsfq %rsi, %r14
+; SSE-NEXT: addq $64, %r14
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovneq %r12, %r14
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: rep bsfq %rdx, %r12
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: addq $64, %rax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovneq %r12, %rax
+; SSE-NEXT: subq $-128, %rax
+; SSE-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: orq %rsi, %r11
+; SSE-NEXT: cmovneq %r14, %rax
+; SSE-NEXT: addq $256, %rax # imm = 0x100
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: orq %r8, %r10
+; SSE-NEXT: orq %r9, %r15
+; SSE-NEXT: orq %r10, %r15
+; SSE-NEXT: cmovneq %rbx, %rax
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: movl %eax, %edx
+; SSE-NEXT: andl $480, %edx # imm = 0x1E0
+; SSE-NEXT: shrl $3, %edx
+; SSE-NEXT: movl %edx, %esi
+; SSE-NEXT: andl $-8, %esi
+; SSE-NEXT: movq -128(%rsp,%rsi), %r8
+; SSE-NEXT: shrq %cl, %r8
+; SSE-NEXT: movl -120(%rsp,%rsi), %esi
+; SSE-NEXT: addl %esi, %esi
+; SSE-NEXT: notl %ecx
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: orl %r8d, %esi
+; SSE-NEXT: btrl %eax, %esi
+; SSE-NEXT: movl %esi, (%rdi,%rdx)
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: blsr_u512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq 56(%rdi), %rcx
+; AVX2-NEXT: movq 40(%rdi), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r11
+; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: movq 16(%rdi), %r8
+; AVX2-NEXT: movq (%rdi), %r9
+; AVX2-NEXT: movq 8(%rdi), %r10
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: tzcntq %r10, %rax
+; AVX2-NEXT: addq $64, %rax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovneq %rbx, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: tzcntq %r8, %r14
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rsi, %rbx
+; AVX2-NEXT: addq $64, %rbx
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovneq %r14, %rbx
+; AVX2-NEXT: subq $-128, %rbx
+; AVX2-NEXT: movq %r9, %r14
+; AVX2-NEXT: movq %r9, %r15
+; AVX2-NEXT: orq %r10, %r15
+; AVX2-NEXT: cmovneq %rax, %rbx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r11, %rax
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %rdx, %r12
+; AVX2-NEXT: addq $64, %r12
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovneq %rax, %r12
+; AVX2-NEXT: movq 48(%rdi), %r15
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %r15, %r13
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: addq $64, %rax
+; AVX2-NEXT: testq %r15, %r15
+; AVX2-NEXT: cmovneq %r13, %rax
+; AVX2-NEXT: subq $-128, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: orq %rdx, %r11
+; AVX2-NEXT: cmovneq %r12, %rax
+; AVX2-NEXT: addq $256, %rax # imm = 0x100
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: orq %rsi, %r10
+; AVX2-NEXT: orq %r8, %r14
+; AVX2-NEXT: orq %r10, %r14
+; AVX2-NEXT: cmovneq %rbx, %rax
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r15, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $32, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $480, %edx # imm = 0x1E0
+; AVX2-NEXT: shrl $3, %edx
+; AVX2-NEXT: movl %edx, %esi
+; AVX2-NEXT: andl $-8, %esi
+; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
+; AVX2-NEXT: notl %ecx
+; AVX2-NEXT: movl -120(%rsp,%rsi), %esi
+; AVX2-NEXT: addl %esi, %esi
+; AVX2-NEXT: shlxq %rcx, %rsi, %rcx
+; AVX2-NEXT: orl %r8d, %ecx
+; AVX2-NEXT: btrl %eax, %ecx
+; AVX2-NEXT: movl %ecx, (%rdi,%rdx)
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: blsr_u512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm2
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm3
+; AVX512-NEXT: vpandnq %zmm3, %zmm2, %zmm3
+; AVX512-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT: vpcompressq %zmm3, %zmm2 {%k1}
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: andl $32, %ecx
+; AVX512-NEXT: movl %ecx, %edx
+; AVX512-NEXT: notl %edx
+; AVX512-NEXT: movl %eax, %esi
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: movl %esi, %r8d
+; AVX512-NEXT: andl $56, %r8d
+; AVX512-NEXT: movl -120(%rsp,%r8), %r9d
+; AVX512-NEXT: addl %r9d, %r9d
+; AVX512-NEXT: shlxq %rdx, %r9, %rdx
+; AVX512-NEXT: shrl $3, %ecx
+; AVX512-NEXT: addq %rsp, %r8
+; AVX512-NEXT: addq $-128, %r8
+; AVX512-NEXT: orl (%rcx,%r8), %edx
+; AVX512-NEXT: btrl %eax, %edx
+; AVX512-NEXT: andl $60, %esi
+; AVX512-NEXT: movl %edx, (%rdi,%rsi)
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: popq %rcx
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i512, ptr %word
+ %tz = tail call range(i512 0, 513) i512 @llvm.cttz.i512(i512 %ld, i1 false)
+ %tz.cast = trunc nuw nsw i512 %tz to i32
+ %tz.mask = and i512 %tz, 511
+ %mask = shl nuw i512 1, %tz.mask
+ %mask.not = xor i512 %mask, -1
+ %blsr = and i512 %ld, %mask.not
+ store i512 %blsr, ptr %word
+ ret i32 %tz.cast
+}