[LLVMbugs] [Bug 16726] New: Better unsigned byte, short, int and long rotations
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Sun Jul 28 10:14:40 PDT 2013
http://llvm.org/bugs/show_bug.cgi?id=16726
Bug ID: 16726
Summary: Better unsigned byte, short, int and long rotations
Product: libraries
Version: trunk
Hardware: PC
OS: Windows XP
Status: NEW
Severity: enhancement
Priority: P
Component: Common Code Generator Code
Assignee: unassignedbugs at nondot.org
Reporter: bearophile at mailas.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
This is a potential enhancement request, or two potential enhancement requests.
This is a D program that performs rotations (the rotation pattern used here it
the one recognized by the back-end of DMD, the reference D compiler):
// Left-shift x by n bits.
ubyte rol1(in ubyte x, in uint nBits) pure nothrow {
return cast(ubyte)((x << nBits) | (x >> ((ubyte.sizeof * 8) - nBits)));
}
// Right-shift x by n bits.
ubyte ror1(in ubyte x, in uint nBits) pure nothrow {
return cast(ubyte)((x >> nBits) | (x << ((ubyte.sizeof * 8) - nBits)));
}
ushort rol2(in ushort x, in uint nBits) pure nothrow {
return cast(ushort)((x << nBits) | (x >> ((ushort.sizeof * 8) - nBits)));
}
ushort ror2(in ushort x, in uint nBits) pure nothrow {
return cast(ushort)((x >> nBits) | (x << ((ushort.sizeof * 8) - nBits)));
}
uint rol3(in uint x, in uint nBits) pure nothrow {
return cast(uint)((x << nBits) | (x >> ((uint.sizeof * 8) - nBits)));
}
uint ror3(in uint x, in uint nBits) pure nothrow {
return cast(uint)((x >> nBits) | (x << ((uint.sizeof * 8) - nBits)));
}
ulong rol4(in ulong x, in uint nBits) pure nothrow {
return cast(ulong)((x << nBits) | (x >> ((ulong.sizeof * 8) - nBits)));
}
ulong ror4(in ulong x, in uint nBits) pure nothrow {
return cast(ulong)((x >> nBits) | (x << ((ulong.sizeof * 8) - nBits)));
}
void main() {}
That is similar to this C code:
#include "stdint.h"
// Left-shift x by n bits.
uint8_t rol1(const uint8_t x, const uint32_t nBits) {
return (uint8_t)((x << nBits) | (x >> ((sizeof(uint8_t) * 8) - nBits)));
}
// Right-shift x by n bits.
uint8_t ror1(const uint8_t x, const uint32_t nBits) {
return (uint8_t)((x >> nBits) | (x << ((sizeof(uint8_t) * 8) - nBits)));
}
uint16_t rol2(const uint16_t x, const uint32_t nBits) {
return (uint16_t)((x << nBits) | (x >> ((sizeof(uint16_t) * 8) - nBits)));
}
uint16_t ror2(const uint16_t x, const uint32_t nBits) {
return (uint16_t)((x >> nBits) | (x << ((sizeof(uint16_t) * 8) - nBits)));
}
uint32_t rol3(const uint32_t x, const uint32_t nBits) {
return (uint32_t)((x << nBits) | (x >> ((sizeof(uint32_t) * 8) - nBits)));
}
uint32_t ror3(const uint32_t x, const uint32_t nBits) {
return (uint32_t)((x >> nBits) | (x << ((sizeof(uint32_t) * 8) - nBits)));
}
uint64_t rol4(const uint64_t x, const uint32_t nBits) {
return (uint64_t)((x << nBits) | (x >> ((sizeof(uint64_t) * 8) - nBits)));
}
uint64_t ror4(const uint64_t x, const uint32_t nBits) {
return (uint64_t)((x >> nBits) | (x << ((sizeof(uint64_t) * 8) - nBits)));
}
int main() {
return 0;
}
The LDC2 compiler generates this asm for those functions (on a 32 bit system,
so ulong rotations need some code):
LDC D compiler V.0.11.0 based on DMD v2.062 and LLVM 3.3 almost svn.
ldmd2 -O -release -inline -noboundscheck -outoput-s
__D10rotations14rol1FNaNbxhxkZh:
pushl %esi
movzbl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shll %cl, %esi
movl $8, %ecx
subl %eax, %ecx
shrl %cl, %edx
orl %esi, %edx
movzbl %dl, %eax
popl %esi
ret $4
__D10rotations14ror1FNaNbxhxkZh:
pushl %esi
movzbl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shrl %cl, %esi
movl $8, %ecx
subl %eax, %ecx
shll %cl, %edx
orl %esi, %edx
movzbl %dl, %eax
popl %esi
ret $4
__D10rotations14rol2FNaNbxtxkZt:
pushl %esi
movzwl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shll %cl, %esi
movl $16, %ecx
subl %eax, %ecx
shrl %cl, %edx
orl %esi, %edx
movzwl %dx, %eax
popl %esi
ret $4
__D10rotations14ror2FNaNbxtxkZt:
pushl %esi
movzwl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shrl %cl, %esi
movl $16, %ecx
subl %eax, %ecx
shll %cl, %edx
orl %esi, %edx
movzwl %dx, %eax
popl %esi
ret $4
__D10rotations14rol3FNaNbxkxkZk:
movl 4(%esp), %edx
movb %al, %cl
roll %cl, %edx
movl %edx, %eax
ret $4
__D10rotations14ror3FNaNbxkxkZk:
movl 4(%esp), %edx
movb %al, %cl
rorl %cl, %edx
movl %edx, %eax
ret $4
__D10rotations14rol4FNaNbxmxkZm:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
movl %eax, %ebx
movl $64, %ecx
subl %ebx, %ecx
movl 24(%esp), %edx
movl 20(%esp), %eax
movl %eax, %esi
shrdl %cl, %edx, %esi
movl %edx, %edi
shrl %cl, %edi
xorl %ebp, %ebp
testb $32, %cl
cmovnel %edi, %esi
cmovnel %ebp, %edi
movb %bl, %cl
shldl %cl, %eax, %edx
movb %bl, %cl
shll %cl, %eax
testb $32, %bl
cmovnel %eax, %edx
cmovnel %ebp, %eax
orl %esi, %eax
orl %edi, %edx
popl %esi
popl %edi
popl %ebx
popl %ebp
ret $8
__D10rotations14ror4FNaNbxmxkZm:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
movl %eax, %ecx
movl 24(%esp), %edx
movl 20(%esp), %eax
movl %eax, %esi
shrdl %cl, %edx, %esi
movl %edx, %edi
shrl %cl, %edi
xorl %ebp, %ebp
testb $32, %cl
cmovnel %edi, %esi
cmovnel %ebp, %edi
movl $64, %ebx
subl %ecx, %ebx
movb %bl, %cl
shldl %cl, %eax, %edx
movb %bl, %cl
shll %cl, %eax
testb $32, %bl
cmovnel %eax, %edx
cmovnel %ebp, %eax
orl %esi, %eax
orl %edi, %edx
popl %esi
popl %edi
popl %ebx
popl %ebp
ret $8
While a similar D program (that uses templates), is compiled by GDC (the D
compiler based on the GCC back-end) to this asm (but this is a 64 bit system,
so rorq and rolq are available):
_D4temp10__T3rolThZ3rolFNaNbNfxhxkZh:
movl %edi, %eax
movl %esi, %ecx
rolb %cl, %al
ret
_D4temp10__T3rorThZ3rorFNaNbNfxhxkZh:
movl %edi, %eax
movl %esi, %ecx
rorb %cl, %al
ret
_D4temp10__T3rolTtZ3rolFNaNbNfxtxkZt:
movl %edi, %eax
movl %esi, %ecx
rolw %cl, %ax
ret
_D4temp10__T3rorTtZ3rorFNaNbNfxtxkZt:
movl %edi, %eax
movl %esi, %ecx
rorw %cl, %ax
ret
_D4temp10__T3rolTkZ3rolFNaNbNfxkxkZk:
movl %edi, %eax
movl %esi, %ecx
roll %cl, %eax
ret
_D4temp10__T3rorTkZ3rorFNaNbNfxkxkZk:
movl %edi, %eax
movl %esi, %ecx
rorl %cl, %eax
ret
_D4temp10__T3rolTmZ3rolFNaNbNfxmxkZm:
movq %rdi, %rax
movl %esi, %ecx
rolq %cl, %rax
ret
_D4temp10__T3rorTmZ3rorFNaNbNfxmxkZm:
movq %rdi, %rax
movl %esi, %ecx
rorq %cl, %rax
ret
So perhaps maybe llvm could improve the compilation of that code for the ubyte,
ushort and ulong cases on a 32 bit system.
Also if you compare the asm for the 32 bit (uint) case:
__D10rotations14rol3FNaNbxkxkZk:
movl 4(%esp), %edx
movb %al, %cl
roll %cl, %edx
movl %edx, %eax
ret $4
_D4temp10__T3rolTkZ3rolFNaNbNfxkxkZk:
movl %edi, %eax
movl %esi, %ecx
roll %cl, %eax
ret
You see GCC uses the roll putting the result in the eax register, so there is
no need to perform the movl. I presume llvm could improve to do the same.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20130728/ec5eb896/attachment.html>
More information about the llvm-bugs
mailing list