<html>
<head>
<base href="http://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - Better unsigned byte, short, int and long rotations"
href="http://llvm.org/bugs/show_bug.cgi?id=16726">16726</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Better unsigned byte, short, int and long rotations
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows XP
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Common Code Generator Code
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>bearophile@mailas.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvmbugs@cs.uiuc.edu
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<p>
<div>
<pre>This is a potential enhancement request, or two potential enhancement requests.
This is a D program that performs rotations (the rotation pattern used here it
the one recognized by the back-end of DMD, the reference D compiler):
// Left-shift x by n bits.
ubyte rol1(in ubyte x, in uint nBits) pure nothrow {
return cast(ubyte)((x << nBits) | (x >> ((ubyte.sizeof * 8) - nBits)));
}
// Right-shift x by n bits.
ubyte ror1(in ubyte x, in uint nBits) pure nothrow {
return cast(ubyte)((x >> nBits) | (x << ((ubyte.sizeof * 8) - nBits)));
}
ushort rol2(in ushort x, in uint nBits) pure nothrow {
return cast(ushort)((x << nBits) | (x >> ((ushort.sizeof * 8) - nBits)));
}
ushort ror2(in ushort x, in uint nBits) pure nothrow {
return cast(ushort)((x >> nBits) | (x << ((ushort.sizeof * 8) - nBits)));
}
uint rol3(in uint x, in uint nBits) pure nothrow {
return cast(uint)((x << nBits) | (x >> ((uint.sizeof * 8) - nBits)));
}
uint ror3(in uint x, in uint nBits) pure nothrow {
return cast(uint)((x >> nBits) | (x << ((uint.sizeof * 8) - nBits)));
}
ulong rol4(in ulong x, in uint nBits) pure nothrow {
return cast(ulong)((x << nBits) | (x >> ((ulong.sizeof * 8) - nBits)));
}
ulong ror4(in ulong x, in uint nBits) pure nothrow {
return cast(ulong)((x >> nBits) | (x << ((ulong.sizeof * 8) - nBits)));
}
void main() {}
That is similar to this C code:
#include "stdint.h"
// Left-shift x by n bits.
uint8_t rol1(const uint8_t x, const uint32_t nBits) {
return (uint8_t)((x << nBits) | (x >> ((sizeof(uint8_t) * 8) - nBits)));
}
// Right-shift x by n bits.
uint8_t ror1(const uint8_t x, const uint32_t nBits) {
return (uint8_t)((x >> nBits) | (x << ((sizeof(uint8_t) * 8) - nBits)));
}
uint16_t rol2(const uint16_t x, const uint32_t nBits) {
return (uint16_t)((x << nBits) | (x >> ((sizeof(uint16_t) * 8) - nBits)));
}
uint16_t ror2(const uint16_t x, const uint32_t nBits) {
return (uint16_t)((x >> nBits) | (x << ((sizeof(uint16_t) * 8) - nBits)));
}
uint32_t rol3(const uint32_t x, const uint32_t nBits) {
return (uint32_t)((x << nBits) | (x >> ((sizeof(uint32_t) * 8) - nBits)));
}
uint32_t ror3(const uint32_t x, const uint32_t nBits) {
return (uint32_t)((x >> nBits) | (x << ((sizeof(uint32_t) * 8) - nBits)));
}
uint64_t rol4(const uint64_t x, const uint32_t nBits) {
return (uint64_t)((x << nBits) | (x >> ((sizeof(uint64_t) * 8) - nBits)));
}
uint64_t ror4(const uint64_t x, const uint32_t nBits) {
return (uint64_t)((x >> nBits) | (x << ((sizeof(uint64_t) * 8) - nBits)));
}
int main() {
return 0;
}
The LDC2 compiler generates this asm for those functions (on a 32 bit system,
so ulong rotations need some code):
LDC D compiler V.0.11.0 based on DMD v2.062 and LLVM 3.3 almost svn.
ldmd2 -O -release -inline -noboundscheck -outoput-s
__D10rotations14rol1FNaNbxhxkZh:
pushl %esi
movzbl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shll %cl, %esi
movl $8, %ecx
subl %eax, %ecx
shrl %cl, %edx
orl %esi, %edx
movzbl %dl, %eax
popl %esi
ret $4
__D10rotations14ror1FNaNbxhxkZh:
pushl %esi
movzbl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shrl %cl, %esi
movl $8, %ecx
subl %eax, %ecx
shll %cl, %edx
orl %esi, %edx
movzbl %dl, %eax
popl %esi
ret $4
__D10rotations14rol2FNaNbxtxkZt:
pushl %esi
movzwl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shll %cl, %esi
movl $16, %ecx
subl %eax, %ecx
shrl %cl, %edx
orl %esi, %edx
movzwl %dx, %eax
popl %esi
ret $4
__D10rotations14ror2FNaNbxtxkZt:
pushl %esi
movzwl 8(%esp), %edx
movb %al, %cl
movl %edx, %esi
shrl %cl, %esi
movl $16, %ecx
subl %eax, %ecx
shll %cl, %edx
orl %esi, %edx
movzwl %dx, %eax
popl %esi
ret $4
__D10rotations14rol3FNaNbxkxkZk:
movl 4(%esp), %edx
movb %al, %cl
roll %cl, %edx
movl %edx, %eax
ret $4
__D10rotations14ror3FNaNbxkxkZk:
movl 4(%esp), %edx
movb %al, %cl
rorl %cl, %edx
movl %edx, %eax
ret $4
__D10rotations14rol4FNaNbxmxkZm:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
movl %eax, %ebx
movl $64, %ecx
subl %ebx, %ecx
movl 24(%esp), %edx
movl 20(%esp), %eax
movl %eax, %esi
shrdl %cl, %edx, %esi
movl %edx, %edi
shrl %cl, %edi
xorl %ebp, %ebp
testb $32, %cl
cmovnel %edi, %esi
cmovnel %ebp, %edi
movb %bl, %cl
shldl %cl, %eax, %edx
movb %bl, %cl
shll %cl, %eax
testb $32, %bl
cmovnel %eax, %edx
cmovnel %ebp, %eax
orl %esi, %eax
orl %edi, %edx
popl %esi
popl %edi
popl %ebx
popl %ebp
ret $8
__D10rotations14ror4FNaNbxmxkZm:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
movl %eax, %ecx
movl 24(%esp), %edx
movl 20(%esp), %eax
movl %eax, %esi
shrdl %cl, %edx, %esi
movl %edx, %edi
shrl %cl, %edi
xorl %ebp, %ebp
testb $32, %cl
cmovnel %edi, %esi
cmovnel %ebp, %edi
movl $64, %ebx
subl %ecx, %ebx
movb %bl, %cl
shldl %cl, %eax, %edx
movb %bl, %cl
shll %cl, %eax
testb $32, %bl
cmovnel %eax, %edx
cmovnel %ebp, %eax
orl %esi, %eax
orl %edi, %edx
popl %esi
popl %edi
popl %ebx
popl %ebp
ret $8
While a similar D program (that uses templates), is compiled by GDC (the D
compiler based on the GCC back-end) to this asm (but this is a 64 bit system,
so rorq and rolq are available):
_D4temp10__T3rolThZ3rolFNaNbNfxhxkZh:
movl %edi, %eax
movl %esi, %ecx
rolb %cl, %al
ret
_D4temp10__T3rorThZ3rorFNaNbNfxhxkZh:
movl %edi, %eax
movl %esi, %ecx
rorb %cl, %al
ret
_D4temp10__T3rolTtZ3rolFNaNbNfxtxkZt:
movl %edi, %eax
movl %esi, %ecx
rolw %cl, %ax
ret
_D4temp10__T3rorTtZ3rorFNaNbNfxtxkZt:
movl %edi, %eax
movl %esi, %ecx
rorw %cl, %ax
ret
_D4temp10__T3rolTkZ3rolFNaNbNfxkxkZk:
movl %edi, %eax
movl %esi, %ecx
roll %cl, %eax
ret
_D4temp10__T3rorTkZ3rorFNaNbNfxkxkZk:
movl %edi, %eax
movl %esi, %ecx
rorl %cl, %eax
ret
_D4temp10__T3rolTmZ3rolFNaNbNfxmxkZm:
movq %rdi, %rax
movl %esi, %ecx
rolq %cl, %rax
ret
_D4temp10__T3rorTmZ3rorFNaNbNfxmxkZm:
movq %rdi, %rax
movl %esi, %ecx
rorq %cl, %rax
ret
So perhaps maybe llvm could improve the compilation of that code for the ubyte,
ushort and ulong cases on a 32 bit system.
Also if you compare the asm for the 32 bit (uint) case:
__D10rotations14rol3FNaNbxkxkZk:
movl 4(%esp), %edx
movb %al, %cl
roll %cl, %edx
movl %edx, %eax
ret $4
_D4temp10__T3rolTkZ3rolFNaNbNfxkxkZk:
movl %edi, %eax
movl %esi, %ecx
roll %cl, %eax
ret
You see GCC uses the roll putting the result in the eax register, so there is
no need to perform the movl. I presume llvm could improve to do the same.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>