<html>
    <head>
      <base href="http://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - popcnt not generated"
   href="http://llvm.org/bugs/show_bug.cgi?id=17101">17101</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>popcnt not generated
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: X86
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>kkhoo@perfwizard.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvmbugs@cs.uiuc.edu
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>$ ./clang -v
clang version 3.4 (trunk 189776)
Target: x86_64-apple-darwin11.4.2
Thread model: posix

$ cat popcnt.c 
unsigned int foo(unsigned int x) {
    unsigned int countOfOnes = 0;
    unsigned int i;
    for (i=0; i&lt;32; i++) {
        if (((x &gt;&gt; i) &amp; 0x1) == 1) countOfOnes++;
    }
    return countOfOnes;
}

$ ./clang -S -O3 -march=core-avx2 popcnt.c -o /dev/stdout 
    .section    __TEXT,__text,regular,pure_instructions
    .section    __TEXT,__const
    .align    5
LCPI0_0:
    .long    1                       ## 0x1
    .long    2                       ## 0x2
    .long    4                       ## 0x4
    .long    8                       ## 0x8
    .long    16                      ## 0x10
    .long    32                      ## 0x20
    .long    64                      ## 0x40
    .long    128                     ## 0x80
LCPI0_2:
    .long    256                     ## 0x100
    .long    512                     ## 0x200
    .long    1024                    ## 0x400
    .long    2048                    ## 0x800
    .long    4096                    ## 0x1000
    .long    8192                    ## 0x2000
    .long    16384                   ## 0x4000
    .long    32768                   ## 0x8000
LCPI0_3:
    .long    16777216                ## 0x1000000
    .long    33554432                ## 0x2000000
    .long    67108864                ## 0x4000000
    .long    134217728               ## 0x8000000
    .long    268435456               ## 0x10000000
    .long    536870912               ## 0x20000000
    .long    1073741824              ## 0x40000000
    .long    2147483648              ## 0x80000000
LCPI0_4:
    .long    65536                   ## 0x10000
    .long    131072                  ## 0x20000
    .long    262144                  ## 0x40000
    .long    524288                  ## 0x80000
    .long    1048576                 ## 0x100000
    .long    2097152                 ## 0x200000
    .long    4194304                 ## 0x400000
    .long    8388608                 ## 0x800000
    .section    __TEXT,__literal4,4byte_literals
    .align    2
LCPI0_1:
    .long    1                       ## 0x1
    .section    __TEXT,__text,regular,pure_instructions
    .globl    _foo
    .align    4, 0x90
_foo:                                   ## @foo
    .cfi_startproc
## BB#0:                                ## %for.end
    pushq    %rbp
Ltmp2:
    .cfi_def_cfa_offset 16
Ltmp3:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp4:
    .cfi_def_cfa_register %rbp
    vmovd    %edi, %xmm0
    vbroadcastss    %xmm0, %ymm0
    vandps    LCPI0_0(%rip), %ymm0, %ymm1
    vpxor    %ymm2, %ymm2, %ymm2
    vpcmpeqd    %ymm2, %ymm1, %ymm1
    vpbroadcastd    LCPI0_1(%rip), %ymm3
    vpandn    %ymm3, %ymm1, %ymm1
    vandps    LCPI0_2(%rip), %ymm0, %ymm4
    vpcmpeqd    %ymm2, %ymm4, %ymm4
    vpandn    %ymm3, %ymm4, %ymm4
    vpaddd    %ymm1, %ymm4, %ymm1
    vandps    LCPI0_3(%rip), %ymm0, %ymm4
    vpcmpeqd    %ymm2, %ymm4, %ymm4
    vpandn    %ymm3, %ymm4, %ymm4
    vandps    LCPI0_4(%rip), %ymm0, %ymm0
    vpcmpeqd    %ymm2, %ymm0, %ymm0
    vpandn    %ymm3, %ymm0, %ymm0
    vpaddd    %ymm1, %ymm0, %ymm0
    vpaddd    %ymm0, %ymm4, %ymm0
    vextracti128    $1, %ymm0, %xmm1
    vpaddd    %ymm1, %ymm0, %ymm0
    vpalignr    $8, %ymm0, %ymm0, %ymm1 ## ymm1 = ymm0[8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31,16,17,18,19,20,21,22,23]
    vpaddd    %ymm1, %ymm0, %ymm0
    vphaddd    %ymm0, %ymm0, %ymm0
    vmovd    %xmm0, %eax
    popq    %rbp
    vzeroupper
    ret

It looks like clang/llvm did something heroic here by recognizing that this is a
popcount function and using AVX2 to avoid the loop, but I was expecting this
loop to generate a simple popcnt instruction. I think popcnt has been available
since Nehalem for Intel and Family 10h for AMD.

Better codegen would be something like this:

popcnt %edi, %eax</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>