<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/121548>121548</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [x86] Un-optimal use of upper registers while generating vpmaddwd for reductions
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          venkataramananhashkumar
      </td>
    </tr>
</table>

<pre>
    For the given code below

#include &lt;stdint.h&gt;
/*
 * Reproducer from the report: a multiply-accumulate reduction over a
 * signed 16-bit array and an unsigned 8-bit array.
 *
 * arr16, arr8: input arrays, each read at indices 0 .. N-1.
 * N:           number of element pairs to process; the loop body does not
 *              run when N is zero or negative.
 *
 * Returns 1 plus the sum of arr16[k] * arr8[k] over all processed k.
 */
int32_t foo(const int16_t *arr16, const uint8_t *arr8, int N) {
    int32_t sum = 1; /* accumulator is seeded with 1, not 0 */
    for ( int k = 0 ; k < N ; k++ ) {
        /* both operands undergo integer promotion before the multiply */
        sum += arr16[k] * arr8[k];
    }
    return sum;
}

LLVM with -O3 -march=znver5 -S -fno-unroll-loops generates the code below:

.LBB0_8:
        vpmovzxbw       (%rsi,%rax), %ymm1
 vpmaddwd        (%rdi,%rax,2), %ymm1, %ymm1  
        addq    $16, %rax
 vpaddd  %zmm0, %zmm1, %zmm0  <== upper half of  zmm1 and zmm0 are zero.
 cmpq    %rax, %rdx
        jne     .LBB0_8

        vextracti64x4 $1, %zmm0, %ymm1 <== Un-optimal extract 
        vpaddd  %zmm1, %zmm0, %zmm0 
        vextracti128    $1, %ymm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpshufd $238, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpshufd $85, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax

"vpmaddwd" in the above code only writes to the lower half of zmm1 (i.e. ymm1), so the upper half is zero, but we are still using a full zmm add for the sum. We also have a suboptimal extract and add in the reduction code.


 
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJysVU2T4jgM_TXmoiLl2AmEQw4EltPs7GFr99rlxIZ42rFZfwSaX79lJzQw08fp6qooSHp6epYV5pw8aSFqVDao3C9Y8L2x9Sj0O_PMsoFppnvm-vcwMLtoDf-oD8aC7wWc5Cg0dIYLaIUyF4S38Z9QqTsVuABEd85zqX3WI_oHwlupPSVvHo7GIFJ1RjsPUvt89eYBkS2zNl8hsoPJE6T21aenig6pPXxHZANo3SC8BQC4Y7owAKJ7yBG9u47GAiJVynpPTgyINsnewffJRqRBpIFX0PiXAEkT0yZiZfOOyn2kA4nP9P4oh9b72bLCB6sjwuSdPAhvv33790-4SN_D8i8Ky4HZrkd0f9OjsCUs_4blUZtl0NYotVTGnB2chBaWecGT5knopHk2IWbfmga_VYhun6iP58GMt2t7md8RqRAprZOI7KLBrohsop6IlB_DkMfU8Twwzi8cXnP4c86O_JT3MAGe6jPO_5swiulAZ4BUhnHOo6u8DQOenbcHWPwV4gEhuo_ah_NZWOiZOoI5AsRIYJpDimNWwE1YE8WAbjjPVWe2yeLXJ2I_tEjPu2r4RTVx9ZZ1Xq6Ka5G4P1F6bvVB7h-9NGcvB6ZgToaXY3ju9Qu01MNXDHJS3fV7FL5nXecT-7XK9VHl-hKPX-JdH448ghNafRn-m-Cr8vejD2bkAD8FiTRbafmQ-xwjQkDqdGlYa0YxLSqj1QdcrPTCgTegzOVpuNJsIVJJkcE03psMAJrg4SLSrAUn9QmOQakYHOd82jKxigtDBrBVzsTono0Cwq_jEUc3ps3UrOCh89JMezSDuQ28hQWvKd_QDVuIOl_TsqqqFckXfU02a9YKzlekoHle0DVbFSvadu2qoKwg5ULWBJMS55jidb7GeXZkG34kDFcbviYcC1RgMTCpMqXGITP2tJDOBVHnJC-LaqFYK5S7fxFsHaOWbTg5VGAlnXePPC-9St-Oa7WKu_HpPgQnoqbT9bXiJJ0X1sGll0rcl1oU83PvRB0_5XCLYFXde392cbWRAyKHk_R9aLPODIgcIoX5sTxb80N0HpFD6sMhcphbGWvyfwAAAP__6jL9Rw">