<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href=https://github.com/llvm/llvm-project/issues/59860>59860</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            [X86] clang emits slower shuffles with broadcasti128 intrinsic than the float equivalent

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          KyleSiefring

      </td>

    </tr>

</table>

<pre>

    When I use `_mm256_broadcastsi128_si256(_mm_loadu_si128(...))` with certain in-lane shuffles, clang emits cross-lane shuffles. When I use `_mm256_broadcast_pd(...)` or `_mm256_broadcast_ps(...)` instead, the emitted assembly is reasonable.

I have three examples of this. The first I will include here and the others can be found here: https://godbolt.org/z/8hYjrEhzM.

This code separates the top and bottom 8 bytes into different lanes, i.e., s0 and s1 are transformed into {s0.[0..8], s1.[0..8] | s0.[8..16], s1.[8..16]}.

```

// Reproducer, integer path: both operands are built with the integer
// 128-bit broadcast intrinsic and then shuffled in the double domain.
// The assembly listed after this function shows the result: two vpermpd
// plus a vblendps instead of a single vshufpd.
void zip_int(const uint8_t *src, uint8_t *dst) {

  // Unaligned 16-byte load from src, duplicated into both 128-bit halves,
  // then reinterpreted as four doubles (the cast only changes the type).
  __m256d s0 = _mm256_castsi256_pd(_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i *)src)));

  // Same for the 16 bytes at (__m128i *)src + 2, i.e. 32 bytes past src
  // (matches the [rdi + 32] operand in the listing).
  __m256d s1 = _mm256_castsi256_pd(_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i *)src + 2)));

  // Immediate 0xC picks { s0[0], s1[0], s0[3], s1[3] } as 64-bit elements.
  __m256i s = _mm256_castpd_si256(_mm256_shuffle_pd(s0, s1, 0xC));

  // Aligned 32-byte store (emitted as vmovaps in the listing).
  _mm256_store_si256((__m256i *)dst, s);

}

 vbroadcastf128  ymm0, xmmword ptr [rdi] # ymm0 = mem[0,1,0,1]

 vbroadcastf128  ymm1, xmmword ptr [rdi + 32] # ymm1 = mem[0,1,0,1]

 vpermpd ymm1, ymm1, 224                 # ymm1 = ymm1[0,0,2,3]

 vpermpd ymm0, ymm0, 244                 # ymm0 = ymm0[0,1,3,3]

 vblendps        ymm0, ymm0, ymm1, 204           # ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]

        vmovaps ymmword ptr [rsi], ymm0

        vzeroupper

        ret

```

```

// Reproducer, floating-point path: identical shuffle and store to zip_int,
// but the operands come from _mm256_broadcast_pd. The assembly listed after
// this function shows the expected single vshufpd.
void zip_float(const uint8_t *src, uint8_t *dst) {

  // Broadcast the 16 bytes at src into both 128-bit halves.
  __m256d s0 = _mm256_broadcast_pd((__m128d *)src);

  // Broadcast the 16 bytes at (__m128d *)src + 2, i.e. 32 bytes past src
  // (matches the [rdi + 32] operand in the listing).
  __m256d s1 = _mm256_broadcast_pd((__m128d *)src + 2);

  // Immediate 0xC picks { s0[0], s1[0], s0[3], s1[3] }, as the vshufpd
  // annotation in the listing shows.
  __m256i s = _mm256_castpd_si256(_mm256_shuffle_pd(s0, s1, 0xC));

  // Aligned 32-byte store (emitted as vmovapd in the listing).
 _mm256_store_si256((__m256i *)dst, s);

}

        vbroadcastf128  ymm0, xmmword ptr [rdi] # ymm0 = mem[0,1,0,1]

        vbroadcastf128  ymm1, xmmword ptr [rdi + 32] # ymm1 = mem[0,1,0,1]

        vshufpd ymm0, ymm0, ymm1, 12            # ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]

 vmovapd ymmword ptr [rsi], ymm0

        vzeroupper

        ret

```

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy0Vt9vozgQ_mucl9EiYwIhD3lo04u0Ot3TrXR3T5HBQ_HKYM5j0m3_-pMNSUO2--OkLaK4GcbffGP7G0YS6cceccfye5Y_rOToW-t2vz8b_FNj43T_uKqset791WIPH2EkBFbwY9eJvDhWzkpVS_KkU1EeSYu8YKI8dt3RWKnGY7QzUSZJwsQ23AWHJ-1bqNF5qXvQvUf3wcgegdqxaQwSE3uojewf4UkbA9hpT1A7S7T0S-D7pI6Deg1dcJC9etuNFm66J49SBRa-xRjeowJJhF1lnkETOJRke1kZTBh_YPxuen6EVp4QfOsQAb_IbjBIYBvwraYEPrUIjXbk4eOUme5rMyqEFh1GdiGe9S06glr2UCE0duxVdGDZHbTeD8SyOyYOTBweraqs8Yl1j0wcXpg4lO0_n91v7csfC1qfWk1QW4VAOEgnPcZA3g4xaGW9tx2UUD17pLAhFpRuGnTYewgrTgnoBBOgaQ0pBekQvJM9NdZ1qKZJbHNPPGH5PU-SkuUPYQUpvTIA2-xhcimTJC0WPhfL5uHMvuDzHX-erFbwooej7j0TZW178jDq3pdHD0zckasD2pVFkWdiG3hNCADHY9h7FTJh2QPMR2E6weG_eGL-7-EOxmOXilKHoExsI5PtfGdfB0_fOTgwcQ_i2xQ00C2DQV3HCNZZZRMr4tNGhSf_sv8ad57jrcML0EQtxpuoxf3YA13PZZuHGeR0yblJRQnw3HUx6peue7JOweAdsPzeKR2PksiiR0ykwy6cMib2geE05t_BTd_GjeuWiSv49CfgB3TdoC6w51GINdxeC9ToOMGGP8HEPnsTls-wcRTrb8LyMyy_YpvdwFYGezXQeeot_IU9X_9UgKjgOZNLBtESfNZM7POFT8HEfvNKZ75OnT3JgQL0YktIzyUi4i2nvKCz4zCgW9od-jerx_dLSmOs_MVF5eYzdNGpWhaJH5WHn4B5lfu7C_1X6Py8ge8k92_D_0rVn6OExXtDpedoqfixTq_lwRfyyRbvrlUc9aLeRy8rtcvUNtvKFe7SYpNxkYo0XbW7qi7WOW7SCpFzzrNmk6bblKeV3ObrXK1Xeie4yHjKi7TgRZ4m222zrjdlJSuO2YZv2ZpjJ7VJjDl1oW9ZaaIRd_m2LPjKyAoNxU5UiB6fIL5kImzNyu3CnA_V-EhszY0mT68oXnsTW9i_y9BCzN3j1DiSsU-hoTp3jVP7eTkd4TMaWz6nSdfgW9nH7ihWBMB_R31Cg71fjc7sbvov7duxSmrbMXEIXObhw-DsZ6w9E4eYATFxiBn-FwAA__-Bskdu">