<html>
    <head>
      <base href="https://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - Zero XMM / YMM registers are treated separately"
   href="https://llvm.org/bugs/show_bug.cgi?id=26018">26018</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Zero XMM / YMM registers are treated separately
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: X86
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>llvm-dev@redking.me.uk
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>It should be possible to share 128-bit and 256-bit zero vector registers
instead of generating them separately, increasing instruction count and wasting
registers.
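For example (a hand-written sketch, not compiler output; register numbers are
arbitrary), a single zero idiom can serve both widths, because %xmm2 is the
low half of %ymm2:

  vxorps   %ymm2, %ymm2, %ymm2           ; materialise the zero once
  vblendps $0x80, %ymm1, %ymm2, %ymm1    ; 256-bit user reads %ymm2
  vandps   %xmm2, %xmm3, %xmm3           ; 128-bit user reads %xmm2, the low half of the same zero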

Zero ZMM registers probably have the same issue.

As a stretch goal it might be possible to recognise that a VEX-encoded 128-bit
instruction implicitly zeroes the upper bits (255:128) of the destination
register, and make use of that.
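In the test below, for instance, that would allow the second vxorps to be
dropped entirely (again only a sketch of the desired output, not current
codegen): the VEX-encoded 128-bit xor has already cleared bits 255:128 of
%ymm2, so both blends can read the same register:

  vxorps %xmm2, %xmm2, %xmm2                              ; zeroes all 256 bits of ymm2
  vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
  vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]    ; no second vxorps needed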

Example: llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll 

define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT:  # BB#0:
; CHECK-NEXT:    vmovaps (%eax), %ymm0
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 <-- XMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT:    vxorps %ymm2, %ymm2, %ymm2 <-- YMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm0, (%eax)
; CHECK-NEXT:    vmovaps %ymm1, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
entry:
  %0 = load <8 x i32>, <8 x i32> addrspace(1)* undef, align 32
  %1 = shufflevector <8 x i32> %0, <8 x i32> undef,
       <16 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef,
                   i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
                   i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = shufflevector <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0,
                                 i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
                                 i32 0, i32 0, i32 0, i32 undef>,
       <16 x i32> %1,
       <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i$
  store <16 x i32> %2, <16 x i32> addrspace(1)* undef, align 64
  ret void
}</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>