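<!doctype html>
<html lang="en">
    <head>
      <meta charset="utf-8">
      <title>[Bug 46212] [AMDGPU] integer division off-by-one for some inputs</title>
      <base href="https://bugs.llvm.org/">
    </head>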
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link bz_status_NEW"
                 title="NEW - [AMDGPU] integer division off-by-one for some inputs"
                 href="https://bugs.llvm.org/show_bug.cgi?id=46212">46212</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>[AMDGPU] integer division off-by-one for some inputs
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: AMDGPU
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>tilkax@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>For this IR:

define i32 @divide(i32, i32) {
    %3 = udiv i32 %0, %1
    ret i32 %3
}

LLVM generates this code (-mcpu=gfx1010):

        v_cvt_f32_u32_e32 v2, v1
        v_rcp_f32_e32 v2, v2
        s_mov_b32 s4, 0x4f800000
        v_mul_f32_e32 v2, s4, v2
        v_cvt_u32_f32_e32 v2, v2
        v_mul_hi_u32 v3, v2, v1
        v_mul_lo_u32 v4, v2, v1
        s_mov_b32 s4, 0
        v_sub_nc_u32_e32 v5, s4, v4
        v_cmp_eq_u32_e64 s4, s4, v3
        v_cndmask_b32_e64 v3, v4, v5, s4
        v_mul_hi_u32 v3, v3, v2
        v_add_nc_u32_e32 v4, v2, v3
        v_sub_nc_u32_e32 v2, v2, v3
        v_cndmask_b32_e64 v2, v2, v4, s4
        v_mul_hi_u32 v2, v2, v0
        v_mul_lo_u32 v3, v2, v1
        v_sub_nc_u32_e32 v4, v0, v3
        v_cmp_ge_u32_e64 s4, v4, v1
        v_cmp_ge_u32_e64 s5, v0, v3
        s_and_b32 s4, s4, s5
        s_mov_b32 s6, 1
        v_add_nc_u32_e32 v0, s6, v2
        s_mov_b32 s6, -1
        v_add_nc_u32_e32 v1, s6, v2
        v_cndmask_b32_e64 v0, v2, v0, s4
        v_cndmask_b32_e64 v0, v1, v0, s5

For similar HLSL source, the AMD DirectX driver generates better code but uses
the same algorithm:

  v_cvt_f32_u32  v3, v2                                 // 000000000018: 7E060D02
  v_rcp_f32     v3, v3                                  // 00000000001C: 7E065503
  v_mul_f32     v3, lit(0x4f800000), v3                 // 000000000020: 100606FF 4F800000
  v_cvt_u32_f32  v3, v3                                 // 000000000028: 7E060F03
  v_mad_u64_u32  v[4:5], vcc, v2, v3, 0                 // 00000000002C: D5766A04 02020702
  v_cmp_ne_i32  s[0:1], 0, v5                           // 000000000034: 7D0A0AF9 06868080
  v_sub_co_u32  v6, vcc, 0, v4                          // 00000000003C: D7106A06 00020880
  v_cndmask_b32  v4, v6, v4, s[0:1]                     // 000000000044: D5010004 00020906
  v_mul_hi_u32  v4, v4, v3                              // 00000000004C: D56A0004 00020704
  v_sub_co_u32  v5, vcc, v3, v4                         // 000000000054: D7106A05 00020903
  v_add_nc_u32  v4, v3, v4                              // 00000000005C: 4A080903
  v_cndmask_b32  v1, v4, v5, s[0:1]                     // 000000000060: D5010001 00020B04
  v_mul_hi_u32  v1, v1, v0                              // 000000000068: D56A0001 00020101
  v_mul_lo_u32  v3, v1, v2                              // 000000000070: D5690003 00020501
  v_sub_nc_u32  v5, v0, v3                              // 000000000078: 4C0A0700
  v_cmp_ge_u32  s[0:1], v0, v3                          // 00000000007C: 7D8C06F9 06068000
  v_cmp_ge_u32  s[2:3], v5, v2                          // 000000000084: 7D8C04F9 06068205
  s_and_b64     s[2:3], s[0:1], s[2:3]                  // 00000000008C: 87820200
  v_add_co_ci_u32  v0, vcc, v1, 0, s[2:3]               // 000000000090: D5286A00 00090101
  v_add_co_ci_u32  v0, vcc, v0, -1, s[0:1]              // 000000000098: D5286A00 00018300
  v_cmp_ne_i32  vcc, 0, v2                              // 0000000000A0: 7D0A0480
  v_cndmask_b32  v0, -1, v0, vcc                        // 0000000000A4: 020000C1

Using PIX for Windows, I verified that for 0xFFFFFFFF / 0x11111111 this code
returns 14 instead of the correct quotient, 15.</pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>