<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - SimplifyCFG reorders CUDA activemask around branches"

   href="https://bugs.llvm.org/show_bug.cgi?id=47210">47210</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>SimplifyCFG reorders CUDA activemask around branches

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>rebs@cs.stanford.edu

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>htmldeveloper@gmail.com, llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>CUDA has an intrinsic called `__activemask()` that populates a 32-bit variable

with a bitmask indicating which threads are executing the current instruction.

This is sensitive to branching behavior; in CUDA’s SIMT model, when threads

diverge at a branch, one side of the branch will be executed (with other

threads being masked off), and then the other branch path will be executed. If

32 threads execute the following code, the first thread should print a bitmask

containing only that thread, and the others should print a bitmask containing

all other threads:

```

__device__ void activemask_test() {

    if (threadIdx.x == 0) {

        printf("first thread: %u\n", __activemask());

    } else {

        printf("others: %u\n", __activemask());

    }

}

```

Correct output (compiled using nvcc -O3):

```

first thread: 1

others: 14

others: 14

others: 14

```

Incorrect output (compiled using clang-10 -O1):

```

first thread: 15

others: 15

others: 15

others: 15

```

I am compiling using the following invocation:

```

clang-10 -S -emit-llvm --cuda-gpu-arch=sm_62 -O1 shfl.cu -mllvm

-opt-bisect-limit=18

```

Before running SimplifyCFGPass, the IR has an __activemask() call separately in

each branch:

```

; Function Attrs: convergent nounwind

define dso_local void @_Z15activemask_testv() #0 {

  %1 = alloca %printf_args

  %2 = alloca %printf_args.0

  %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10

  %4 = icmp eq i32 %3, 0

  br i1 %4, label %5, label %10

5:                                                ; preds = %0

  %6 = call i32 @_Z12__activemaskv() #5

  %7 = getelementptr inbounds %printf_args, %printf_args* %1, i32 0, i32 0

  store i32 %6, i32* %7, align 4

  %8 = bitcast %printf_args* %1 to i8*

  %9 = call i32 @vprintf(i8* getelementptr inbounds ([18 x i8], [18 x i8]*

@.str, i64 0, i64 0), i8* %8)

  br label %15

10:                                               ; preds = %0

  %11 = call i32 @_Z12__activemaskv() #5

  %12 = getelementptr inbounds %printf_args.0, %printf_args.0* %2, i32 0, i32 0

  store i32 %11, i32* %12, align 4

  %13 = bitcast %printf_args.0* %2 to i8*

  %14 = call i32 @vprintf(i8* getelementptr inbounds ([12 x i8], [12 x i8]*

@.str1, i64 0, i64 0), i8* %13)

  br label %15

15:                                               ; preds = %10, %5

  ret void

}

```

After SimplifyCFGPass, activemask has been hoisted to execute before any

branch:

```

; Function Attrs: convergent nounwind

define dso_local void @_Z15activemask_testv() #0 {

  %1 = alloca %printf_args

  %2 = alloca %printf_args.0

  %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #4, !range !10

  %4 = icmp eq i32 %3, 0

  %5 = call i32 @_Z12__activemaskv() #5

  br i1 %4, label %6, label %10

6:                                                ; preds = %0

  %7 = getelementptr inbounds %printf_args, %printf_args* %1, i32 0, i32 0

  store i32 %5, i32* %7, align 4

  %8 = bitcast %printf_args* %1 to i8*

  %9 = call i32 @vprintf(i8* getelementptr inbounds ([18 x i8], [18 x i8]*

@.str, i64 0, i64 0), i8* %8)

  br label %14

10:                                               ; preds = %0

  %11 = getelementptr inbounds %printf_args.0, %printf_args.0* %2, i32 0, i32 0

  store i32 %5, i32* %11, align 4

  %12 = bitcast %printf_args.0* %2 to i8*

  %13 = call i32 @vprintf(i8* getelementptr inbounds ([12 x i8], [12 x i8]*

@.str1, i64 0, i64 0), i8* %12)

  br label %14

14:                                               ; preds = %10, %6

  ret void

}

```

The same behavior happens when trying to use inline assembly instead of a

function call. There seems to be no way to indicate to the compiler that we do

not want the activemask instruction to be reordered around branches, and

specifying memory and control code clobbers does not prevent this behavior:

```

__device__ void activemask_test() {

    if (threadIdx.x == 0) {

        unsigned int mask;

        asm volatile("activemask.b32 %0;" : "=r"(mask)::"memory", "cc");

        printf("first thread: %u\n", mask);

    } else {

        unsigned int mask;

        asm volatile("activemask.b32 %0;" : "=r"(mask)::"memory", "cc");

        printf("others: %u\n", mask);

    }

}

```

This hoisting optimization is safe for all CPU instructions, but isn’t

necessarily safe for the SIMT model of execution, and it seems that there is no

way to denote instructions or function calls that should not be hoisted. Maybe

this behavior could be disabled when compiling for ptx, or a new attribute

could be added to mark branch-dependent code.

Our current workaround is to replace __activemask() with a macro that uses an

opaquely defined structure, whose name depends on the current source line

number, to ensure that the types of each inline assembly block are unique, and

thus not subject to merging.

```

#define STORE_ACTIVEMASK1(var, line)\

  do { \

    struct __active_mask_ ## line; \

    struct __active_mask_ ## line *__mask_guard_ ## line; \

    unsigned __mask_ ## line; \

    asm("activemask.b32 %0;" : "=r"(__mask_ ## line) : "r"(__mask_guard_ ##

line)); \

    var = __mask_ ## line; \

  } while (0)

#define STORE_ACTIVEMASK0(var, line) STORE_ACTIVEMASK1(var, line)

#define STORE_ACTIVEMASK(var) STORE_ACTIVEMASK0(var, __LINE__)

```

However, this is not optimal, particularly if used in a short function that

gets inlined into other branchy code.

This problem is related to <a class="bz_bug_link 

          bz_status_CONFIRMED "

   title="CONFIRMED - [CUDA][NVPTX] Incorrect compilation of __activemask()"

   href="show_bug.cgi?id=35249">bug #35249</a>. Apologies if this is considered a

duplicate, but we decided to file a new bug since that ticket was primarily

focused on a different (resolved) issue, and we are adding new information.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>