[llvm-bugs] [Bug 36361] New: Ignored memory fence after 64bit __shfl_*_sync intrinsics
via llvm-bugs
llvm-bugs at lists.llvm.org
Mon Feb 12 14:25:17 PST 2018
https://bugs.llvm.org/show_bug.cgi?id=36361
Bug ID: 36361
Summary: Ignored memory fence after 64bit __shfl_*_sync
intrinsics
Product: clang
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: CUDA
Assignee: unassignedclangbugs at nondot.org
Reporter: j.l.k at gmx.com
CC: llvm-bugs at lists.llvm.org
Clang ignores the implicit memory fence after 64bit __shfl_*_sync intrinsics
which leads to incorrect behaviour.
Testing platform: clang version 7.0.0 (trunk 324341), CUDA 9.1,
--cuda-gpu-arch=sm_60
Repro code:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#include <stdio.h>
template <typename T>
__device__
T warpReduceSum(T val)
{
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(0xFFFFFFFF, val, offset);
return val;
}
template <typename T>
__global__ void kernel()
{
T i = 1;
T sum = warpReduceSum( i );
if( threadIdx.x == 0 )
printf( "sum = %d \n", (int) sum );
}
int main()
{
printf("float: ");
kernel< float ><<< 1, 32 >>>();
cudaDeviceSynchronize();
printf("double: ");
kernel< double ><<< 1, 32 >>>();
cudaDeviceSynchronize();
printf("long: ");
kernel< long ><<< 1, 32 >>>();
cudaDeviceSynchronize();
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Typical output when compiled with clang:
float: sum = 32
double: sum = -2147483648
long: sum = -746962023
Expected output (accomplished with nvcc):
float: sum = 32
double: sum = 32
long: sum = 32
PTX for the float kernel, which seems correct:
Function : _Z6kernelIfEvv
.headerflags @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001c5000fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20];
/* 0x4c98078000870001 */
/*0010*/ { MOV32I R7, 0x3f800000;
/* 0x0103f8000007f007 */
/*0018*/ S2R R6, SR_TID.X; }
/* 0xf0c8000002170006 */
/* 0x009fd0002be007f0 */
/*0028*/ { IADD32I R1, R1, -0x8;
/* 0x1c0fffffff870101 */
/*0030*/ SHFL.DOWN PT, R0, R7, 0x10, 0x1f; }
/* 0xef17007cb1070700 */
/*0038*/ FADD R0, R0, 1;
/* 0x3858003f80070000 */
/* 0x001cc402fe80073f */
/*0048*/ SHFL.DOWN PT, R2, R0, 0x8, 0x1f;
/* 0xef17007cb0870002 */
/*0050*/ FADD R2, R0, R2;
/* 0x5c58000000270002 */
/*0058*/ SHFL.DOWN PT, R3, R2, 0x4, 0x1f;
/* 0xef17007cb0470203 */
/* 0x001fd000ffa00fed */
/*0068*/ ISETP.NE.AND P0, PT, R6, RZ, PT;
/* 0x5b6b03800ff70607 */
/*0070*/ DEPBAR {1};
/* 0xf0f0000000070002 */
/*0078*/ FADD R3, R2, R3;
/* 0x5c58000000370203 */
/* 0x001ff400fda00711 */
/*0088*/ SHFL.DOWN PT, R4, R3, 0x2, 0x1f;
/* 0xef17007cb0270304 */
/*0090*/ IADD R6.CC, R1, c[0x0][0x4];
/* 0x4c10800000170106 */
/*0098*/ IADD.X R7, RZ, c[0x0][0x104];
/* 0x4c1008000417ff07 */
/* 0x001ff40002200ff4 */
/*00a8*/ FADD R4, R3, R4;
/* 0x5c58000000470304 */
/*00b0*/ SHFL.DOWN PT, R5, R4, 0x1, 0x1f;
/* 0xef17007cb0170405 */
/*00b8*/ @P0 EXIT;
/* 0xe30000000000000f */
/* 0x001fc400e2200ff2 */
/*00c8*/ FADD R0, R4, R5;
/* 0x5c58000000570400 */
/*00d0*/ F2I.S32.F32.TRUNC R0, R0;
/* 0x5cb0018000071a00 */
/*00d8*/ MOV32I R4, 0x0;
/* 0x010000000007f004 */
/* 0x003ff4011e4007fd */
/*00e8*/ MOV32I R5, 0x0;
/* 0x010000000007f005 */
/*00f0*/ STL [R1], R0;
/* 0xef54000000070100 */
/*00f8*/ JCAL 0x0;
/* 0xe220000000000040 */
/* 0x001f8400fde007ef */
/*0108*/ NOP;
/* 0x50b0000000070f00 */
/*0110*/ NOP;
/* 0x50b0000000070f00 */
/*0118*/ NOP;
/* 0x50b0000000070f00 */
/* 0x001f8000ffe007ff */
/*0128*/ EXIT;
/* 0xe30000000007000f */
/*0130*/ BRA 0x130;
/* 0xe2400fffff87000f */
/*0138*/ NOP;
/* 0x50b0000000070f00 */
PTX for the double kernel, which is definitely wrong:
Function : _Z6kernelIdEvv
.headerflags @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20];
/* 0x4c98078000870001 */
/*0010*/ { IADD32I R1, R1, -0x8;
/* 0x1c0fffffff870101 */
/*0018*/ S2R R6, SR_TID.X; }
/* 0xf0c8000002170006 */
/* 0x0004f40026200131 */
/*0028*/ SHFL.DOWN PT, R2, R0, 0x10, 0x1f;
/* 0xef17007cb1070002 */
/*0030*/ SHFL.DOWN PT, R3, R0, 0x10, 0x1f;
/* 0xef17007cb1070003 */
/*0038*/ SHFL.DOWN PT, R4, R0, 0x8, 0x1f;
/* 0xef17007cb0870004 */
/* 0x001fc00026200ff0 */
/*0048*/ { ISETP.NE.AND P0, PT, R6, RZ, PT;
/* 0x5b6b03800ff70607 */
/*0050*/ SHFL.DOWN PT, R5, R0, 0x8, 0x1f; }
/* 0xef17007cb0870005 */
/*0058*/ { IADD R6.CC, R1, c[0x0][0x4];
/* 0x4c10800000170106 */
/*0068*/ SHFL.DOWN PT, R8, R0, 0x4, 0x1f; }
/* 0x0004c400fe000136 */
/* 0xef17007cb0470008 */
/*0070*/ { IADD.X R7, RZ, c[0x0][0x104];
/* 0x4c1008000417ff07 */
/*0078*/ SHFL.DOWN PT, R9, R0, 0x4, 0x1f; }
/* 0xef17007cb0470009 */
/* 0x0004c40026200131 */
/*0088*/ SHFL.DOWN PT, R10, R0, 0x2, 0x1f;
/* 0xef17007cb027000a */
/*0090*/ SHFL.DOWN PT, R11, R0, 0x2, 0x1f;
/* 0xef17007cb027000b */
/*0098*/ SHFL.DOWN PT, R12, R0, 0x1, 0x1f;
/* 0xef17007cb017000c */
/* 0x001fbc00fde00132 */
/*00a8*/ SHFL.DOWN PT, R13, R0, 0x1, 0x1f;
/* 0xef17007cb017000d */
/*00b0*/ NOP;
/* 0x50b0000000070f00 */
/*00b8*/ NOP;
/* 0x50b0000000070f00 */
/* 0x0020c802e3c007fd */
/*00c8*/ @P0 EXIT;
/* 0xe30000000000000f */
/*00d0*/ DADD R2, R2, 1;
/* 0x3870003ff0070202 */
/*00d8*/ DADD R2, R2, R4;
/* 0x5c70000000470202 */
/* 0x0040c800fec00ff1 */
/*00e8*/ MOV R4, R8;
/* 0x5c98078000870004 */
/*00f0*/ MOV R5, R9;
/* 0x5c98078000970005 */
/*00f8*/ DADD R2, R2, R4;
/* 0x5c70000000470202 */
/* 0x0040c800fcc00ff1 */
/*0108*/ MOV R4, R10;
/* 0x5c98078000a70004 */
/*0110*/ MOV R5, R11;
/* 0x5c98078000b70005 */
/*0118*/ DADD R2, R2, R4;
/* 0x5c70000000470202 */
/* 0x00407800fec00ff1 */
/*0128*/ MOV R4, R12;
/* 0x5c98078000c70004 */
/*0130*/ MOV R5, R13;
/* 0x5c98078000d70005 */
/*0138*/ DADD R2, R2, R4;
/* 0x5c70000000470202 */
/* 0x0023c400fe000f14 */
/*0148*/ F2I.S32.F64.TRUNC R2, R2;
/* 0x5cb0018000271e02 */
/*0150*/ { MOV32I R4, 0x0;
/* 0x010000000007f004 */
/*0158*/ STL [R1], R2; }
/* 0xef54000000070102 */
/* 0x001ffc01ffa007e6 */
/*0168*/ MOV32I R5, 0x0;
/* 0x010000000007f005 */
/*0170*/ JCAL 0x0;
/* 0xe220000000000040 */
/*0178*/ EXIT;
/* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0188*/ BRA 0x180;
/* 0xe2400fffff07000f */
/*0190*/ NOP;
/* 0x50b0000000070f00 */
/*0198*/ NOP;
/* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*01a8*/ NOP;
/* 0x50b0000000070f00 */
/*01b0*/ NOP;
/* 0x50b0000000070f00 */
/*01b8*/ NOP;
/* 0x50b0000000070f00 */
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180212/cd77f17a/attachment-0001.html>
More information about the llvm-bugs
mailing list