<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/128853">128853</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
NVPTX produces incorrect PTX with 8-bit integer vector input which results in wrong answer
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
rashedmyt
</td>
</tr>
</table>
<pre>
This issue is created as suggested in https://github.com/llvm/llvm-project/issues/107219.
https://cuda.godbolt.org/z/1ebcMfv87 highlights the differences in the PTX generated for the reproducer IR given below b/w LLVM 16 and current tip of main branch.
<details>
<summary>Original LLVM IR</summary>
```llvm
; LLVM version:16.0.6
; ModuleID = 'module'
source_filename = "module"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
define void @KERNEL(ptr addrspace(1) align 8 %out0, <2 x i8> %in0, i32 %arraySize) addrspace(1) {
check:
%tid_x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%ntid_x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%ctaid_x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%ctaid_y = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
%nctaid_x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
%0 = mul i32 %nctaid_x, %ctaid_y
%1 = add i32 %0, %ctaid_x
%2 = mul i32 %1, %ntid_x
%linearTid = add i32 %2, %tid_x
%3 = icmp ult i32 %linearTid, %arraySize
%4 = extractelement <2 x i8> %in0, i64 0
%5 = sitofp i8 %4 to double
%6 = extractelement <2 x i8> %in0, i64 1
%7 = sitofp i8 %6 to double
%8 = insertelement <2 x double> undef, double %5, i64 0
%9 = insertelement <2 x double> %8, double %7, i64 1
br i1 %3, label %StraightLineCodeBlock, label %exit
exit: ; preds = %StraightLineCodeBlock, %check
ret void
StraightLineCodeBlock: ; preds = %check
store <2 x double> %9, ptr addrspace(1) %out0, align 16
br label %exit
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
!nvvm.annotations = !{!0}
!0 = !{ptr addrspace(1) @KERNEL, !"kernel", i32 1}
```
</details>
<details>
<summary>C++ Driver Code</summary>
```cpp
// Type your code here, or load an example.
#include <array>
#include <complex>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
inline void checkCudaErrors(CUresult err) {
if (err != CUDA_SUCCESS)
std::cout << std::string("CUDA Error: ") + std::to_string(err) << std::endl;
}
#define ALIGN_UP(offset, alignment) \
(offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
#define ADD_TO_PARAM_BUFFER(value, alignment) \
do { \
ALIGN_UP(paramBufferSize, alignment); \
memcpy(paramBuffer + paramBufferSize, &(value), sizeof(value)); \
paramBufferSize += sizeof(value); \
} while (0)
template <typename T, typename std::enable_if<std::is_pointer<T>::value,
T>::type * = nullptr>
void ProcessInput(CUdeviceptr &devPtr, T input, int const numel,
char *paramBuffer, size_t &paramBufferSize) {
checkCudaErrors(cuMemAlloc(&devPtr, sizeof(*input) * numel));
checkCudaErrors(cuMemcpyHtoD(devPtr, input, sizeof(*input) * numel));
ADD_TO_PARAM_BUFFER(devPtr, __alignof(devPtr));
}
template <typename T, typename std::enable_if<!std::is_pointer<T>::value,
T>::type * = nullptr>
void ProcessInput(CUdeviceptr &devPtr, T input, int const numel,
char *paramBuffer, size_t &paramBufferSize) {
ADD_TO_PARAM_BUFFER(input, __alignof(input));
}
template <typename T>
CUdeviceptr ProcessEachInput(T input, int const numel, char *paramBuffer,
size_t &paramBufferSize) {
CUdeviceptr devPtr(0LL);
ProcessInput<T>(devPtr, input, numel, paramBuffer, paramBufferSize);
return devPtr;
}
// Forward declare
template <typename OutputType, typename... InputTypes>
int RunPtx(std::string const &ptx, int const numel, int deviceID,
std::vector<OutputType *> const &outputPtrs, InputTypes... inputs);
/*
* Overload to convert single outputs to vector of outputs
*/
template <typename OutputType, typename... InputTypes>
inline int RunPtx(std::string const &ptx, int const numel, int deviceID,
OutputType *output, InputTypes... inputs) {
return RunPtx(ptx, numel, deviceID, std::vector<OutputType *>{output},
inputs...);
}
template <typename OutputType, typename... InputTypes>
inline int RunPtx(std::string const &ptx, int const numel, int deviceID,
std::vector<OutputType *> const &outputPtrs,
InputTypes... inputs) {
CUdevice device;
CUmodule cudaModule;
CUcontext context;
CUfunction function;
int devCount;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&devCount));
checkCudaErrors(cuDeviceGet(&device, deviceID));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
// std::cout << "Using CUDA Device [0]: " << name << "\n";
int devMajor, devMinor;
checkCudaErrors(cuDeviceGetAttribute(
&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
checkCudaErrors(cuDeviceGetAttribute(
&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
// std::cout << "Device Compute Capability: " << devMajor << "." << devMinor
// << "\n";
if (devMajor < 2) {
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
return 1;
}
checkCudaErrors(cuCtxCreate(&context, 0, device));
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx.c_str(), 0, 0, 0));
/* Get kernel function */
checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "KERNEL"));
CUdeviceptr numelDevPtr;
checkCudaErrors(cuMemAlloc(&numelDevPtr, sizeof(numel)));
checkCudaErrors(cuMemcpyHtoD(numelDevPtr, &numel, sizeof(numel)));
/* Kernel parameters using EXTRA field */
char paramBuffer[1024];
size_t paramBufferSize = 0;
/*Set up output and array size info*/
size_t numOutputs = outputPtrs.size();
std::vector<CUdeviceptr> outputDevPtrs(numOutputs);
for (size_t i = 0; i < numOutputs; i++) {
checkCudaErrors(cuMemAlloc(&outputDevPtrs[i], sizeof(OutputType) * numel));
CUdeviceptr opPtr =
outputDevPtrs[i]; // Fails to build on windows without this
ADD_TO_PARAM_BUFFER(opPtr, __alignof(opPtr));
}
// Expand parameter pack of inputs
CUdeviceptr inputDevPtrs[] = {
ProcessEachInput(inputs, numel, paramBuffer, paramBufferSize)...};
ADD_TO_PARAM_BUFFER(numel, __alignof(numel));
unsigned blockSizeX = unsigned(numel);
unsigned blockSizeY, blockSizeZ, gridSizeX, gridSizeY, gridSizeZ;
blockSizeY = blockSizeZ = gridSizeX = gridSizeY = gridSizeZ = 1;
void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &paramBufferSize,
CU_LAUNCH_PARAM_END};
// std::cout << "Launching kernel\n";
/* Kernel launch */
checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
NULL, extra));
cudaDeviceSynchronize();
for (size_t i = 0; i < numOutputs; i++) {
checkCudaErrors(cuMemcpyDtoH(outputPtrs[i], outputDevPtrs[i],
sizeof(OutputType) * numel));
checkCudaErrors(cuMemFree(outputDevPtrs[i]));
}
for (unsigned i = 0; i < sizeof...(InputTypes); i++) {
checkCudaErrors(cuMemFree(inputDevPtrs[i]));
}
checkCudaErrors(cuModuleUnload(cudaModule));
checkCudaErrors(cuCtxDestroy(context));
return 0;
}
bool readFileToString(std::string &ptx, std::string const &pathToFile) {
try {
std::ifstream t(pathToFile);
std::stringstream buffer;
buffer << t.rdbuf();
ptx = buffer.str();
return true;
} catch (...) {
return false;
}
}
int main(void) {
std::string llvm_16_ptx, llvm_19_ptx;
readFileToString(llvm_16_ptx, "llvm16.ptx");
readFileToString(llvm_19_ptx, "llvm19.ptx");
int const numel = 1;
std::complex<int8_t> input{1, 2};
std::array<std::complex<double>, numel> output;
output.fill(std::complex<double>{0.0, 0.0});
std::cout << "Input:" << std::endl << static_cast<std::complex<int>>(input) << std::endl;
RunPtx(llvm_16_ptx, numel, /*deviceID=*/0, output.data(), input);
std::cout << "Output with LLVM 16:" << std::endl;
for (int i = 0; i < numel; i++) {
std::cout << output[i] << std::endl;
}
RunPtx(llvm_19_ptx, numel, /*deviceID=*/0, output.data(), input);
std::cout << "Output with LLVM 19:" << std::endl;
for (int i = 0; i < numel; i++) {
std::cout << output[i] << std::endl;
}
}
```
</details>
**Build commands**
```bash
# Generating PTX from the repro LLVM IR using version 16 and 19
<LLVM_16_LOCATION>/bin/llc -mcpu=sm_86 repro.ll -o llvm16.ptx
<LLVM_19_LOCATION>/bin/llc -mcpu=sm_86 repro.ll -o llvm19.ptx
# Generating executable using NVCC
<CUDA_LOCATION>/bin/nvcc -lcuda -std=c++17 repro.cu -o repro.out
```
**Execution**
```bash
$ ./repro.out
Input:
(1,2)
Output with LLVM 16:
(1,2)
Output with LLVM 19:
(1,0)
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzMW1tz4rjy_zTKiyouI3MxD3kwBmbz39wqIVs7-0IJW4DOGMklywT24f_ZT-nmG5DLnN2qoaYmWFb_utXdanVLAhcF3TBCbsBgAgbTK1zKLRc3Ahdbku6O8mrF0-PNYksLSIuiJJAWMBEES5JCXMCi3GxIoR4og1sp8wIEEUBzgOYbKrflykv4DqB5lu3dn-tc8P-QRAI014gFQPOeP0K9sQf8CPhRGyYpU-xteLrimfS42AA0_1tRkFVyv96HI7ilm21GN1tZQLklMKXrNRGEJaRQMqmmp8WfcEMYEVrqNRe6VZBc8LRMiIC3z3BD94TBFcn4G1wBNH-Dd3d_3MPeEGKWwqQUgjAJJc0hX8MdpgyuBGbJ1soMgjglEtOsAMHMPBflbofFEQSzR0E3lOHMQN4-gyAGaF6_txBD3_zTulIQE0OwJ6KgnIEg6g093xvad_c8LTNyO4UgmEKARjv9DNAI-FHBS5GQ5ZpmhOEdsV2Q64KAH0ksNkTCFEuc4SMvpetErumwD4Jo2L-mPRQqtii83veGWoDrfYBAEAXomukW_TTstzCloHlWMWX7XB6G_Wu2pynF18qepjfwo5SsKSNwz2kKQd__ffb8MLsDKMylgDhNRZHjhAAU9gAaQ5zRDYMhBGjAS-kDFEMQxAgeIA1BMFPtlOlmGiD1hIXAxxf6N9HUXTgwmgA_SrYk-aGczY-gopE0XR606AnOMoPU1ybx2H6_8wTBqZfLg1cIsvEkTb0DQCFAYwfAvobAzkAkEn8JQ_c_D3L8IsixM5ivisLOyuJr-l2ZOcs4XG3DSlbXvae74zR13f1Wv4Prh7qwPdvP2MB1yygjWCyUi7Vhke3e6h3oXjTZ5bDMpOtaYViS2rUsWV-TkYMUOJEkIzsVLi6557APfUc40IQFlXydQxoaLMlhystVVsEPvwjfc4SjU_jhKXxoBs0KIrrotmMwgyVLyVoxME1a-JPhjD-DpDi2gUZtwVcC0p62hmrP8Ipk6ulFCqwi_R1lJOYpmWQ8-dHqQQ5UmtCivwURVKEyFyQtbDy6DKIcTEcDJYEgUoclA3aeJojgO58TxjV4IbkgZ9UyVoKcjX6NqGcCYU8tBEpTJ4MfTd2iNIHzkiWScgYjKYVaVSHjaiKvcPIDMr4WhEDGiyNLIOMle6MshUVOkjLDEivjvNEsE0SWgsEd2XGhIgTjjJjpnZIkw4J8IU5CgAL_V5OP_eoCtsLqLyzh8ReWkF1SIpZS0FUpSaEbzXwdTf5BIWE9J1FPi4YZ4xIrpbj40FMJCer5za5-4935qFAnTbHuh9APIhjJVIplM6GeRXT5pUlPAZq3M9YPktgYoAlAEzgVdE8EVFHwgzw2yXM9CpXFw8UxJ_DISwETnhK4JYIo-biAGccpxAySA97lGXEJNQooS7Iy1XFSL7eWQ-tFwhXN4dyrdSEFwbtzryi__K5455UUlG3OvdmTRHJRq6Ato6pgttVLyrIq59VLQlymeCYEFwVAYfwqSKESDyJEnaVCSNcQoJAIoa0cTGH8Oo2WL69xPHt5camWWllSlcwGUWIy-hgEcd1oB6D8Hyl6qNnqVVK5i5oTk7q35MuKwEnTASQszUAwaS06KLBZfXR3--1h-foEUMjX64LIau1SWYGGG8Q2b6h6jK3Hh-02NFFtLeJraNfFIfx_0__M6xOZptPl4nH5FD1H98vJ63w-ewYo3OOsJBelS7mOBu4RNgeWY4F3k1KVnKbQaGOo4FfT7cguyY9tKj20MygADWvBdFZQ0L8JX7caNX4z5ahYdRAVF5MHdjHaAO9_KnONpvBtS3XmFvqVliXZ5RmW2uflMTeV50KJXj01HEeFzCVd62llG2mxzDllkggQxAs1YXSzs47iXbcqTAhQpP2FlVmWSzsB9cx6EjwhRXHL8lLqaZWSPU2I
CqIADVOyf5JCibaA1HSJIWUSJpwVErJypyKoU2b7k2yxwogaKnbmWUoFfmLNehafzvekvCe7KMt4on24IVllK4AiK-NYD9hKZ-3_Hm6SH3-TfApQWMNWw_0K_vlpU4Mul9rrNZxrremr2PCzHgJQ79d2kv_FJ87rtmLYVK0z06c1q4faHJYd8QwnWzfqdwd3YWRK8E8Nrsnb6TP07-5q32rZwFr0rL9WInWUfMrdIttczEJ1Vymdlcy5eMMihTaDvKTIx1LmpVQpTNNXPc-DWmz1wqZMSoPPJXuSKsvsrLtWtUph8nBB3arJaOx2ehKAKjyXbsS1YMpGqoaseHD96knl2yhuyKmk1jotai-y-tCbYGq2PO6J0GmZ5ApwT4SEBWWbjEADW6g3RgrI167Rkiusf0CROkv6V_VpP20lmrG8ozPn29a9KumsEBXrBtvPWA6MJpb1yElqWHqe98n5_ktq9-edVsF9YIMqvlgx3HoVv5qtbqhyb7NRXr9KOJPkoAej_tYv1q5MdV_cKzvOmJdMVlPm_Kp7y6h0WVEVic51nGqBvxFpUN3ib58-S1zRqcG3nG7cnN42a1HuAAaTHgrBYPoZ-AdFgEJNh2KoCCsmLRYmnF6oPwBCryp86JoFGnAIBhNfS6GLD9fVnlY4MjCImfpTj8Oa4h7_hwsryz1lXLyXCVXDiVyhr9TmnNXorwKMX5fT2R-38WwZLRbPt5PXxWwZP94_6b_RUzS5vbtdfF_eR__3-HxeGT8vgx7IF2S4fbgkw4cGsVaI-S4vJYExzvGKZlQeOwZxqmmQep3XWuqK5QXjuQq2iQdRcyo3ZdV1bgU0e35-fFZyWaF9SAvIuIQv9xB5PuQCbvSRpOiwhC5I91yLCaBnLRTLQ6xRzJxy0QHF0L-g4wtAJt7ccZxOscSzg8WrI5He4j14iSqv7UGJ4-L-a3MxqzP8RiQ0uztVhKrX2wtFgOb4jUi3BWeEqQKcyvBaggGE3IYSOgkizWxOLwLTRm71ieKmSdOsQJoVx7uzqFXTdNAc_ofIdstBq_R3o06dQxJJRAFLHalmfy6eI7imJEvbKsailX8OJj0f9eto6rLik_o7mEL_hP0LkbDMbfqkD5r1RpcGgZSteYO1BWbl7tGmYAqzXjC9Qie_YaNqO116GwZU666hNiosjLYseANlrSYrCi1_Wg1Ff40bAuk2s0vYmNcfe0VbisGEKnU2jdjMay6VqE3X5PmTquCCaSMhOcskmEBXCGCa6aR2VdIshZzBN8pS_lbANyq3KnTKLS3c_s_Zik1z7VZstrEhaJXCQcd7dsiV6SsfhDlOfqi02qY73Zmnm-uRgMHU7Vg36qlmhefSpq_UUCrrHE1rjz0_5gqvOeYT4-gRlExfNEnhKuPJD8XlTy22a28SWs87JfmueFVPf6mnjaCpRms-fG8-_OXwahjNucbRjxVQ6-l768n07DWGZa4uoEgfzbatEb8u76LXh_i3ltaWT4-3D4vZ86kVjBufpXm5_WtmY9zpXuE5utnDtGm-D9OBO1yyZKsinz0-OEm82gEz0_0_Wn0M6u_2QCJsrjqfMduFiqLzqf2p5RxnXEUvrg-vd3efRLZdzbH7aUZdptgkJC9HlmwFZ50A_E9ET_je3t5U8t9UjKmLpjp4Xgiqnxn2l-LuZQHngpBKuq4gTYRGRLTaqib-ib6MbLokDhtFrdnI_vTiY0XrBNL3JLucW72yjONUN9R51AdpTCwPU1JIwY_q0SWaTc42b_U7Zf-K8wwKgtM5zciCv7jjmW7tXlftl6t6LLcLrnCa7ibF8VxCTu1hGpT69KJBWbtBh5MlWJnoZnqt7JGHCTvSE-mqXLdzFqhSYxOedWevzpG7Sb0UJWlYCiZY6pAUmh2TxjgswRpnBenY1mlW1ZU7TFV-rK99NOi7Ksyy_W7ZGy6ths3jWD9W1juxUIcIIKRaekNPN6DmruUF2nGXdtylrQrkxjZNc8FqRn97bBpT
JsOlVMmg2WgdTfQlKlStHo3x21PY-AxMdYmlzjKq_LIhnGnw1jTLml57Dmc08T1TDXn6SLw1yEvLmN1FjhoVauussm7EkibLBBfy_Hgok_oQYdbYdX_v9FMJVe2kdWxdJUlmCa22aIKpWT79Ol57KZa4Lgmr_f6PR25itc5W3c3Vd_TQye2Vz5xbmpQVL6xKZ-Ww9jah9F3GdWTtaG38b2jt80ob_8JKa0Wsjy9zKC1FE13OJHy3wywtTFvnmsYKF1tzRg6_mUvSKsg9Lf6Ea8F39VVpd3vZVsj2VrK7Id0bG0lUJ-X7d49xtLh9fNBTaL5SgXWeZQm83iV5CYJpsVuGQwPsZRm85rARD91tFAM2_lmwcQOsPTpyIElpbu2Y0Tz8EceGp75XcY4h2ycJvM7UMg-vtX2miTFyb2R5J6Xibb7zUnbt5Iwy09x1IvyePfrQA2heoUHgR1V8U-9DFamROX6_MP0_7jZudXOH-U6Wq_QmSMfBGF-Rm96o7w-Gw35vcLW9GaxXfhIOVniI-uPhICHJKFmHBPnhoI_7JL2iN8hHAx-hYc_v-72Bh1bJeJCmfdQL14NkjEDfJztMM09f1-Jic6V_DXDTQ2E4CK70pcZC_yoBIUbezK8P9K7i9Erc6B8SrMpNAfp-RgtZ1DCSyozcPPyhPNje8C8gZQkXgiRSO7Yef3i9olItl2RDhDvM0rEDvm1psoXmIo7-FcGb4GwDMSveiLgqRXbzP_zUwYxvf4P-GwAA___FT94p">