<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/128853">128853</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            NVPTX produces incorrect PTX with 8-bit integer vector input which results in wrong answer
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          rashedmyt
      </td>
    </tr>
</table>

<pre>
    This issue was created as suggested in https://github.com/llvm/llvm-project/issues/107219.

https://cuda.godbolt.org/z/1ebcMfv87 highlights the differences in the PTX generated for the reproducer IR given below between LLVM 16 and the current tip of the main branch.

<details>
<summary>Original LLVM IR</summary>

```llvm
; LLVM version:16.0.6
; ModuleID = 'module'
source_filename = "module"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Reproducer kernel: receives a <2 x i8> by value, sign-extends both lanes to
; double, and stores the resulting <2 x double> to %out0 for in-range threads.
; The bug report is about the second lane of %in0 being lowered incorrectly.
define void @KERNEL(ptr addrspace(1) align 8 %out0, <2 x i8> %in0, i32 %arraySize) addrspace(1) {
check:
  ; Linear thread id = (nctaid.x * ctaid.y + ctaid.x) * ntid.x + tid.x
  %tid_x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %ntid_x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  %ctaid_x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %ctaid_y = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  %nctaid_x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  %0 = mul i32 %nctaid_x, %ctaid_y
  %1 = add i32 %0, %ctaid_x
  %2 = mul i32 %1, %ntid_x
  %linearTid = add i32 %2, %tid_x
  ; Bounds check against the array size (unsigned compare).
  %3 = icmp ult i32 %linearTid, %arraySize
  ; Convert each i8 lane of %in0 to double and rebuild a <2 x double>.
  %4 = extractelement <2 x i8> %in0, i64 0
  %5 = sitofp i8 %4 to double
  %6 = extractelement <2 x i8> %in0, i64 1
  %7 = sitofp i8 %6 to double
  %8 = insertelement <2 x double> undef, double %5, i64 0
  %9 = insertelement <2 x double> %8, double %7, i64 1
  br i1 %3, label %StraightLineCodeBlock, label %exit

exit: ; preds = %StraightLineCodeBlock, %check
  ret void

StraightLineCodeBlock:                            ; preds = %check
  ; Store the converted pair to the output buffer.
  store <2 x double> %9, ptr addrspace(1) %out0, align 16
 br label %exit
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!nvvm.annotations = !{!0}

!0 = !{ptr addrspace(1) @KERNEL, !"kernel", i32 1}
```
</details>

<details>
<summary>C++ Driver Code</summary>

```cpp
// Type your code here, or load an example.

#include <array>
#include <complex>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <cuda.h>

inline void checkCudaErrors(CUresult err) {
  if (err != CUDA_SUCCESS)
    std::cout << std::string("CUDA Error: ") + std::to_string(err) << std::endl;
}

// Rounds `offset` up to the next multiple of `alignment` (alignment must be a
// power of two for the mask trick to be valid).
#define ALIGN_UP(offset, alignment) \
  (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

// Appends `value` into the kernel parameter buffer at its natural alignment
// and advances the running size. Relies on `paramBuffer` (char array) and
// `paramBufferSize` (size_t) being in scope at the expansion site.
#define ADD_TO_PARAM_BUFFER(value, alignment) \
  do { \
    ALIGN_UP(paramBufferSize, alignment); \
    memcpy(paramBuffer + paramBufferSize, &(value), sizeof(value));            \
    paramBufferSize += sizeof(value);                                          \
  } while (0)

// Pointer-input overload: allocates a device buffer of `numel` elements,
// copies the host data to it, and appends the resulting device pointer to the
// kernel parameter buffer. `devPtr` receives the allocation so the caller can
// free it after the launch.
template <typename T, typename std::enable_if<std::is_pointer<T>::value,
 T>::type * = nullptr>
void ProcessInput(CUdeviceptr &devPtr, T input, int const numel,
                  char *paramBuffer, size_t &paramBufferSize) {
  checkCudaErrors(cuMemAlloc(&devPtr, sizeof(*input) * numel));
  checkCudaErrors(cuMemcpyHtoD(devPtr, input, sizeof(*input) * numel));
  ADD_TO_PARAM_BUFFER(devPtr, __alignof(devPtr));
}

// By-value overload: copies the raw bytes of `input` straight into the kernel
// parameter buffer (this is how the <2 x i8> vector argument is passed).
// No device memory is allocated, so `devPtr` is left untouched (stays 0).
template <typename T, typename std::enable_if<!std::is_pointer<T>::value,
 T>::type * = nullptr>
void ProcessInput(CUdeviceptr &devPtr, T input, int const numel,
 char *paramBuffer, size_t &paramBufferSize) {
  ADD_TO_PARAM_BUFFER(input, __alignof(input));
}

template <typename T>
CUdeviceptr ProcessEachInput(T input, int const numel, char *paramBuffer,
 size_t &paramBufferSize) {
  CUdeviceptr devPtr(0LL);
 ProcessInput<T>(devPtr, input, numel, paramBuffer, paramBufferSize);
 return devPtr;
}

// Forward declare
template <typename OutputType, typename... InputTypes>
int RunPtx(std::string const &ptx, int const numel, int deviceID,
           std::vector<OutputType *> const &outputPtrs, InputTypes... inputs);

/*
 * Convenience overload: wraps a single output pointer in a one-element vector
 * and forwards to the main RunPtx template.
 */
template <typename OutputType, typename... InputTypes>
inline int RunPtx(std::string const &ptx, int const numel, int deviceID,
                  OutputType *output, InputTypes... inputs) {
  std::vector<OutputType *> outputPtrs{output};
  return RunPtx(ptx, numel, deviceID, outputPtrs, inputs...);
}

template <typename OutputType, typename... InputTypes>
inline int RunPtx(std::string const &ptx, int const numel, int deviceID,
                  std::vector<OutputType *> const &outputPtrs,
 InputTypes... inputs) {
  CUdevice device;
  CUmodule cudaModule;
  CUcontext context;
  CUfunction function;
  int devCount;

  checkCudaErrors(cuInit(0));
 checkCudaErrors(cuDeviceGetCount(&devCount));
 checkCudaErrors(cuDeviceGet(&device, deviceID));

  char name[128];
 checkCudaErrors(cuDeviceGetName(name, 128, device));
//   std::cout << "Using CUDA Device [0]: " << name << "\n";

  int devMajor, devMinor;
  checkCudaErrors(cuDeviceGetAttribute(
      &devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
 checkCudaErrors(cuDeviceGetAttribute(
      &devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
//   std::cout << "Device Compute Capability: " << devMajor << "." << devMinor
// << "\n";
  if (devMajor < 2) {
    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
    return 1;
  }
 checkCudaErrors(cuCtxCreate(&context, 0, device));

 checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx.c_str(), 0, 0, 0));

 /* Get kernel function */
  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "KERNEL"));

  CUdeviceptr numelDevPtr;
 checkCudaErrors(cuMemAlloc(&numelDevPtr, sizeof(numel)));
 checkCudaErrors(cuMemcpyHtoD(numelDevPtr, &numel, sizeof(numel)));

  /* Kernel parameters using EXTRA field */
  char paramBuffer[1024];
  size_t paramBufferSize = 0;

  /*Set up output and array size info*/
  size_t numOutputs = outputPtrs.size();
  std::vector<CUdeviceptr> outputDevPtrs(numOutputs);
  for (size_t i = 0; i < numOutputs; i++) {
 checkCudaErrors(cuMemAlloc(&outputDevPtrs[i], sizeof(OutputType) * numel));
 CUdeviceptr opPtr =
        outputDevPtrs[i]; // Fails to build on windows without this
    ADD_TO_PARAM_BUFFER(opPtr, __alignof(opPtr));
 }

  // Expand parameter pack of inputs
  CUdeviceptr inputDevPtrs[] = {
 ProcessEachInput(inputs, numel, paramBuffer, paramBufferSize)...};

 ADD_TO_PARAM_BUFFER(numel, __alignof(numel));

  unsigned blockSizeX = unsigned(numel);
  unsigned blockSizeY, blockSizeZ, gridSizeX, gridSizeY, gridSizeZ;
  blockSizeY = blockSizeZ = gridSizeX = gridSizeY = gridSizeZ = 1;

  void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer,
 CU_LAUNCH_PARAM_BUFFER_SIZE, &paramBufferSize,
 CU_LAUNCH_PARAM_END};

//   std::cout << "Launching kernel\n";

  /* Kernel launch */
  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
                                 blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
                                 NULL, extra));

  cudaDeviceSynchronize();

  for (size_t i = 0; i < numOutputs; i++) {
    checkCudaErrors(cuMemcpyDtoH(outputPtrs[i], outputDevPtrs[i],
                                 sizeof(OutputType) * numel));
    checkCudaErrors(cuMemFree(outputDevPtrs[i]));
  }

  for (unsigned i = 0; i < sizeof...(InputTypes); i++) {
 checkCudaErrors(cuMemFree(inputDevPtrs[i]));
  }

 checkCudaErrors(cuModuleUnload(cudaModule));
 checkCudaErrors(cuCtxDestroy(context));
  return 0;
}

// Reads the entire file at `pathToFile` into `ptx`.
// Returns true on success, false when the file cannot be opened.
// Fix: std::ifstream does not throw by default, so the previous try/catch was
// dead code and a missing file silently produced an empty string with a
// `true` return; the open state is now checked explicitly.
bool readFileToString(std::string &ptx, std::string const &pathToFile) {
  std::ifstream t(pathToFile);
  if (!t.is_open())
    return false;
  std::stringstream buffer;
  buffer << t.rdbuf();
  ptx = buffer.str();
  return true;
}

int main(void) {
  std::string llvm_16_ptx, llvm_19_ptx;
  readFileToString(llvm_16_ptx, "llvm16.ptx");
 readFileToString(llvm_19_ptx, "llvm19.ptx");

  int const numel = 1;
 std::complex<int8_t> input{1, 2};
  std::array<std::complex<double>, numel> output;

  output.fill(std::complex<double>{0.0, 0.0});

  std::cout << "Input:" << std::endl << static_cast<std::complex<int>>(input) << std::endl;

  RunPtx(llvm_16_ptx, numel, /*deviceID=*/0, output.data(), input);

  std::cout << "Output with LLVM 16:" << std::endl;
  for (int i = 0; i < numel; i++) {
    std::cout << output[i] << std::endl;
  }

 RunPtx(llvm_19_ptx, numel, /*deviceID=*/0, output.data(), input);

 std::cout << "Output with LLVM 19:" << std::endl;
  for (int i = 0; i < numel; i++) {
    std::cout << output[i] << std::endl;
 }
}
```
</details>

**Build commands**

```bash
# Generating PTX from the repro LLVM IR using version 16 and 19
<LLVM_16_LOCATION>/bin/llc -mcpu=sm_86 repro.ll -o llvm16.ptx

<LLVM_19_LOCATION>/bin/llc -mcpu=sm_86 repro.ll -o llvm19.ptx

# Generating executable using NVCC
<CUDA_LOCATION>/bin/nvcc -lcuda -std=c++17 repro.cu -o repro.out
```

**Execution**

```bash
$ ./repro.out 
Input:
(1,2)
Output with LLVM 16:
(1,2)
Output with LLVM 19:
(1,0)
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzMW1tz4rjy_zTKiyouI3MxD3kwBmbz39wqIVs7-0IJW4DOGMklywT24f_ZT-nmG5DLnN2qoaYmWFb_utXdanVLAhcF3TBCbsBgAgbTK1zKLRc3Ahdbku6O8mrF0-PNYksLSIuiJJAWMBEES5JCXMCi3GxIoR4og1sp8wIEEUBzgOYbKrflykv4DqB5lu3dn-tc8P-QRAI014gFQPOeP0K9sQf8CPhRGyYpU-xteLrimfS42AA0_1tRkFVyv96HI7ilm21GN1tZQLklMKXrNRGEJaRQMqmmp8WfcEMYEVrqNRe6VZBc8LRMiIC3z3BD94TBFcn4G1wBNH-Dd3d_3MPeEGKWwqQUgjAJJc0hX8MdpgyuBGbJ1soMgjglEtOsAMHMPBflbofFEQSzR0E3lOHMQN4-gyAGaF6_txBD3_zTulIQE0OwJ6KgnIEg6g093xvad_c8LTNyO4UgmEKARjv9DNAI-FHBS5GQ5ZpmhOEdsV2Q64KAH0ksNkTCFEuc4SMvpetErumwD4Jo2L-mPRQqtii83veGWoDrfYBAEAXomukW_TTstzCloHlWMWX7XB6G_Wu2pynF18qepjfwo5SsKSNwz2kKQd__ffb8MLsDKMylgDhNRZHjhAAU9gAaQ5zRDYMhBGjAS-kDFEMQxAgeIA1BMFPtlOlmGiD1hIXAxxf6N9HUXTgwmgA_SrYk-aGczY-gopE0XR606AnOMoPU1ybx2H6_8wTBqZfLg1cIsvEkTb0DQCFAYwfAvobAzkAkEn8JQ_c_D3L8IsixM5ivisLOyuJr-l2ZOcs4XG3DSlbXvae74zR13f1Wv4Prh7qwPdvP2MB1yygjWCyUi7Vhke3e6h3oXjTZ5bDMpOtaYViS2rUsWV-TkYMUOJEkIzsVLi6557APfUc40IQFlXydQxoaLMlhystVVsEPvwjfc4SjU_jhKXxoBs0KIrrotmMwgyVLyVoxME1a-JPhjD-DpDi2gUZtwVcC0p62hmrP8Ipk6ulFCqwi_R1lJOYpmWQ8-dHqQQ5UmtCivwURVKEyFyQtbDy6DKIcTEcDJYEgUoclA3aeJojgO58TxjV4IbkgZ9UyVoKcjX6NqGcCYU8tBEpTJ4MfTd2iNIHzkiWScgYjKYVaVSHjaiKvcPIDMr4WhEDGiyNLIOMle6MshUVOkjLDEivjvNEsE0SWgsEd2XGhIgTjjJjpnZIkw4J8IU5CgAL_V5OP_eoCtsLqLyzh8ReWkF1SIpZS0FUpSaEbzXwdTf5BIWE9J1FPi4YZ4xIrpbj40FMJCer5za5-4935qFAnTbHuh9APIhjJVIplM6GeRXT5pUlPAZq3M9YPktgYoAlAEzgVdE8EVFHwgzw2yXM9CpXFw8UxJ_DISwETnhK4JYIo-biAGccpxAySA97lGXEJNQooS7Iy1XFSL7eWQ-tFwhXN4dyrdSEFwbtzryi__K5455UUlG3OvdmTRHJRq6Ato6pgttVLyrIq59VLQlymeCYEFwVAYfwqSKESDyJEnaVCSNcQoJAIoa0cTGH8Oo2WL69xPHt5camWWllSlcwGUWIy-hgEcd1oB6D8Hyl6qNnqVVK5i5oTk7q35MuKwEnTASQszUAwaS06KLBZfXR3--1h-foEUMjX64LIau1SWYGGG8Q2b6h6jK3Hh-02NFFtLeJraNfFIfx_0__M6xOZptPl4nH5FD1H98vJ63w-ewYo3OOsJBelS7mOBu4RNgeWY4F3k1KVnKbQaGOo4FfT7cguyY9tKj20MygADWvBdFZQ0L8JX7caNX4z5ahYdRAVF5MHdjHaAO9_KnONpvBtS3XmFvqVliXZ5RmW2uflMTeV50KJXj01HEeFzCVd62llG2mxzDllkggQxAs1YXSzs47iXbcqTAhQpP2FlVmWSzsB9cx6EjwhRXHL8lLqaZWSPU2I
CqIADVOyf5JCibaA1HSJIWUSJpwVErJypyKoU2b7k2yxwogaKnbmWUoFfmLNehafzvekvCe7KMt4on24IVllK4AiK-NYD9hKZ-3_Hm6SH3-TfApQWMNWw_0K_vlpU4Mul9rrNZxrremr2PCzHgJQ79d2kv_FJ87rtmLYVK0z06c1q4faHJYd8QwnWzfqdwd3YWRK8E8Nrsnb6TP07-5q32rZwFr0rL9WInWUfMrdIttczEJ1Vymdlcy5eMMihTaDvKTIx1LmpVQpTNNXPc-DWmz1wqZMSoPPJXuSKsvsrLtWtUph8nBB3arJaOx2ehKAKjyXbsS1YMpGqoaseHD96knl2yhuyKmk1jotai-y-tCbYGq2PO6J0GmZ5ApwT4SEBWWbjEADW6g3RgrI167Rkiusf0CROkv6V_VpP20lmrG8ozPn29a9KumsEBXrBtvPWA6MJpb1yElqWHqe98n5_ktq9-edVsF9YIMqvlgx3HoVv5qtbqhyb7NRXr9KOJPkoAej_tYv1q5MdV_cKzvOmJdMVlPm_Kp7y6h0WVEVic51nGqBvxFpUN3ib58-S1zRqcG3nG7cnN42a1HuAAaTHgrBYPoZ-AdFgEJNh2KoCCsmLRYmnF6oPwBCryp86JoFGnAIBhNfS6GLD9fVnlY4MjCImfpTj8Oa4h7_hwsryz1lXLyXCVXDiVyhr9TmnNXorwKMX5fT2R-38WwZLRbPt5PXxWwZP94_6b_RUzS5vbtdfF_eR__3-HxeGT8vgx7IF2S4fbgkw4cGsVaI-S4vJYExzvGKZlQeOwZxqmmQep3XWuqK5QXjuQq2iQdRcyo3ZdV1bgU0e35-fFZyWaF9SAvIuIQv9xB5PuQCbvSRpOiwhC5I91yLCaBnLRTLQ6xRzJxy0QHF0L-g4wtAJt7ccZxOscSzg8WrI5He4j14iSqv7UGJ4-L-a3MxqzP8RiQ0uztVhKrX2wtFgOb4jUi3BWeEqQKcyvBaggGE3IYSOgkizWxOLwLTRm71ieKmSdOsQJoVx7uzqFXTdNAc_ofIdstBq_R3o06dQxJJRAFLHalmfy6eI7imJEvbKsailX8OJj0f9eto6rLik_o7mEL_hP0LkbDMbfqkD5r1RpcGgZSteYO1BWbl7tGmYAqzXjC9Qie_YaNqO116GwZU666hNiosjLYseANlrSYrCi1_Wg1Ff40bAuk2s0vYmNcfe0VbisGEKnU2jdjMay6VqE3X5PmTquCCaSMhOcskmEBXCGCa6aR2VdIshZzBN8pS_lbANyq3KnTKLS3c_s_Zik1z7VZstrEhaJXCQcd7dsiV6SsfhDlOfqi02qY73Zmnm-uRgMHU7Vg36qlmhefSpq_UUCrrHE1rjz0_5gqvOeYT4-gRlExfNEnhKuPJD8XlTy22a28SWs87JfmueFVPf6mnjaCpRms-fG8-_OXwahjNucbRjxVQ6-l768n07DWGZa4uoEgfzbatEb8u76LXh_i3ltaWT4-3D4vZ86kVjBufpXm5_WtmY9zpXuE5utnDtGm-D9OBO1yyZKsinz0-OEm82gEz0_0_Wn0M6u_2QCJsrjqfMduFiqLzqf2p5RxnXEUvrg-vd3efRLZdzbH7aUZdptgkJC9HlmwFZ50A_E9ET_je3t5U8t9UjKmLpjp4Xgiqnxn2l-LuZQHngpBKuq4gTYRGRLTaqib-ib6MbLokDhtFrdnI_vTiY0XrBNL3JLucW72yjONUN9R51AdpTCwPU1JIwY_q0SWaTc42b_U7Zf-K8wwKgtM5zciCv7jjmW7tXlftl6t6LLcLrnCa7ibF8VxCTu1hGpT69KJBWbtBh5MlWJnoZnqt7JGHCTvSE-mqXLdzFqhSYxOedWevzpG7Sb0UJWlYCiZY6pAUmh2TxjgswRpnBenY1mlW1ZU7TFV-rK99NOi7Ksyy_W7ZGy6ths3jWD9W1juxUIcIIKRaekNPN6DmruUF2nGXdtylrQrkxjZNc8FqRn97bBpT
JsOlVMmg2WgdTfQlKlStHo3x21PY-AxMdYmlzjKq_LIhnGnw1jTLml57Dmc08T1TDXn6SLw1yEvLmN1FjhoVauussm7EkibLBBfy_Hgok_oQYdbYdX_v9FMJVe2kdWxdJUlmCa22aIKpWT79Ol57KZa4Lgmr_f6PR25itc5W3c3Vd_TQye2Vz5xbmpQVL6xKZ-Ww9jah9F3GdWTtaG38b2jt80ob_8JKa0Wsjy9zKC1FE13OJHy3wywtTFvnmsYKF1tzRg6_mUvSKsg9Lf6Ea8F39VVpd3vZVsj2VrK7Id0bG0lUJ-X7d49xtLh9fNBTaL5SgXWeZQm83iV5CYJpsVuGQwPsZRm85rARD91tFAM2_lmwcQOsPTpyIElpbu2Y0Tz8EceGp75XcY4h2ycJvM7UMg-vtX2miTFyb2R5J6Xibb7zUnbt5Iwy09x1IvyePfrQA2heoUHgR1V8U-9DFamROX6_MP0_7jZudXOH-U6Wq_QmSMfBGF-Rm96o7w-Gw35vcLW9GaxXfhIOVniI-uPhICHJKFmHBPnhoI_7JL2iN8hHAx-hYc_v-72Bh1bJeJCmfdQL14NkjEDfJztMM09f1-Jic6V_DXDTQ2E4CK70pcZC_yoBIUbezK8P9K7i9Erc6B8SrMpNAfp-RgtZ1DCSyozcPPyhPNje8C8gZQkXgiRSO7Yef3i9olItl2RDhDvM0rEDvm1psoXmIo7-FcGb4GwDMSveiLgqRXbzP_zUwYxvf4P-GwAA___FT94p">