[llvm] ef8655a - [NVPTX] Adapt tests to make them usable with CUDA-12.x

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 6 14:22:25 PDT 2023


Author: Artem Belevich
Date: 2023-06-06T14:22:12-07:00
New Revision: ef8655adc8e025f1614c8540a791560f1a2a6bbc

URL: https://github.com/llvm/llvm-project/commit/ef8655adc8e025f1614c8540a791560f1a2a6bbc
DIFF: https://github.com/llvm/llvm-project/commit/ef8655adc8e025f1614c8540a791560f1a2a6bbc.diff

LOG: [NVPTX] Adapt tests to make them usable with CUDA-12.x

CUDA-12 no longer supports 32-bit compilation.

Tests agnostic to 32/64 compilation mode are switched to use nvptx64.
Tests that do care about it have 32-bit ptxas compilation disabled with cuda-12+.

Differential Revision: https://reviews.llvm.org/D152199

Added: 
    

Modified: 
    llvm/test/CodeGen/NVPTX/access-non-generic.ll
    llvm/test/CodeGen/NVPTX/add-sub-128bit.ll
    llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
    llvm/test/CodeGen/NVPTX/addrspacecast.ll
    llvm/test/CodeGen/NVPTX/aggr-param.ll
    llvm/test/CodeGen/NVPTX/annotations.ll
    llvm/test/CodeGen/NVPTX/arg-lowering.ll
    llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
    llvm/test/CodeGen/NVPTX/arithmetic-int.ll
    llvm/test/CodeGen/NVPTX/async-copy.ll
    llvm/test/CodeGen/NVPTX/atomics-sm60.ll
    llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
    llvm/test/CodeGen/NVPTX/atomics.ll
    llvm/test/CodeGen/NVPTX/bf16.ll
    llvm/test/CodeGen/NVPTX/bfe.ll
    llvm/test/CodeGen/NVPTX/bug17709.ll
    llvm/test/CodeGen/NVPTX/bug22246.ll
    llvm/test/CodeGen/NVPTX/bug22322.ll
    llvm/test/CodeGen/NVPTX/bug26185-2.ll
    llvm/test/CodeGen/NVPTX/bug26185.ll
    llvm/test/CodeGen/NVPTX/bug52623.ll
    llvm/test/CodeGen/NVPTX/bypass-div.ll
    llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
    llvm/test/CodeGen/NVPTX/callchain.ll
    llvm/test/CodeGen/NVPTX/calling-conv.ll
    llvm/test/CodeGen/NVPTX/calls-with-phi.ll
    llvm/test/CodeGen/NVPTX/combine-min-max.ll
    llvm/test/CodeGen/NVPTX/compare-int.ll
    llvm/test/CodeGen/NVPTX/constant-vectors.ll
    llvm/test/CodeGen/NVPTX/convert-fp.ll
    llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
    llvm/test/CodeGen/NVPTX/ctlz.ll
    llvm/test/CodeGen/NVPTX/ctpop.ll
    llvm/test/CodeGen/NVPTX/cttz.ll
    llvm/test/CodeGen/NVPTX/disable-opt.ll
    llvm/test/CodeGen/NVPTX/div-ri.ll
    llvm/test/CodeGen/NVPTX/divrem-combine.ll
    llvm/test/CodeGen/NVPTX/envreg.ll
    llvm/test/CodeGen/NVPTX/fast-math.ll
    llvm/test/CodeGen/NVPTX/fma-assoc.ll
    llvm/test/CodeGen/NVPTX/fma-disable.ll
    llvm/test/CodeGen/NVPTX/fma.ll
    llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
    llvm/test/CodeGen/NVPTX/fp-literals.ll
    llvm/test/CodeGen/NVPTX/fp16.ll
    llvm/test/CodeGen/NVPTX/function-align.ll
    llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
    llvm/test/CodeGen/NVPTX/global-addrspace.ll
    llvm/test/CodeGen/NVPTX/global-ordering.ll
    llvm/test/CodeGen/NVPTX/global-visibility.ll
    llvm/test/CodeGen/NVPTX/globals_init.ll
    llvm/test/CodeGen/NVPTX/globals_lowering.ll
    llvm/test/CodeGen/NVPTX/half.ll
    llvm/test/CodeGen/NVPTX/i1-global.ll
    llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
    llvm/test/CodeGen/NVPTX/i1-param.ll
    llvm/test/CodeGen/NVPTX/i128-param.ll
    llvm/test/CodeGen/NVPTX/i8-param.ll
    llvm/test/CodeGen/NVPTX/idioms.ll
    llvm/test/CodeGen/NVPTX/imad.ll
    llvm/test/CodeGen/NVPTX/inline-asm.ll
    llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
    llvm/test/CodeGen/NVPTX/intrinsic-old.ll
    llvm/test/CodeGen/NVPTX/intrinsics.ll
    llvm/test/CodeGen/NVPTX/isspacep.ll
    llvm/test/CodeGen/NVPTX/ld-addrspace.ll
    llvm/test/CodeGen/NVPTX/ld-generic.ll
    llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
    llvm/test/CodeGen/NVPTX/ldparam-v4.ll
    llvm/test/CodeGen/NVPTX/ldu-i8.ll
    llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
    llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
    llvm/test/CodeGen/NVPTX/load-sext-i1.ll
    llvm/test/CodeGen/NVPTX/local-stack-frame.ll
    llvm/test/CodeGen/NVPTX/lower-byval-args.ll
    llvm/test/CodeGen/NVPTX/machine-sink.ll
    llvm/test/CodeGen/NVPTX/managed.ll
    llvm/test/CodeGen/NVPTX/mbarrier.ll
    llvm/test/CodeGen/NVPTX/minmax-negative.ll
    llvm/test/CodeGen/NVPTX/module-inline-asm.ll
    llvm/test/CodeGen/NVPTX/mulwide.ll
    llvm/test/CodeGen/NVPTX/named-barriers.ll
    llvm/test/CodeGen/NVPTX/nofunc.ll
    llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
    llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
    llvm/test/CodeGen/NVPTX/packed-aggr.ll
    llvm/test/CodeGen/NVPTX/param-align.ll
    llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
    llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
    llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
    llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
    llvm/test/CodeGen/NVPTX/pr16278.ll
    llvm/test/CodeGen/NVPTX/pr17529.ll
    llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
    llvm/test/CodeGen/NVPTX/refl1.ll
    llvm/test/CodeGen/NVPTX/reg-types.ll
    llvm/test/CodeGen/NVPTX/rotate.ll
    llvm/test/CodeGen/NVPTX/rotate_64.ll
    llvm/test/CodeGen/NVPTX/sched1.ll
    llvm/test/CodeGen/NVPTX/sched2.ll
    llvm/test/CodeGen/NVPTX/sext-in-reg.ll
    llvm/test/CodeGen/NVPTX/sext-params.ll
    llvm/test/CodeGen/NVPTX/shift-parts.ll
    llvm/test/CodeGen/NVPTX/short-ptr.ll
    llvm/test/CodeGen/NVPTX/simple-call.ll
    llvm/test/CodeGen/NVPTX/sqrt-approx.ll
    llvm/test/CodeGen/NVPTX/st-addrspace.ll
    llvm/test/CodeGen/NVPTX/st-generic.ll
    llvm/test/CodeGen/NVPTX/store-retval.ll
    llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
    llvm/test/CodeGen/NVPTX/surf-read.ll
    llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
    llvm/test/CodeGen/NVPTX/surf-write.ll
    llvm/test/CodeGen/NVPTX/symbol-naming.ll
    llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
    llvm/test/CodeGen/NVPTX/tex-read.ll
    llvm/test/CodeGen/NVPTX/texsurf-queries.ll
    llvm/test/CodeGen/NVPTX/vaargs.ll
    llvm/test/CodeGen/NVPTX/vec-param-load.ll
    llvm/test/CodeGen/NVPTX/vec8.ll
    llvm/test/CodeGen/NVPTX/vector-args.ll
    llvm/test/CodeGen/NVPTX/vector-call.ll
    llvm/test/CodeGen/NVPTX/vector-compare.ll
    llvm/test/CodeGen/NVPTX/vector-loads.ll
    llvm/test/CodeGen/NVPTX/vector-select.ll
    llvm/test/CodeGen/NVPTX/vector-stores.ll
    llvm/test/CodeGen/NVPTX/weak-global.ll
    llvm/test/CodeGen/NVPTX/weak-linkage.ll
    llvm/test/lit.cfg.py

Removed: 
    llvm/test/CodeGen/NVPTX/tuple-literal.ll


################################################################################
diff  --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index e91b443a8381c..91b7e403e7906 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
 ; RUN: opt -mtriple=nvptx-- < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix IR
 ; RUN: opt -mtriple=nvptx64-- < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix IR
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4

diff  --git a/llvm/test/CodeGen/NVPTX/add-sub-128bit.ll b/llvm/test/CodeGen/NVPTX/add-sub-128bit.ll
index 7c6897110f426..9d451e90650df 100644
--- a/llvm/test/CodeGen/NVPTX/add-sub-128bit.ll
+++ b/llvm/test/CodeGen/NVPTX/add-sub-128bit.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=COMMON,NOCARRY
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefixes=COMMON,CARRY
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=COMMON,NOCARRY
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefixes=COMMON,CARRY
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
index acc0d9ae79c39..e876f41489963 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .global .align 4 .u32 g = 42;
 ; CHECK: .visible .global .align 1 .b8 ga[4] = {0, 1, 2, 3};
-; CHECK: .visible .global .align 4 .u32 g2 = generic(g);
-; CHECK: .visible .global .align 4 .u32 g3 = g;
-; CHECK: .visible .global .align 8 .u32 g4[2] = {0, generic(g)};
-; CHECK: .visible .global .align 8 .u32 g5[2] = {0, generic(g)+8};
+; CHECK: .visible .global .align 8 .u64 g2 = generic(g);
+; CHECK: .visible .global .align 8 .u64 g3 = g;
+; CHECK: .visible .global .align 8 .u64 g4[2] = {0, generic(g)};
+; CHECK: .visible .global .align 8 .u64 g5[2] = {0, generic(g)+8};
 
 @g = addrspace(1) global i32 42
 @ga = addrspace(1) global [4 x i8] c"\00\01\02\03"
@@ -15,34 +15,34 @@
 @g4 = constant {ptr, ptr} {ptr null, ptr addrspacecast (ptr addrspace(1) @g to ptr)}
 @g5 = constant {ptr, ptr} {ptr null, ptr addrspacecast (ptr addrspace(1) getelementptr (i32, ptr addrspace(1) @g, i32 2) to ptr)}
 
-; CHECK: .visible .global .align 4 .u32 g6 = generic(ga)+2;
+; CHECK: .visible .global .align 8 .u64 g6 = generic(ga)+2;
 @g6 = addrspace(1) global ptr getelementptr inbounds (
   [4 x i8], ptr addrspacecast (ptr addrspace(1) @ga to ptr),
   i32 0, i32 2
 )
 
-; CHECK: .visible .global .align 4 .u32 g7 = generic(g);
+; CHECK: .visible .global .align 8 .u64 g7 = generic(g);
 @g7 = addrspace(1) global ptr addrspacecast (
   ptr addrspace(1) @g
   to ptr
 )
 
-; CHECK: .visible .global .align 4 .u32 g8[2] = {0, g};
+; CHECK: .visible .global .align 8 .u64 g8[2] = {0, g};
 @g8 = addrspace(1) global [2 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @g]
 
-; CHECK: .visible .global .align 4 .u32 g9[2] = {0, generic(g)};
+; CHECK: .visible .global .align 8 .u64 g9[2] = {0, generic(g)};
 @g9 = addrspace(1) global [2 x ptr] [
   ptr null,
   ptr addrspacecast (ptr addrspace(1) @g to ptr)
 ]
 
-; CHECK: .visible .global .align 4 .u32 g10[2] = {0, g};
+; CHECK: .visible .global .align 8 .u64 g10[2] = {0, g};
 @g10 = addrspace(1) global [2 x ptr addrspace(1)] [
   ptr addrspace(1) null,
   ptr addrspace(1) @g
 ]
 
-; CHECK: .visible .global .align 4 .u32 g11[2] = {0, generic(g)};
+; CHECK: .visible .global .align 8 .u64 g11[2] = {0, generic(g)};
 @g11 = addrspace(1) global [2 x ptr] [
   ptr null,
   ptr addrspacecast (ptr addrspace(1) @g to ptr)

diff  --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index 77ae69cfc14b0..ff51a67071dab 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32,G32
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64
-; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
 

diff  --git a/llvm/test/CodeGen/NVPTX/aggr-param.ll b/llvm/test/CodeGen/NVPTX/aggr-param.ll
index 88afbd62f6314..caa77fddaa7dd 100644
--- a/llvm/test/CodeGen/NVPTX/aggr-param.ll
+++ b/llvm/test/CodeGen/NVPTX/aggr-param.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Make sure aggregate param types get emitted properly.
 

diff  --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll
index 05a0944530018..37c317939cf05 100644
--- a/llvm/test/CodeGen/NVPTX/annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/annotations.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 @texture = internal addrspace(1) global i64 0, align 8

diff  --git a/llvm/test/CodeGen/NVPTX/arg-lowering.ll b/llvm/test/CodeGen/NVPTX/arg-lowering.ll
index be39b2cfb0417..11f85cedde7f2 100644
--- a/llvm/test/CodeGen/NVPTX/arg-lowering.ll
+++ b/llvm/test/CodeGen/NVPTX/arg-lowering.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .func  (.param .align 16 .b8 func_retval0[16]) foo0(
 ; CHECK:          .param .align 4 .b8 foo0_param_0[8]

diff  --git a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
index ba6754a333987..a244dfdb8b4ca 100644
--- a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
 
 ;; These tests should run for all targets

diff  --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
index 9bbe020e89c41..afee98fec5893 100644
--- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ;; These tests should run for all targets

diff  --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll
index 075f6ca0e52aa..5bf32c8dd2187 100644
--- a/llvm/test/CodeGen/NVPTX/async-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
-; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-11.0 && ! ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)

diff  --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
index 624ecb48c3433..de618c675674e 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
 
 ; CHECK-LABEL: .func test(

diff  --git a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
index 80643125a4c17..9d6eddcd65a18 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
 
 ; CHECK-LABEL: .func test_atomics_scope(

diff  --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 6b326465d44e9..e99d0fd05e346 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_32 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
 
 
 ; CHECK-LABEL: atom0

diff  --git a/llvm/test/CodeGen/NVPTX/bf16.ll b/llvm/test/CodeGen/NVPTX/bf16.ll
index c2cf804c5013a..ca5d09eac7b12 100644
--- a/llvm/test/CodeGen/NVPTX/bf16.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
 
 ; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
 @"bfloat_array" = addrspace(1) constant [4 x bfloat]
@@ -7,8 +7,8 @@
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_load_store
-; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load bfloat, ptr addrspace(1) %in
   store bfloat %val, ptr addrspace(1) %out
   ret void
@@ -16,8 +16,8 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 
 define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_bitcast_from_bfloat
-; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load bfloat, ptr addrspace(1) %in
   %val_int = bitcast bfloat %val to i16
   store i16 %val_int, ptr addrspace(1) %out
@@ -26,8 +26,8 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
 
 define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; CHECK-LABEL: @test_bitcast_to_bfloat
-; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to bfloat
   store bfloat %val_fp, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/NVPTX/bfe.ll b/llvm/test/CodeGen/NVPTX/bfe.ll
index 37bbc9fa3c370..58449a74dd6c3 100644
--- a/llvm/test/CodeGen/NVPTX/bfe.ll
+++ b/llvm/test/CodeGen/NVPTX/bfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 
 ; CHECK: bfe0

diff  --git a/llvm/test/CodeGen/NVPTX/bug17709.ll b/llvm/test/CodeGen/NVPTX/bug17709.ll
index 94ea3f502d241..6cfbc785cc71e 100644
--- a/llvm/test/CodeGen/NVPTX/bug17709.ll
+++ b/llvm/test/CodeGen/NVPTX/bug17709.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-; ModuleID = '__kernelgen_main_module'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-define private ptx_device { double, double } @__utils1_MOD_trace(ptr noalias %m) {
-entry:
-  ;unreachable
-  %t0 = insertvalue {double, double} undef, double 1.0, 0
-  %t1 = insertvalue {double, double} %t0, double 1.0, 1
-  ret { double, double } %t1
-}
-
-%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
-%"struct.array2_complex(kind=8).37.18.70" = type { ptr, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
-%"struct.array2_complex(kind=8).43.5.57" = type { ptr, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
- at replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
-
-; CHECK: .visible .entry __kernelgen_main
-define ptx_kernel void @__kernelgen_main(ptr nocapture %args, ptr) {
-entry:
-  %1 = tail call ptx_device { double, double } @__utils1_MOD_trace(ptr noalias @replacementOfAlloca8)
-  ret void
-}
-
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+
+; ModuleID = '__kernelgen_main_module'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define private ptx_device { double, double } @__utils1_MOD_trace(ptr noalias %m) {
+entry:
+  ;unreachable
+  %t0 = insertvalue {double, double} undef, double 1.0, 0
+  %t1 = insertvalue {double, double} %t0, double 1.0, 1
+  ret { double, double } %t1
+}
+
+%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
+%"struct.array2_complex(kind=8).37.18.70" = type { ptr, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+%"struct.array2_complex(kind=8).43.5.57" = type { ptr, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+ at replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
+
+; CHECK: .visible .entry __kernelgen_main
+define ptx_kernel void @__kernelgen_main(ptr nocapture %args, ptr) {
+entry:
+  %1 = tail call ptx_device { double, double } @__utils1_MOD_trace(ptr noalias @replacementOfAlloca8)
+  ret void
+}
+

diff  --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index be689861ab5b7..b091f7793a4b5 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index bb4a7c0ae5a81..67364ab356578 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index d090dd0a8f1f4..1ee29ef276458 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
 ; extending variants in the backend, but the ldg/ldu selection code may pick

diff  --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index bcbbb5cd08c7f..361e5de793f6a 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 ; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit
 ; registers in the backend, so these loads need special handling.

diff  --git a/llvm/test/CodeGen/NVPTX/bug52623.ll b/llvm/test/CodeGen/NVPTX/bug52623.ll
index 4359a662ef44e..fd5824dbc72d7 100644
--- a/llvm/test/CodeGen/NVPTX/bug52623.ll
+++ b/llvm/test/CodeGen/NVPTX/bug52623.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -verify-machineinstrs
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -verify-machineinstrs
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
 
 ; Check that llc will not crash even when first MBB doesn't contain
 ; any instruction.

diff  --git a/llvm/test/CodeGen/NVPTX/bypass-div.ll b/llvm/test/CodeGen/NVPTX/bypass-div.ll
index 78d02dc838774..5c41e603fd635 100644
--- a/llvm/test/CodeGen/NVPTX/bypass-div.ll
+++ b/llvm/test/CodeGen/NVPTX/bypass-div.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 ; 64-bit divides and rems should be split into a fast and slow path where
 ; the fast path uses a 32-bit operation.

diff  --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
index 56f9c905caa91..c5f7bd1bd1ba2 100644
--- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_50 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_50 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 -verify-machineinstrs | %ptxas-verify %}
 
 ; calls with a bitcasted function symbol should be fine, but in combination with
 ; a byval attribute were causing a segfault during isel. This testcase was

diff  --git a/llvm/test/CodeGen/NVPTX/callchain.ll b/llvm/test/CodeGen/NVPTX/callchain.ll
index 2d9eb738cc3bb..59f936bb6aac4 100644
--- a/llvm/test/CodeGen/NVPTX/callchain.ll
+++ b/llvm/test/CodeGen/NVPTX/callchain.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx"
 

diff  --git a/llvm/test/CodeGen/NVPTX/calling-conv.ll b/llvm/test/CodeGen/NVPTX/calling-conv.ll
index 7a5655a868185..1499eb1da45d6 100644
--- a/llvm/test/CodeGen/NVPTX/calling-conv.ll
+++ b/llvm/test/CodeGen/NVPTX/calling-conv.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 

diff  --git a/llvm/test/CodeGen/NVPTX/calls-with-phi.ll b/llvm/test/CodeGen/NVPTX/calls-with-phi.ll
index dcc31af57cdd7..af44b9e483c84 100644
--- a/llvm/test/CodeGen/NVPTX/calls-with-phi.ll
+++ b/llvm/test/CodeGen/NVPTX/calls-with-phi.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 2>&1 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
 
 ; Make sure the example doesn't crash with segfault
 

diff  --git a/llvm/test/CodeGen/NVPTX/combine-min-max.ll b/llvm/test/CodeGen/NVPTX/combine-min-max.ll
index e5dba45de0b4b..72d21407a2b67 100644
--- a/llvm/test/CodeGen/NVPTX/combine-min-max.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-min-max.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O2 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O2 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O2 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O2 | %ptxas-verify %}
 
 ; *************************************
 ; * Cases with no min/max

diff  --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll
index 3f82aec7563e0..25cf15cbfc7fd 100644
--- a/llvm/test/CodeGen/NVPTX/compare-int.ll
+++ b/llvm/test/CodeGen/NVPTX/compare-int.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ;; These tests should run for all targets

diff  --git a/llvm/test/CodeGen/NVPTX/constant-vectors.ll b/llvm/test/CodeGen/NVPTX/constant-vectors.ll
index 1cffb5afc19a0..a426d7cd2de67 100644
--- a/llvm/test/CodeGen/NVPTX/constant-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/constant-vectors.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-nvidia-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/convert-fp.ll b/llvm/test/CodeGen/NVPTX/convert-fp.ll
index 7f9c40d7f26ab..d46c8d1afd775 100644
--- a/llvm/test/CodeGen/NVPTX/convert-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-fp.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define i16 @cvt_u16_f32(float %x) {

diff  --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
index 38a2c6b361190..d7e2cede8a991 100644
--- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 

diff  --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 0fbe45b466f69..c9932e1a25a71 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/ctpop.ll b/llvm/test/CodeGen/NVPTX/ctpop.ll
index e89f2d6fcf27e..9c2c746d3e912 100644
--- a/llvm/test/CodeGen/NVPTX/ctpop.ll
+++ b/llvm/test/CodeGen/NVPTX/ctpop.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/cttz.ll b/llvm/test/CodeGen/NVPTX/cttz.ll
index 37e85349c42b6..9fe0895c743a4 100644
--- a/llvm/test/CodeGen/NVPTX/cttz.ll
+++ b/llvm/test/CodeGen/NVPTX/cttz.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/disable-opt.ll b/llvm/test/CodeGen/NVPTX/disable-opt.ll
index 18cc28a9560df..a42b10cd876c9 100644
--- a/llvm/test/CodeGen/NVPTX/disable-opt.ll
+++ b/llvm/test/CodeGen/NVPTX/disable-opt.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O0 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O0 | %ptxas-verify %}
 
 define void @foo(ptr %output) {
 ; CHECK-LABEL: .visible .func foo(

diff  --git a/llvm/test/CodeGen/NVPTX/div-ri.ll b/llvm/test/CodeGen/NVPTX/div-ri.ll
index c95350349c865..0578d5147b17f 100644
--- a/llvm/test/CodeGen/NVPTX/div-ri.ll
+++ b/llvm/test/CodeGen/NVPTX/div-ri.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-prec-divf32=0 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-prec-divf32=0 | %ptxas-verify %}
 
 define float @foo(float %a) {
 ; CHECK: div.approx.f32

diff  --git a/llvm/test/CodeGen/NVPTX/divrem-combine.ll b/llvm/test/CodeGen/NVPTX/divrem-combine.ll
index 323c40f2f0df1..c715a7a65d20f 100644
--- a/llvm/test/CodeGen/NVPTX/divrem-combine.ll
+++ b/llvm/test/CodeGen/NVPTX/divrem-combine.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O2 < %s -march=nvptx -mcpu=sm_35 | FileCheck %s --check-prefix=O2 --check-prefix=CHECK
-; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_35 | FileCheck %s --check-prefix=O0 --check-prefix=CHECK
-; RUN: %if ptxas %{ llc -O2 < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
+; RUN: llc -O2 < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix=O2 --check-prefix=CHECK
+; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix=O0 --check-prefix=CHECK
+; RUN: %if ptxas %{ llc -O2 < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 ; The following IR
 ;

diff  --git a/llvm/test/CodeGen/NVPTX/envreg.ll b/llvm/test/CodeGen/NVPTX/envreg.ll
index 5bd4b0cfee67e..fe1e411ee2204 100644
--- a/llvm/test/CodeGen/NVPTX/envreg.ll
+++ b/llvm/test/CodeGen/NVPTX/envreg.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 
 declare i32 @llvm.nvvm.read.ptx.sreg.envreg0()

diff  --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index cf86de2d54938..08944d4e33b80 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)

diff  --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
index d4e55351b85c2..8a3a50cea74c1 100644
--- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %}
 
 define ptx_device float @t1_f32(float %x, float %y, float %z,
                                 float %u, float %v) {

diff  --git a/llvm/test/CodeGen/NVPTX/fma-disable.ll b/llvm/test/CodeGen/NVPTX/fma-disable.ll
index f693d45ff8a4c..e2db31f9114c5 100644
--- a/llvm/test/CodeGen/NVPTX/fma-disable.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-disable.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %}
 

diff  --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 2dd85ee989217..8f6d02e89877f 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -1,43 +1,43 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | %ptxas-verify %}
-
-declare float @dummy_f32(float, float) #0
-declare double @dummy_f64(double, double) #0
-
-define ptx_device float @t1_f32(float %x, float %y, float %z) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
-  %a = fmul float %x, %y
-  %b = fadd float %a, %z
-  ret float %b
-}
-
-define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
-  %a = fmul float %x, %y
-  %b = fadd float %a, %z
-  %c = fadd float %a, %w
-  %d = call float @dummy_f32(float %b, float %c)
-  ret float %d
-}
-
-define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
-  %a = fmul double %x, %y
-  %b = fadd double %a, %z
-  ret double %b
-}
-
-define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
-  %a = fmul double %x, %y
-  %b = fadd double %a, %z
-  %c = fadd double %a, %w
-  %d = call double @dummy_f64(double %b, double %c)
-  ret double %d
-}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | %ptxas-verify %}
+
+declare float @dummy_f32(float, float) #0
+declare double @dummy_f64(double, double) #0
+
+define ptx_device float @t1_f32(float %x, float %y, float %z) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul float %x, %y
+  %b = fadd float %a, %z
+  ret float %b
+}
+
+define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul float %x, %y
+  %b = fadd float %a, %z
+  %c = fadd float %a, %w
+  %d = call float @dummy_f32(float %b, float %c)
+  ret float %d
+}
+
+define ptx_device double @t1_f64(double %x, double %y, double %z) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul double %x, %y
+  %b = fadd double %a, %z
+  ret double %b
+}
+
+define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul double %x, %y
+  %b = fadd double %a, %z
+  %c = fadd double %a, %w
+  %d = call double @dummy_f64(double %b, double %c)
+  ret double %d
+}

diff  --git a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
index 69b692513775d..461cfc2c814fb 100644
--- a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
-; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -march=nvptx64 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 
 ; ---- minimum ----
 

diff  --git a/llvm/test/CodeGen/NVPTX/fp-literals.ll b/llvm/test/CodeGen/NVPTX/fp-literals.ll
index 58313b7e47a87..8acc19d908854 100644
--- a/llvm/test/CodeGen/NVPTX/fp-literals.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-literals.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
 
 target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

diff  --git a/llvm/test/CodeGen/NVPTX/fp16.ll b/llvm/test/CodeGen/NVPTX/fp16.ll
index 207972c9275fa..bb3216f7f1975 100644
--- a/llvm/test/CodeGen/NVPTX/fp16.ll
+++ b/llvm/test/CodeGen/NVPTX/fp16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
-; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s | %ptxas-verify %}
+; RUN: llc -march=nvptx64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -march=nvptx64 -verify-machineinstrs < %s | %ptxas-verify %}
 
 declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
 declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone

diff  --git a/llvm/test/CodeGen/NVPTX/function-align.ll b/llvm/test/CodeGen/NVPTX/function-align.ll
index b65fc12d8e929..ee98dceb6d575 100644
--- a/llvm/test/CodeGen/NVPTX/function-align.ll
+++ b/llvm/test/CodeGen/NVPTX/function-align.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-NOT: .align 2
 define ptx_device void @foo() align 2 {

diff  --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 1d5e438c1b727..32f2d0ec52d27 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/global-addrspace.ll b/llvm/test/CodeGen/NVPTX/global-addrspace.ll
index f3a711a931a73..521e7bd131cfa 100644
--- a/llvm/test/CodeGen/NVPTX/global-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/global-addrspace.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; PTX32: .visible .global .align 4 .u32 i;

diff  --git a/llvm/test/CodeGen/NVPTX/global-ordering.ll b/llvm/test/CodeGen/NVPTX/global-ordering.ll
index aaa34bb4578e5..6403e2686668b 100644
--- a/llvm/test/CodeGen/NVPTX/global-ordering.ll
+++ b/llvm/test/CodeGen/NVPTX/global-ordering.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Make sure we emit these globals in def-use order

diff  --git a/llvm/test/CodeGen/NVPTX/global-visibility.ll b/llvm/test/CodeGen/NVPTX/global-visibility.ll
index ef88da7b896c1..d3030d590a8b7 100644
--- a/llvm/test/CodeGen/NVPTX/global-visibility.ll
+++ b/llvm/test/CodeGen/NVPTX/global-visibility.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; PTX does not support .hidden or .protected.
 ; Make sure we do not emit them.

diff  --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll
index b482843773bcd..bdfba6b602aec 100644
--- a/llvm/test/CodeGen/NVPTX/globals_init.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_init.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Make sure the globals constant initializers are not prone to host endianess 
 ; issues.

diff  --git a/llvm/test/CodeGen/NVPTX/globals_lowering.ll b/llvm/test/CodeGen/NVPTX/globals_lowering.ll
index a03fea4fa8781..b0b7aeb0900ac 100644
--- a/llvm/test/CodeGen/NVPTX/globals_lowering.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_lowering.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static | FileCheck %s --check-prefix CHK
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -relocation-model=static | FileCheck %s --check-prefix CHK
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -relocation-model=static | %ptxas-verify %}
 
 %MyStruct = type { i32, i32, float }
 @Gbl = internal addrspace(3) global [1024 x %MyStruct] zeroinitializer

diff  --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll
index 92acc43914b1e..dedd35789a2fb 100644
--- a/llvm/test/CodeGen/NVPTX/half.ll
+++ b/llvm/test/CodeGen/NVPTX/half.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
 
 ; CHECK: .b8 half_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
 @"half_array" = addrspace(1) constant [4 x half]
@@ -7,8 +7,8 @@
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_load_store
-; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load half, ptr addrspace(1) %in
   store half %val, ptr addrspace(1) %out
   ret void
@@ -16,8 +16,8 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 
 define void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @test_bitcast_from_half
-; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load half, ptr addrspace(1) %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, ptr addrspace(1) %out
@@ -26,8 +26,8 @@ define void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out)
 
 define void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; CHECK-LABEL: @test_bitcast_to_half
-; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll
index 3a4d87891328a..60540bf45945e 100644
--- a/llvm/test/CodeGen/NVPTX/i1-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-global.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
index 6c69e40832de2..6920be5cc4e9e 100644
--- a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: foo
 ; CHECK: setp

diff  --git a/llvm/test/CodeGen/NVPTX/i1-param.ll b/llvm/test/CodeGen/NVPTX/i1-param.ll
index c64c4d35779d7..375752b619a58 100644
--- a/llvm/test/CodeGen/NVPTX/i1-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-param.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"
@@ -8,7 +8,7 @@ target triple = "nvptx-nvidia-cuda"
 
 ; CHECK: .entry foo
 ; CHECK:   .param .u8 foo_param_0
-; CHECK:   .param .u32 foo_param_1
+; CHECK:   .param .u64 foo_param_1
 define void @foo(i1 %p, ptr %out) {
   %val = zext i1 %p to i32
   store i32 %val, ptr %out

diff  --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll
index 52178e65c5ee5..c2f23124049ca 100644
--- a/llvm/test/CodeGen/NVPTX/i128-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-param.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func callee(
 ; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],

diff  --git a/llvm/test/CodeGen/NVPTX/i8-param.ll b/llvm/test/CodeGen/NVPTX/i8-param.ll
index d8ca8fe28079d..a4c26ddd02949 100644
--- a/llvm/test/CodeGen/NVPTX/i8-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i8-param.ll
@@ -1,24 +1,24 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-
-; CHECK: .visible .func  (.param .b32 func_retval0) callee
-define i8 @callee(i8 %a) {
-; CHECK: ld.param.u8
-  %ret = add i8 %a, 42
-; CHECK: st.param.b32
-  ret i8 %ret
-}
-
-; CHECK: .visible .func caller
-define void @caller(ptr %a) {
-; CHECK: ld.u8
-  %val = load i8, ptr %a
-  %ret = tail call i8 @callee(i8 %val)
-; CHECK: ld.param.b32
-  store i8 %ret, ptr %a
-  ret void
-}
-
-  
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; CHECK: .visible .func  (.param .b32 func_retval0) callee
+define i8 @callee(i8 %a) {
+; CHECK: ld.param.u8
+  %ret = add i8 %a, 42
+; CHECK: st.param.b32
+  ret i8 %ret
+}
+
+; CHECK: .visible .func caller
+define void @caller(ptr %a) {
+; CHECK: ld.u8
+  %val = load i8, ptr %a
+  %ret = tail call i8 @callee(i8 %val)
+; CHECK: ld.param.b32
+  store i8 %ret, ptr %a
+  ret void
+}
+
+  

diff  --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll
index c6cc752d0ea18..e8fe47c303f92 100644
--- a/llvm/test/CodeGen/NVPTX/idioms.ll
+++ b/llvm/test/CodeGen/NVPTX/idioms.ll
@@ -2,7 +2,7 @@
 
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 %struct.S16 = type { i16, i16 }

diff  --git a/llvm/test/CodeGen/NVPTX/imad.ll b/llvm/test/CodeGen/NVPTX/imad.ll
index b3be772c5fa75..2ab94106a8a28 100644
--- a/llvm/test/CodeGen/NVPTX/imad.ll
+++ b/llvm/test/CodeGen/NVPTX/imad.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: imad
 define i32 @imad(i32 %a, i32 %b, i32 %c) {

diff  --git a/llvm/test/CodeGen/NVPTX/inline-asm.ll b/llvm/test/CodeGen/NVPTX/inline-asm.ll
index aa27bf1532565..aa493b45ed8c5 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define float @test(float %x) {
 entry:

diff  --git a/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll b/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
index 04c180d4d5c99..1f0c47c1171a8 100644
--- a/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
+++ b/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=nvptx < %s | FileCheck %s
-; RUN: %if ptxas %{ llc -march=nvptx < %s | %ptxas-verify %}
+; RUN: llc -march=nvptx64 < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -march=nvptx64 < %s | %ptxas-verify %}
 
 ; Test that %c works with immediates
 ; CHECK-LABEL: test_inlineasm_c_output_template0

diff  --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
index 862226b3abd4c..3930e6d774183 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -10,7 +10,7 @@
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \
 ; RUN:    -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \
 ; RUN:   | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define ptx_device i32 @test_tid_x() {

diff  --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index c6061d3502e91..c09c7a72fd101 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: test_fabsf(

diff  --git a/llvm/test/CodeGen/NVPTX/isspacep.ll b/llvm/test/CodeGen/NVPTX/isspacep.ll
index 8ac199aa4a0b7..437c84737a426 100644
--- a/llvm/test/CodeGen/NVPTX/isspacep.ll
+++ b/llvm/test/CodeGen/NVPTX/isspacep.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 declare i1 @llvm.nvvm.isspacep.const(ptr) readnone noinline
 declare i1 @llvm.nvvm.isspacep.global(ptr) readnone noinline

diff  --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
index 475405ec48502..8900b58f6603f 100644
--- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
 

diff  --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll
index 087de8c63709f..4af9d4e88813b 100644
--- a/llvm/test/CodeGen/NVPTX/ld-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 

diff  --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
index e6681fe744845..923d438b7f701 100644
--- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
+++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
@@ -2,10 +2,10 @@
 # LLVM generates correct PTX for them.
 
 # RUN: %python %s > %t.ll
-# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll
 # RUN: llc < %t.ll -march=nvptx -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll
+# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll
+# RUN: %if ptxas && !ptxas-12.0 %{ llc < %t.ll -march=nvptx -mcpu=sm_30 | %ptxas-verify %}
 # RUN: %if ptxas %{ llc < %t.ll -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
-# RUN: %if ptxas %{ llc < %t.ll -march=nvptx -mcpu=sm_30 | %ptxas-verify %}
 
 from __future__ import print_function
 

diff  --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index 55031e3cf96eb..dc20441a67a8b 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 declare <4 x float> @bar()
 
 ; CHECK-LABEL: .func foo(
 define void @foo(ptr %ptr) {
-; CHECK:     ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];
+; CHECK:     ld.param.u64 %[[PTR:rd[0-9]+]], [foo_param_0];
 ; CHECK:     ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
 ; CHECK:     st.v4.f32    [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
   %val = tail call <4 x float> @bar()

diff  --git a/llvm/test/CodeGen/NVPTX/ldu-i8.ll b/llvm/test/CodeGen/NVPTX/ldu-i8.ll
index 26ce65e6eb014..f25040fc0d188 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-i8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 6677169632e40..f01f2fc6e3fe5 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
 
 define void @reg_plus_offset(ptr %a) {
-; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
-; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
+; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%rd{{[0-9]+}}+32];
+; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%rd{{[0-9]+}}+36];
   %p2 = getelementptr i32, ptr %a, i32 8
   %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr %p2, i32 4)
   %p3 = getelementptr i32, ptr %a, i32 9

diff  --git a/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll b/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
index d7a9765c8e83d..e7b2140b92200 100644
--- a/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
+++ b/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 2>&1 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
 
 ; Allow to make libcalls that are defined in the current module
 

diff  --git a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
index eb2596bba1b65..a8493b63084b5 100644
--- a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
+++ b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index 14e6d1c4bb76e..d702ede61addf 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 ; Ensure we access the local stack properly

diff  --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index cc46657c30131..f041f202777f6 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
 ; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 %struct.ham = type { [4 x i32] }
 

diff  --git a/llvm/test/CodeGen/NVPTX/machine-sink.ll b/llvm/test/CodeGen/NVPTX/machine-sink.ll
index 9269fec0d150f..0c47b1756ed8b 100644
--- a/llvm/test/CodeGen/NVPTX/machine-sink.ll
+++ b/llvm/test/CodeGen/NVPTX/machine-sink.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/managed.ll b/llvm/test/CodeGen/NVPTX/managed.ll
index dfc6a3c975bcb..6616dbcd03227 100644
--- a/llvm/test/CodeGen/NVPTX/managed.ll
+++ b/llvm/test/CodeGen/NVPTX/managed.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %}
 
-; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s --check-prefix ERROR
+; RUN: not --crash llc < %s -march=nvptx64 -mcpu=sm_20 2>&1 | FileCheck %s --check-prefix ERROR
 ; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30
 
 ; CHECK: .visible .global .align 4 .u32 device_g;
@@ -11,7 +11,7 @@
 
 ; CHECK: .extern .global .align 4 .u32 decl_g;
 @decl_g = external addrspace(1) global i32, align 4
-; CHECK: .extern .global .attribute(.managed) .align 8 .b32 managed_decl_g;
+; CHECK: .extern .global .attribute(.managed) .align 8 .b64 managed_decl_g;
 @managed_decl_g = external addrspace(1) global ptr, align 8
 
 !nvvm.annotations = !{!0, !1}

diff  --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll
index c85131bd5d881..ac0324beed7a2 100644
--- a/llvm/test/CodeGen/NVPTX/mbarrier.ll
+++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64
-; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-11.0 && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.mbarrier.init(ptr %a, i32 %b)

diff  --git a/llvm/test/CodeGen/NVPTX/minmax-negative.ll b/llvm/test/CodeGen/NVPTX/minmax-negative.ll
index c83fe79816e97..d3dbb2eaa9e33 100644
--- a/llvm/test/CodeGen/NVPTX/minmax-negative.ll
+++ b/llvm/test/CodeGen/NVPTX/minmax-negative.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -O0 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -O0 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -O0 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -O0 | %ptxas-verify %}
 
 define i16 @test1(ptr %sur1) {
 ; CHECK-NOT: mov.u16 %rs{{[0-9]+}}, 32767

diff  --git a/llvm/test/CodeGen/NVPTX/module-inline-asm.ll b/llvm/test/CodeGen/NVPTX/module-inline-asm.ll
index 317542dba687d..b3a08d4347db9 100644
--- a/llvm/test/CodeGen/NVPTX/module-inline-asm.ll
+++ b/llvm/test/CodeGen/NVPTX/module-inline-asm.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll
index ff8670821f1e6..77c21564c8aa7 100644
--- a/llvm/test/CodeGen/NVPTX/mulwide.ll
+++ b/llvm/test/CodeGen/NVPTX/mulwide.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O3 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O3 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O0 | %ptxas-verify %}
 
 ; OPT-LABEL: @mulwide16
 ; NOOPT-LABEL: @mulwide16

diff  --git a/llvm/test/CodeGen/NVPTX/named-barriers.ll b/llvm/test/CodeGen/NVPTX/named-barriers.ll
index 5c3af1cd2f352..10f1340809839 100644
--- a/llvm/test/CodeGen/NVPTX/named-barriers.ll
+++ b/llvm/test/CodeGen/NVPTX/named-barriers.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Use bar.sync to arrive at a pre-computed barrier number and

diff  --git a/llvm/test/CodeGen/NVPTX/nofunc.ll b/llvm/test/CodeGen/NVPTX/nofunc.ll
index 33b26210165a9..608da797c80d1 100644
--- a/llvm/test/CodeGen/NVPTX/nofunc.ll
+++ b/llvm/test/CodeGen/NVPTX/nofunc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Test that we don't crash if we're compiling a module with function references,

diff  --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
index 12f9ba38f5e3b..69afdac93b3bd 100644
--- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 
 define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
 ; The parameter alignment is determined by the align attribute (default 1).
 ; CHECK-LABEL: .entry foo(
-; CHECK: .param .u32 .ptr .align 32 foo_param_2
-; CHECK: .param .u32 .ptr .align 1 foo_param_3
+; CHECK: .param .u64 .ptr .align 32 foo_param_2
+; CHECK: .param .u64 .ptr .align 1 foo_param_3
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll b/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
index 8d28b9cab1e75..0b8d247b0bca6 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
 ;
 ; NVPTXTargetLowering::getFunctionParamOptimizedAlign, which was introduces in
 ; D120129, contained a poorly designed assertion checking that a function with

diff  --git a/llvm/test/CodeGen/NVPTX/packed-aggr.ll b/llvm/test/CodeGen/NVPTX/packed-aggr.ll
index e741a92a2c983..6fefefd3a7206 100644
--- a/llvm/test/CodeGen/NVPTX/packed-aggr.ll
+++ b/llvm/test/CodeGen/NVPTX/packed-aggr.ll
@@ -5,7 +5,7 @@
 ; RUN:   FileCheck %s --check-prefixes=CHECK,CHECK32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | \
 ; RUN:   FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
+; RUN: %if ptxas-11.1 && !ptxas-12.0%{ llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
 ; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
 
 ;; Test that packed structs with symbol references are represented using the

diff  --git a/llvm/test/CodeGen/NVPTX/param-align.ll b/llvm/test/CodeGen/NVPTX/param-align.ll
index 5f5d77b22477b..5435ee238c88d 100644
--- a/llvm/test/CodeGen/NVPTX/param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/param-align.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=CHECK,NOALIGN4
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-force-min-byval-param-align | FileCheck %s --check-prefixes=CHECK,ALIGN4
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-force-min-byval-param-align | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=CHECK,NOALIGN4
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-force-min-byval-param-align | FileCheck %s --check-prefixes=CHECK,ALIGN4
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-force-min-byval-param-align | %ptxas-verify %}
 
 ;;; Need 4-byte alignment on ptr passed byval
 define ptx_device void @t1(ptr byval(float) %x) {

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index 48841952c983d..55fadf10f8d6d 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
 ;
 ; Check that parameters of a __device__ function with private or internal
 ; linkage called from a __global__ (kernel) function get increased alignment,
@@ -81,7 +81,7 @@
 define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct.St4x1) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x1(
   ; CHECK:               .param .align 4 .b8 caller_St4x1_param_0[4],
-  ; CHECK:               .param .b32 caller_St4x1_param_1
+  ; CHECK:               .param .b64 caller_St4x1_param_1
   ; CHECK:       )
   ; CHECK:       .param .b32 param0;
   ; CHECK:       st.param.b32 [param0+0], {{%r[0-9]+}};
@@ -113,7 +113,7 @@ define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) {
 define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x2(
   ; CHECK:               .param .align 4 .b8 caller_St4x2_param_0[8],
-  ; CHECK:               .param .b32 caller_St4x2_param_1
+  ; CHECK:               .param .b64 caller_St4x2_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[8];
   ; CHECK:       st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -154,7 +154,7 @@ define internal fastcc [2 x i32] @callee_St4x2(ptr nocapture noundef readonly by
 define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x3(
   ; CHECK:               .param .align 4 .b8 caller_St4x3_param_0[12],
-  ; CHECK:               .param .b32 caller_St4x3_param_1
+  ; CHECK:               .param .b64 caller_St4x3_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[12];
   ; CHECK:       st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -202,7 +202,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
 define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x4(
   ; CHECK:               .param .align 4 .b8 caller_St4x4_param_0[16],
-  ; CHECK:               .param .b32 caller_St4x4_param_1
+  ; CHECK:               .param .b64 caller_St4x4_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[16];
   ; CHECK:       st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -252,7 +252,7 @@ define internal fastcc [4 x i32] @callee_St4x4(ptr nocapture noundef readonly by
 define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x5(
   ; CHECK:               .param .align 4 .b8 caller_St4x5_param_0[20],
-  ; CHECK:               .param .b32 caller_St4x5_param_1
+  ; CHECK:               .param .b64 caller_St4x5_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[20];
   ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -312,7 +312,7 @@ define internal fastcc [5 x i32] @callee_St4x5(ptr nocapture noundef readonly by
 define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x6(
   ; CHECK:               .param .align 4 .b8 caller_St4x6_param_0[24],
-  ; CHECK:               .param .b32 caller_St4x6_param_1
+  ; CHECK:               .param .b64 caller_St4x6_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[24];
   ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -378,7 +378,7 @@ define internal fastcc [6 x i32] @callee_St4x6(ptr nocapture noundef readonly by
 define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x7(
   ; CHECK:               .param .align 4 .b8 caller_St4x7_param_0[28],
-  ; CHECK:               .param .b32 caller_St4x7_param_1
+  ; CHECK:               .param .b64 caller_St4x7_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[28];
   ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -454,7 +454,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
 define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St4x8(
   ; CHECK:               .param .align 4 .b8 caller_St4x8_param_0[32],
-  ; CHECK:               .param .b32 caller_St4x8_param_1
+  ; CHECK:               .param .b64 caller_St4x8_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[32];
   ; CHECK:       st.param.v4.b32 [param0+0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
@@ -532,7 +532,7 @@ define internal fastcc [8 x i32] @callee_St4x8(ptr nocapture noundef readonly by
 define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct.St8x1) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St8x1(
   ; CHECK:               .param .align 8 .b8 caller_St8x1_param_0[8],
-  ; CHECK:               .param .b32 caller_St8x1_param_1
+  ; CHECK:               .param .b64 caller_St8x1_param_1
   ; CHECK:       )
   ; CHECK:       .param .b64 param0;
   ; CHECK:       st.param.b64 [param0+0], {{%rd[0-9]+}};
@@ -564,7 +564,7 @@ define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) {
 define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St8x2(
   ; CHECK:               .param .align 8 .b8 caller_St8x2_param_0[16],
-  ; CHECK:               .param .b32 caller_St8x2_param_1
+  ; CHECK:               .param .b64 caller_St8x2_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[16];
   ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
@@ -602,7 +602,7 @@ define internal fastcc [2 x i64] @callee_St8x2(ptr nocapture noundef readonly by
 define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St8x3(
   ; CHECK:               .param .align 8 .b8 caller_St8x3_param_0[24],
-  ; CHECK:               .param .b32 caller_St8x3_param_1
+  ; CHECK:               .param .b64 caller_St8x3_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[24];
   ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
@@ -650,7 +650,7 @@ define internal fastcc [3 x i64] @callee_St8x3(ptr nocapture noundef readonly by
 define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func caller_St8x4(
   ; CHECK:               .param .align 8 .b8 caller_St8x4_param_0[32],
-  ; CHECK:               .param .b32 caller_St8x4_param_1
+  ; CHECK:               .param .b64 caller_St8x4_param_1
   ; CHECK:       )
   ; CHECK:       .param .align 16 .b8 param0[32];
   ; CHECK:       st.param.v2.b64 [param0+0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
index 37ffa4e077cec..9ca1eddee9d7b 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
 ;
 ; Check that parameters of a __global__ (kernel) function do not get increased
 ; alignment, and no additional vectorization is performed on loads/stores with
@@ -63,9 +63,9 @@
 define dso_local void @foo_St4x1(ptr nocapture noundef readonly byval(%struct.St4x1) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x1(
   ; CHECK:               .param .align 4 .b8 foo_St4x1_param_0[4],
-  ; CHECK:               .param .b32 foo_St4x1_param_1
+  ; CHECK:               .param .b64 foo_St4x1_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x1_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x1_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ret;
@@ -77,9 +77,9 @@ define dso_local void @foo_St4x1(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x2(
   ; CHECK:               .param .align 4 .b8 foo_St4x2_param_0[8],
-  ; CHECK:               .param .b32 foo_St4x2_param_1
+  ; CHECK:               .param .b64 foo_St4x2_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x2_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x2_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4];
@@ -97,9 +97,9 @@ define dso_local void @foo_St4x2(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x3(
   ; CHECK:               .param .align 4 .b8 foo_St4x3_param_0[12],
-  ; CHECK:               .param .b32 foo_St4x3_param_1
+  ; CHECK:               .param .b64 foo_St4x3_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x3_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x3_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4];
@@ -123,9 +123,9 @@ define dso_local void @foo_St4x3(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x4(
   ; CHECK:               .param .align 4 .b8 foo_St4x4_param_0[16],
-  ; CHECK:               .param .b32 foo_St4x4_param_1
+  ; CHECK:               .param .b64 foo_St4x4_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x4_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x4_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4];
@@ -155,9 +155,9 @@ define dso_local void @foo_St4x4(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x5(
   ; CHECK:               .param .align 4 .b8 foo_St4x5_param_0[20],
-  ; CHECK:               .param .b32 foo_St4x5_param_1
+  ; CHECK:               .param .b64 foo_St4x5_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x5_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x5_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4];
@@ -193,9 +193,9 @@ define dso_local void @foo_St4x5(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x6(
   ; CHECK:               .param .align 4 .b8 foo_St4x6_param_0[24],
-  ; CHECK:               .param .b32 foo_St4x6_param_1
+  ; CHECK:               .param .b64 foo_St4x6_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x6_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x6_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4];
@@ -237,9 +237,9 @@ define dso_local void @foo_St4x6(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x7(
   ; CHECK:               .param .align 4 .b8 foo_St4x7_param_0[28],
-  ; CHECK:               .param .b32 foo_St4x7_param_1
+  ; CHECK:               .param .b64 foo_St4x7_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x7_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x7_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4];
@@ -287,9 +287,9 @@ define dso_local void @foo_St4x7(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St4x8(
   ; CHECK:               .param .align 4 .b8 foo_St4x8_param_0[32],
-  ; CHECK:               .param .b32 foo_St4x8_param_1
+  ; CHECK:               .param .b64 foo_St4x8_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x8_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x8_param_1];
   ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0];
   ; CHECK:       st.u32  [[[R1]]], [[R2]];
   ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4];
@@ -343,9 +343,9 @@ define dso_local void @foo_St4x8(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St8x1(ptr nocapture noundef readonly byval(%struct.St8x1) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St8x1(
   ; CHECK:               .param .align 8 .b8 foo_St8x1_param_0[8],
-  ; CHECK:               .param .b32 foo_St8x1_param_1
+  ; CHECK:               .param .b64 foo_St8x1_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x1_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x1_param_1];
   ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0];
   ; CHECK:       st.u64 [[[R1]]], [[RD1]];
   ; CHECK:       ret;
@@ -357,9 +357,9 @@ define dso_local void @foo_St8x1(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St8x2(
   ; CHECK:               .param .align 8 .b8 foo_St8x2_param_0[16],
-  ; CHECK:               .param .b32 foo_St8x2_param_1
+  ; CHECK:               .param .b64 foo_St8x2_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x2_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x2_param_1];
   ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0];
   ; CHECK:       st.u64 [[[R1]]], [[RD1]];
   ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8];
@@ -377,9 +377,9 @@ define dso_local void @foo_St8x2(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St8x3(
   ; CHECK:               .param .align 8 .b8 foo_St8x3_param_0[24],
-  ; CHECK:               .param .b32 foo_St8x3_param_1
+  ; CHECK:               .param .b64 foo_St8x3_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x3_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x3_param_1];
   ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0];
   ; CHECK:       st.u64 [[[R1]]], [[RD1]];
   ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8];
@@ -403,9 +403,9 @@ define dso_local void @foo_St8x3(ptr nocapture noundef readonly byval(%struct.St
 define dso_local void @foo_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
   ; CHECK-LABEL: .visible .func foo_St8x4(
   ; CHECK:               .param .align 8 .b8 foo_St8x4_param_0[32],
-  ; CHECK:               .param .b32 foo_St8x4_param_1
+  ; CHECK:               .param .b64 foo_St8x4_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x4_param_1];
+  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x4_param_1];
   ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0];
   ; CHECK:       st.u64 [[[R1]]], [[RD1]];
   ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8];

diff  --git a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
index cef5286949ca5..bf44c4687bb2a 100644
--- a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
+++ b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
-; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s | %ptxas-verify %}
+; RUN: llc -march=nvptx64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -march=nvptx64 -verify-machineinstrs < %s | %ptxas-verify %}
 
 ; Tests the following pattern:
 ; (X & 8) != 0 --> (X & 8) >> 3

diff  --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
index 374ab67dc46ca..8aa7f2c3eb55f 100644
--- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define ptx_kernel void @t1(ptr %a) {

diff  --git a/llvm/test/CodeGen/NVPTX/pr16278.ll b/llvm/test/CodeGen/NVPTX/pr16278.ll
index da1d81a6ad487..1696b3cf366d6 100644
--- a/llvm/test/CodeGen/NVPTX/pr16278.ll
+++ b/llvm/test/CodeGen/NVPTX/pr16278.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 @one_f = addrspace(4) global float 1.000000e+00, align 4
 

diff  --git a/llvm/test/CodeGen/NVPTX/pr17529.ll b/llvm/test/CodeGen/NVPTX/pr17529.ll
index 6919c00c7c52b..525572e1365c6 100644
--- a/llvm/test/CodeGen/NVPTX/pr17529.ll
+++ b/llvm/test/CodeGen/NVPTX/pr17529.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
index 1a57207ce4ea0..a32203a0c3b2b 100644
--- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
+++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 ; Check load from constant global variables.  These loads should be
 ; ld.global.nc (aka ldg).

diff  --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll
index 5010e5c37d3c0..053c7a972b347 100644
--- a/llvm/test/CodeGen/NVPTX/refl1.ll
+++ b/llvm/test/CodeGen/NVPTX/refl1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-nvidia-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll
index 54ed66b0d9508..d738a55952955 100644
--- a/llvm/test/CodeGen/NVPTX/reg-types.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-types.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT
-; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func func()

diff  --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 724af4c69c11e..9d058662c2717 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
-; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index 84982643099ab..64659ce1b5c56 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
 
 declare i64 @llvm.nvvm.rotate.b64(i64, i32)
 declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/sched1.ll b/llvm/test/CodeGen/NVPTX/sched1.ll
index a25888ac729da..7dc5c5a0e17d2 100644
--- a/llvm/test/CodeGen/NVPTX/sched1.ll
+++ b/llvm/test/CodeGen/NVPTX/sched1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Ensure source scheduling is working
 

diff  --git a/llvm/test/CodeGen/NVPTX/sched2.ll b/llvm/test/CodeGen/NVPTX/sched2.ll
index 11e577755e296..722d70dbe06e0 100644
--- a/llvm/test/CodeGen/NVPTX/sched2.ll
+++ b/llvm/test/CodeGen/NVPTX/sched2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define void @foo(ptr %a) {
 ; CHECK: .func foo

diff  --git a/llvm/test/CodeGen/NVPTX/sext-in-reg.ll b/llvm/test/CodeGen/NVPTX/sext-in-reg.ll
index 49fc9cc56befa..59cefd0ca5eda 100644
--- a/llvm/test/CodeGen/NVPTX/sext-in-reg.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-in-reg.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/sext-params.ll b/llvm/test/CodeGen/NVPTX/sext-params.ll
index 7b0fe7c6e381e..41287bada9cd1 100644
--- a/llvm/test/CodeGen/NVPTX/sext-params.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-params.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/shift-parts.ll b/llvm/test/CodeGen/NVPTX/shift-parts.ll
index 0f4018db332f5..15c9c8b0738c6 100644
--- a/llvm/test/CodeGen/NVPTX/shift-parts.ll
+++ b/llvm/test/CodeGen/NVPTX/shift-parts.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: shift_parts_left_128
 define void @shift_parts_left_128(ptr %val, ptr %amtptr) {

diff  --git a/llvm/test/CodeGen/NVPTX/short-ptr.ll b/llvm/test/CodeGen/NVPTX/short-ptr.ll
index 22958391c6336..c6f9106af8b20 100644
--- a/llvm/test/CodeGen/NVPTX/short-ptr.ll
+++ b/llvm/test/CodeGen/NVPTX/short-ptr.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix CHECK-DEFAULT-32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-short-ptr | FileCheck %s --check-prefixes CHECK-SHORT-SHARED,CHECK-SHORT-CONST,CHECK-SHORT-LOCAL
 
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-short-ptr | %ptxas-verify %}
 

diff  --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll
index 682f0630a561c..d0b01336e940f 100644
--- a/llvm/test/CodeGen/NVPTX/simple-call.ll
+++ b/llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -1,26 +1,26 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-; CHECK: .func ({{.*}}) device_func
-define float @device_func(float %a) noinline {
-  %ret = fmul float %a, %a
-  ret float %ret
-}
-
-; CHECK: .entry kernel_func
-define void @kernel_func(ptr %a) {
-  %val = load float, ptr %a
-; CHECK: call.uni (retval0),
-; CHECK: device_func,
-  %mul = call float @device_func(float %val)
-  store float %mul, ptr %a
-  ret void
-}
-
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+
+; CHECK: .func ({{.*}}) device_func
+define float @device_func(float %a) noinline {
+  %ret = fmul float %a, %a
+  ret float %ret
+}
+
+; CHECK: .entry kernel_func
+define void @kernel_func(ptr %a) {
+  %val = load float, ptr %a
+; CHECK: call.uni (retval0),
+; CHECK: device_func,
+  %mul = call float @device_func(float %val)
+  store float %mul, ptr %a
+  ret void
+}
+
+
+
+!nvvm.annotations = !{!1}
+
+!1 = !{ptr @kernel_func, !"kernel", i32 1}

diff  --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
index 5069108c9c7d6..d63c0432aa92c 100644
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
 ; RUN:   | FileCheck %s
 ; RUN: %if ptxas %{                                                                   \
-; RUN:   llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
+; RUN:   llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
 ; RUN:   | %ptxas-verify                                                              \
 ; RUN: %}
 

diff  --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
index 9840103e0be2c..525a837801073 100644
--- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
 

diff  --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll
index be423f6c2b98f..0d1261f625f1a 100644
--- a/llvm/test/CodeGen/NVPTX/st-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/st-generic.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ;; i8

diff  --git a/llvm/test/CodeGen/NVPTX/store-retval.ll b/llvm/test/CodeGen/NVPTX/store-retval.ll
index 6e652d9b71911..6a60c97b854b7 100644
--- a/llvm/test/CodeGen/NVPTX/store-retval.ll
+++ b/llvm/test/CodeGen/NVPTX/store-retval.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s --mtriple=nvptx-unknown-unknown | FileCheck %s
-; RUN: %if ptxas %{ llc < %s --mtriple=nvptx-unknown-unknown | %ptxas-verify %}
+; RUN: llc < %s --mtriple=nvptx64-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
 ;
 ; This is IR generated with clang using -O3 optimization level
 ; and nvptx-unknown-unknown target from the following C code.

diff  --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index 86938b01c2295..a4ab789246916 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
-; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-cuda"
 
@@ -20,8 +20,8 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 ; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
 ; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
-; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
+; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
   store float %ret, ptr %red
   ret void
 }
@@ -39,8 +39,8 @@ define void @bar(ptr %red, i32 %idx) {
 ; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
 ; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
-; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
+; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[REDF]]
   store float %ret, ptr %red
   ret void
 }

diff  --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index dcf8b4ec739ed..1976fcfc00470 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 
@@ -11,7 +11,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
 ; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; CHECK: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; CHECK: st.f32 [%rd{{[0-9]+}}], %f[[REDF]]
   store float %ret, ptr %red
   ret void
 }

diff  --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 748965b8d1dfc..9d840ce24e7af 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
-; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/surf-write.ll b/llvm/test/CodeGen/NVPTX/surf-write.ll
index 0e2503aafde3c..3948b7194ffc0 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 

diff  --git a/llvm/test/CodeGen/NVPTX/symbol-naming.ll b/llvm/test/CodeGen/NVPTX/symbol-naming.ll
index 143a7c6de32e3..a719dd57c4b9c 100644
--- a/llvm/test/CodeGen/NVPTX/symbol-naming.ll
+++ b/llvm/test/CodeGen/NVPTX/symbol-naming.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
 
 ; Verify that the NVPTX target removes invalid symbol names prior to emitting

diff  --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index ae3c40c1fc948..107671d1d1f39 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
-; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
 
 
 target triple = "nvptx-unknown-cuda"
@@ -18,8 +18,8 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
-; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[RED]]
+; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[RED]]
   store float %ret, ptr %red
   ret void
 }
@@ -36,8 +36,8 @@ define void @bar(ptr %red, i32 %idx) {
 ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
-; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[RED]]
+; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[RED]]
   store float %ret, ptr %red
   ret void
 }
@@ -63,8 +63,8 @@ define void @baz(ptr %red, i32 %idx) {
 ; SM20: add.rn.f32 %f[[RET2:[0-9]+]], %f[[RED]], %f[[TEXCALL]]
 ; SM30: add.rn.f32 %f[[RET2:[0-9]+]], %f[[RED]], %f[[TEXCALL]]
   %ret2 = fadd float %ret, %texcall
-; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RET2]]
-; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RET2]]
+; SM20: st.global.f32 [%rd{{[0-9]+}}], %f[[RET2]]
+; SM30: st.global.f32 [%rd{{[0-9]+}}], %f[[RET2]]
   store float %ret2, ptr %red
   ret void
 }

diff  --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index d2edb30a28a70..8f6e0416beded 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 
@@ -10,7 +10,7 @@ define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; CHECK: st.f32 [%rd{{[0-9]+}}], %f[[RED]]
   store float %ret, ptr %red
   ret void
 }

diff  --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
index 37ec507836f9f..7c3043855c1cc 100644
--- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
-; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/tuple-literal.ll b/llvm/test/CodeGen/NVPTX/tuple-literal.ll
deleted file mode 100644
index 157780a8ccfdb..0000000000000
--- a/llvm/test/CodeGen/NVPTX/tuple-literal.ll
+++ /dev/null
@@ -1,5 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
-
-define ptx_device void @test_function(ptr) {
-  ret void
-}

diff  --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index ab79f45906067..b8c213de04f8d 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,CHECK32
 ; RUN: llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %}
 
 ; CHECK: .address_size [[BITS:32|64]]

diff  --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index c8b64426f05f1..f8b16c1feab5e 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll
index fd383ce709bb7..092607462f332 100644
--- a/llvm/test/CodeGen/NVPTX/vec8.ll
+++ b/llvm/test/CodeGen/NVPTX/vec8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-unknown-cuda"
 
@@ -7,7 +7,7 @@ target triple = "nvptx-unknown-cuda"
 define void @foo(<8 x i8> %a, ptr %b) {
 ; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
 ; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
-; CHECK-DAG: ld.param.u32   %[[B:r[0-9+]]], [foo_param_1]
+; CHECK-DAG: ld.param.u64   %[[B:rd[0-9+]]], [foo_param_1]
 ; CHECK:     add.s16        [[T:%rs[0-9+]]], [[E1]], [[E6]];
 ; CHECK:     st.u8          [%[[B]]], [[T]];
   %t0 = extractelement <8 x i8> %a, i32 1

diff  --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll
index c8eb714c7e3ce..162061ff34ba1 100644
--- a/llvm/test/CodeGen/NVPTX/vector-args.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-args.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define float @foo(<2 x float> %a) {
 ; CHECK: .func (.param .b32 func_retval0) foo

diff  --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll
index 5d1192724362e..15e4697333cb4 100644
--- a/llvm/test/CodeGen/NVPTX/vector-call.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-call.ll
@@ -1,31 +1,31 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-target triple = "nvptx-unknown-cuda"
-
-declare void @bar(<4 x i32>)
-
-; CHECK-LABEL: .func foo(
-; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b32  [param0+0],  {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK:     call.uni
-; CHECK:     ret;
-define void @foo(<4 x i32> %a) {
-  tail call void @bar(<4 x i32> %a)
-  ret void
-}
-
-; CHECK-LABEL: .func foo3(
-; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
-; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v2.b32  [param0+0],  {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b32     [param0+8],  [[E2]];
-; CHECK:     call.uni
-; CHECK:     ret;
-declare void @bar3(<3 x i32>)
-define void @foo3(<3 x i32> %a) {
-  tail call void @bar3(<3 x i32> %a)
-  ret void
-}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @bar(<4 x i32>)
+
+; CHECK-LABEL: .func foo(
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32  [param0+0],  {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:     call.uni
+; CHECK:     ret;
+define void @foo(<4 x i32> %a) {
+  tail call void @bar(<4 x i32> %a)
+  ret void
+}
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32  [param0+0],  {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32     [param0+8],  [[E2]];
+; CHECK:     call.uni
+; CHECK:     ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+  tail call void @bar3(<3 x i32> %a)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll
index 425efd00f2aa2..d4235ff86c6af 100644
--- a/llvm/test/CodeGen/NVPTX/vector-compare.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20  | %ptxas-verify -m32 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20  | %ptxas-verify -m32 %}
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; This test makes sure that the result of vector compares are properly

diff  --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 64662dd020c57..b700fb5e8f34c 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Even though general vector types are not supported in PTX, we can still
 ; optimize loads/stores with pseudo-vector instructions of the form:

diff  --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll
index aab3da0b23e8a..12b43549f0a91 100644
--- a/llvm/test/CodeGen/NVPTX/vector-select.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-select.ll
@@ -1,16 +1,27 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; This test makes sure that vector selects are scalarized by the type legalizer.
 ; If not, type legalization will fail.
 
+; CHECK-LABEL: .visible .func foo(
 define void @foo(ptr addrspace(1) %def_a, ptr addrspace(1) %def_b, ptr addrspace(1) %def_c) {
 entry:
+; CHECK:  ld.global.v2.u32
+; CHECK:  ld.global.v2.u32
+; CHECK:  ld.global.v2.u32
   %tmp4 = load <2 x i32>, ptr addrspace(1) %def_a
   %tmp6 = load <2 x i32>, ptr addrspace(1) %def_c
   %tmp8 = load <2 x i32>, ptr addrspace(1) %def_b
+; CHECK:  setp.gt.s32
+; CHECK:  setp.gt.s32
   %0 = icmp sge <2 x i32> %tmp4, zeroinitializer
+; CHECK:  selp.b32
+; CHECK:  selp.b32
   %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
+; CHECK:  st.global.v2.u32
   store <2 x i32> %cond, ptr addrspace(1) %def_c
   ret void
 }

diff  --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index e6d545323dbe9..0254a58d2aebd 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .func foo1
 ; CHECK: st.v2.f32

diff  --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll
index 3386731827cd8..dd0160d1c0a65 100644
--- a/llvm/test/CodeGen/NVPTX/weak-global.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-global.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .weak .global .align 4 .u32 g
 @g = common addrspace(1) global i32 zeroinitializer

diff  --git a/llvm/test/CodeGen/NVPTX/weak-linkage.ll b/llvm/test/CodeGen/NVPTX/weak-linkage.ll
index 8c97e4b6a0483..75cac2dacc8ac 100644
--- a/llvm/test/CodeGen/NVPTX/weak-linkage.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-linkage.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: // .weak foo
 ; CHECK: .weak .func foo

diff  --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index fc682bae5093c..a6574c405107e 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -305,6 +305,7 @@ def enable_ptxas(ptxas_executable):
             (11, 7),
             (11, 8),
             (12, 0),
+            (12, 1),
         ]
 
         def version_int(ver):


        


More information about the llvm-commits mailing list