[llvm] 0f1b5f1 - [NVPTX] Integrate ptxas to LIT tests

Thu Apr 28 05:02:47 PDT 2022

Author: Andrew Savonichev
Date: 2022-04-28T14:59:45+03:00
New Revision: 0f1b5f115a7f6fd90989996ae514810773157b76

URL: https://github.com/llvm/llvm-project/commit/0f1b5f115a7f6fd90989996ae514810773157b76
DIFF: https://github.com/llvm/llvm-project/commit/0f1b5f115a7f6fd90989996ae514810773157b76.diff

LOG: [NVPTX] Integrate ptxas to LIT tests

ptxas is a proprietary compiler from Nvidia that can compile PTX to
machine code (SASS). It has a lot of diagnostics to catch errors
in PTX, which can be used to verify PTX output from llc.

Set -DPXTAS_EXECUTABLE=/path/to/ptxas CMake option to enable it.
If this option is not set, then ptxas is substituted to true which
effectively disables all ptxas RUN lines.

LLVM_PTXAS_EXECUTABLE environment variable takes precedence over
the CMake option, and allows to override ptxas executable that is used for LIT
without complete re-configuration.

Differential Revision: https://reviews.llvm.org/D121727

Added: 
    

Modified: 
    llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
    llvm/test/CodeGen/NVPTX/MachineSink-call.ll
    llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
    llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll
    llvm/test/CodeGen/NVPTX/access-non-generic.ll
    llvm/test/CodeGen/NVPTX/add-128bit.ll
    llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
    llvm/test/CodeGen/NVPTX/addrspacecast.ll
    llvm/test/CodeGen/NVPTX/aggr-param.ll
    llvm/test/CodeGen/NVPTX/aggregate-return.ll
    llvm/test/CodeGen/NVPTX/annotations.ll
    llvm/test/CodeGen/NVPTX/arg-lowering.ll
    llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
    llvm/test/CodeGen/NVPTX/arithmetic-int.ll
    llvm/test/CodeGen/NVPTX/async-copy.ll
    llvm/test/CodeGen/NVPTX/atomics-sm60.ll
    llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
    llvm/test/CodeGen/NVPTX/atomics.ll
    llvm/test/CodeGen/NVPTX/b52037.ll
    llvm/test/CodeGen/NVPTX/barrier.ll
    llvm/test/CodeGen/NVPTX/bfe.ll
    llvm/test/CodeGen/NVPTX/branch-fold.ll
    llvm/test/CodeGen/NVPTX/bug17709.ll
    llvm/test/CodeGen/NVPTX/bug21465.ll
    llvm/test/CodeGen/NVPTX/bug22246.ll
    llvm/test/CodeGen/NVPTX/bug22322.ll
    llvm/test/CodeGen/NVPTX/bug26185-2.ll
    llvm/test/CodeGen/NVPTX/bug26185.ll
    llvm/test/CodeGen/NVPTX/bug41651.ll
    llvm/test/CodeGen/NVPTX/bypass-div.ll
    llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
    llvm/test/CodeGen/NVPTX/callchain.ll
    llvm/test/CodeGen/NVPTX/calling-conv.ll
    llvm/test/CodeGen/NVPTX/calls-with-phi.ll
    llvm/test/CodeGen/NVPTX/combine-min-max.ll
    llvm/test/CodeGen/NVPTX/compare-int.ll
    llvm/test/CodeGen/NVPTX/constant-vectors.ll
    llvm/test/CodeGen/NVPTX/convert-fp.ll
    llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
    llvm/test/CodeGen/NVPTX/convert-sm80.ll
    llvm/test/CodeGen/NVPTX/ctlz.ll
    llvm/test/CodeGen/NVPTX/ctpop.ll
    llvm/test/CodeGen/NVPTX/cttz.ll
    llvm/test/CodeGen/NVPTX/disable-opt.ll
    llvm/test/CodeGen/NVPTX/div-ri.ll
    llvm/test/CodeGen/NVPTX/divrem-combine.ll
    llvm/test/CodeGen/NVPTX/envreg.ll
    llvm/test/CodeGen/NVPTX/extloadv.ll
    llvm/test/CodeGen/NVPTX/f16-ex2.ll
    llvm/test/CodeGen/NVPTX/f16-instructions.ll
    llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
    llvm/test/CodeGen/NVPTX/fast-math.ll
    llvm/test/CodeGen/NVPTX/fma-assoc.ll
    llvm/test/CodeGen/NVPTX/fma-disable.ll
    llvm/test/CodeGen/NVPTX/fma.ll
    llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
    llvm/test/CodeGen/NVPTX/fns.ll
    llvm/test/CodeGen/NVPTX/fp-contract.ll
    llvm/test/CodeGen/NVPTX/fp-literals.ll
    llvm/test/CodeGen/NVPTX/fp16.ll
    llvm/test/CodeGen/NVPTX/function-align.ll
    llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
    llvm/test/CodeGen/NVPTX/global-addrspace.ll
    llvm/test/CodeGen/NVPTX/global-ordering.ll
    llvm/test/CodeGen/NVPTX/global-variable-big.ll
    llvm/test/CodeGen/NVPTX/global-visibility.ll
    llvm/test/CodeGen/NVPTX/globals_init.ll
    llvm/test/CodeGen/NVPTX/globals_lowering.ll
    llvm/test/CodeGen/NVPTX/half.ll
    llvm/test/CodeGen/NVPTX/i1-global.ll
    llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
    llvm/test/CodeGen/NVPTX/i1-param.ll
    llvm/test/CodeGen/NVPTX/i128-global.ll
    llvm/test/CodeGen/NVPTX/i128-param.ll
    llvm/test/CodeGen/NVPTX/i128-retval.ll
    llvm/test/CodeGen/NVPTX/i128-struct.ll
    llvm/test/CodeGen/NVPTX/i8-param.ll
    llvm/test/CodeGen/NVPTX/idioms.ll
    llvm/test/CodeGen/NVPTX/imad.ll
    llvm/test/CodeGen/NVPTX/inline-asm.ll
    llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
    llvm/test/CodeGen/NVPTX/intrinsic-old.ll
    llvm/test/CodeGen/NVPTX/intrinsics.ll
    llvm/test/CodeGen/NVPTX/isspacep.ll
    llvm/test/CodeGen/NVPTX/ld-addrspace.ll
    llvm/test/CodeGen/NVPTX/ld-generic.ll
    llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
    llvm/test/CodeGen/NVPTX/ldg-invariant.ll
    llvm/test/CodeGen/NVPTX/ldparam-v4.ll
    llvm/test/CodeGen/NVPTX/ldu-i8.ll
    llvm/test/CodeGen/NVPTX/ldu-ldg.ll
    llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
    llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
    llvm/test/CodeGen/NVPTX/load-sext-i1.ll
    llvm/test/CodeGen/NVPTX/load-store.ll
    llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
    llvm/test/CodeGen/NVPTX/local-stack-frame.ll
    llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
    llvm/test/CodeGen/NVPTX/lower-alloca.ll
    llvm/test/CodeGen/NVPTX/lower-args.ll
    llvm/test/CodeGen/NVPTX/lower-byval-args.ll
    llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
    llvm/test/CodeGen/NVPTX/machine-sink.ll
    llvm/test/CodeGen/NVPTX/managed.ll
    llvm/test/CodeGen/NVPTX/match.ll
    llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll
    llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
    llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
    llvm/test/CodeGen/NVPTX/math-intrins.ll
    llvm/test/CodeGen/NVPTX/mbarrier.ll
    llvm/test/CodeGen/NVPTX/minmax-negative.ll
    llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
    llvm/test/CodeGen/NVPTX/module-inline-asm.ll
    llvm/test/CodeGen/NVPTX/mulwide.ll
    llvm/test/CodeGen/NVPTX/named-barriers.ll
    llvm/test/CodeGen/NVPTX/no-extra-parens.ll
    llvm/test/CodeGen/NVPTX/nofunc.ll
    llvm/test/CodeGen/NVPTX/nounroll.ll
    llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
    llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
    llvm/test/CodeGen/NVPTX/param-align.ll
    llvm/test/CodeGen/NVPTX/param-load-store.ll
    llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
    llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
    llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
    llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
    llvm/test/CodeGen/NVPTX/pr16278.ll
    llvm/test/CodeGen/NVPTX/pr17529.ll
    llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
    llvm/test/CodeGen/NVPTX/redux-sync.ll
    llvm/test/CodeGen/NVPTX/refl1.ll
    llvm/test/CodeGen/NVPTX/reg-copy.ll
    llvm/test/CodeGen/NVPTX/reg-types.ll
    llvm/test/CodeGen/NVPTX/rotate.ll
    llvm/test/CodeGen/NVPTX/rotate_64.ll
    llvm/test/CodeGen/NVPTX/sched1.ll
    llvm/test/CodeGen/NVPTX/sched2.ll
    llvm/test/CodeGen/NVPTX/sext-in-reg.ll
    llvm/test/CodeGen/NVPTX/sext-params.ll
    llvm/test/CodeGen/NVPTX/shfl-p.ll
    llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
    llvm/test/CodeGen/NVPTX/shfl-sync.ll
    llvm/test/CodeGen/NVPTX/shfl.ll
    llvm/test/CodeGen/NVPTX/shift-parts.ll
    llvm/test/CodeGen/NVPTX/simple-call.ll
    llvm/test/CodeGen/NVPTX/sqrt-approx.ll
    llvm/test/CodeGen/NVPTX/st-addrspace.ll
    llvm/test/CodeGen/NVPTX/st-generic.ll
    llvm/test/CodeGen/NVPTX/store-retval.ll
    llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
    llvm/test/CodeGen/NVPTX/surf-read.ll
    llvm/test/CodeGen/NVPTX/surf-tex.py
    llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
    llvm/test/CodeGen/NVPTX/surf-write.ll
    llvm/test/CodeGen/NVPTX/symbol-naming.ll
    llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
    llvm/test/CodeGen/NVPTX/tex-read.ll
    llvm/test/CodeGen/NVPTX/texsurf-queries.ll
    llvm/test/CodeGen/NVPTX/tid-range.ll
    llvm/test/CodeGen/NVPTX/tuple-literal.ll
    llvm/test/CodeGen/NVPTX/vec-param-load.ll
    llvm/test/CodeGen/NVPTX/vec8.ll
    llvm/test/CodeGen/NVPTX/vector-args.ll
    llvm/test/CodeGen/NVPTX/vector-call.ll
    llvm/test/CodeGen/NVPTX/vector-compare.ll
    llvm/test/CodeGen/NVPTX/vector-global.ll
    llvm/test/CodeGen/NVPTX/vector-loads.ll
    llvm/test/CodeGen/NVPTX/vector-select.ll
    llvm/test/CodeGen/NVPTX/vector-stores.ll
    llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
    llvm/test/CodeGen/NVPTX/vote.ll
    llvm/test/CodeGen/NVPTX/weak-global.ll
    llvm/test/CodeGen/NVPTX/weak-linkage.ll
    llvm/test/CodeGen/NVPTX/wmma.py
    llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
    llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll
    llvm/test/DebugInfo/NVPTX/cu-range-hole.ll
    llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
    llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
    llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
    llvm/test/DebugInfo/NVPTX/debug-empty.ll
    llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll
    llvm/test/DebugInfo/NVPTX/debug-file-loc.ll
    llvm/test/DebugInfo/NVPTX/debug-info.ll
    llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll
    llvm/test/DebugInfo/NVPTX/debug-name-table.ll
    llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll
    llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
    llvm/test/lit.cfg.py
    llvm/test/lit.site.cfg.py.in

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index e84030f385c41..94f7991448af9 100644

--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -1,5 +1,8 @@
 ; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s
 ; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+; RUN: %if ptxas %{ llc -disable-nvptx-load-store-vectorizer < %s | %ptxas-verify %}
+
 target triple = "nvptx64-nvidia-cuda"
 
 ; Check that the load-store vectorizer is enabled by default for nvptx, and

diff  --git a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll
index 3a6d43b76aeb6..72c370222ad3c 100644
--- a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll
+++ b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
 target triple = "nvptx64-nvidia-cuda"
 
 declare void @foo()

diff  --git a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
index 91c80182e2f87..d3814340c2812 100644
--- a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
+++ b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
 target triple = "nvptx64-nvidia-cuda"
 
 declare void @llvm.nvvm.barrier0()

diff  --git a/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll b/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll
index fc6867eca417a..c99702e1a60f5 100644
--- a/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll
+++ b/llvm/test/CodeGen/NVPTX/TailDuplication-convergent.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s | %ptxas-verify %}
 target triple = "nvptx64-nvidia-cuda"
 
 declare void @foo()

diff  --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index 62520be2cf5a0..6ce56f3c9cb27 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
 ; RUN: opt -mtriple=nvptx-- < %s -S -infer-address-spaces | FileCheck %s --check-prefix IR
 ; RUN: opt -mtriple=nvptx64-- < %s -S -infer-address-spaces | FileCheck %s --check-prefix IR
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
 @scalar = internal addrspace(3) global float 0.000000e+00, align 4

diff  --git a/llvm/test/CodeGen/NVPTX/add-128bit.ll b/llvm/test/CodeGen/NVPTX/add-128bit.ll
index a077c3fcf8915..12283fc52002c 100644
--- a/llvm/test/CodeGen/NVPTX/add-128bit.ll
+++ b/llvm/test/CodeGen/NVPTX/add-128bit.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
index 1e2fde4b858a2..d0321d7228835 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast-gvar.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .global .align 4 .u32 g = 42;
 ; CHECK: .visible .global .align 4 .u32 g2 = generic(g);

diff  --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index dbcd2beb46faf..77ae69cfc14b0 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -1,6 +1,9 @@
 ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32,G32
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
 
 ; ALL-LABEL: conv1
 define i32 @conv1(i32 addrspace(1)* %ptr) {

diff  --git a/llvm/test/CodeGen/NVPTX/aggr-param.ll b/llvm/test/CodeGen/NVPTX/aggr-param.ll
index 21deb7ebce872..88afbd62f6314 100644
--- a/llvm/test/CodeGen/NVPTX/aggr-param.ll
+++ b/llvm/test/CodeGen/NVPTX/aggr-param.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; Make sure aggregate param types get emitted properly.
 

diff  --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index 785b4d6d90dc5..b8d169b9c9f35 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 declare <2 x float> @barv(<2 x float> %input)
 declare <3 x float> @barv3(<3 x float> %input)

diff  --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll
index 1da8526ca47cd..83ef559d6a4ae 100644
--- a/llvm/test/CodeGen/NVPTX/annotations.ll
+++ b/llvm/test/CodeGen/NVPTX/annotations.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 @texture = internal addrspace(1) global i64 0, align 8
 ; CHECK: .global .texref texture

diff  --git a/llvm/test/CodeGen/NVPTX/arg-lowering.ll b/llvm/test/CodeGen/NVPTX/arg-lowering.ll
index f7b8a1491d37a..be39b2cfb0417 100644
--- a/llvm/test/CodeGen/NVPTX/arg-lowering.ll
+++ b/llvm/test/CodeGen/NVPTX/arg-lowering.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .func  (.param .align 16 .b8 func_retval0[16]) foo0(
 ; CHECK:          .param .align 4 .b8 foo0_param_0[8]

diff  --git a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
index c167db4b46dcd..ba6754a333987 100644
--- a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
 
 ;; These tests should run for all targets
 

diff  --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
index e7c968c4c0bfe..9bbe020e89c41 100644
--- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ;; These tests should run for all targets
 

diff  --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll
index 182779c2e0500..fe021ca5bebca 100644
--- a/llvm/test/CodeGen/NVPTX/async-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s 
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.cp.async.wait.group(i32)
 

diff  --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
index 3441d264d6b90..405a547bc3609 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
 
 ; CHECK-LABEL: .func test(
 define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {

diff  --git a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
index 88cade6fa682a..c40f4e54f9bf1 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
 
 ; CHECK-LABEL: .func test_atomics_scope(
 define void @test_atomics_scope(float* %fp, float %f,

diff  --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 982c29faaf509..d0e1b836a4b61 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_32 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_32 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_32%} %}
 
 
 ; CHECK-LABEL: atom0

diff  --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll
index 48b6a2467b2e3..9066dc2f07dd1 100644
--- a/llvm/test/CodeGen/NVPTX/b52037.ll
+++ b/llvm/test/CodeGen/NVPTX/b52037.ll
@@ -4,6 +4,7 @@
 ; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details.
 ;
 ; RUN: llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | FileCheck %s
+; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | %ptxas-verify -arch=sm_70 %}
 
 ; CHECK-LABEL: .visible .entry barney(
 ; CHECK-NOT:  .local{{.*}}__local_depot

diff  --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll
index b84575bb303bf..145bbc98ed3a8 100644
--- a/llvm/test/CodeGen/NVPTX/barrier.ll
+++ b/llvm/test/CodeGen/NVPTX/barrier.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare void @llvm.nvvm.bar.warp.sync(i32)
 declare void @llvm.nvvm.barrier.sync(i32)

diff  --git a/llvm/test/CodeGen/NVPTX/bfe.ll b/llvm/test/CodeGen/NVPTX/bfe.ll
index 2e816fec2c591..37bbc9fa3c370 100644
--- a/llvm/test/CodeGen/NVPTX/bfe.ll
+++ b/llvm/test/CodeGen/NVPTX/bfe.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 
 ; CHECK: bfe0

diff  --git a/llvm/test/CodeGen/NVPTX/branch-fold.ll b/llvm/test/CodeGen/NVPTX/branch-fold.ll
index 336147f1f9925..72e3d020a9015 100644
--- a/llvm/test/CodeGen/NVPTX/branch-fold.ll
+++ b/llvm/test/CodeGen/NVPTX/branch-fold.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -disable-cgp -verify-machineinstrs | %ptxas-verify -arch=sm_35 %}
+
 ; Disable CGP which also folds branches, so that only BranchFolding is under
 ; the spotlight.
 

diff  --git a/llvm/test/CodeGen/NVPTX/bug17709.ll b/llvm/test/CodeGen/NVPTX/bug17709.ll
index 6d747f09d8a7e..cc5c11bde19c0 100644
--- a/llvm/test/CodeGen/NVPTX/bug17709.ll
+++ b/llvm/test/CodeGen/NVPTX/bug17709.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 ; ModuleID = '__kernelgen_main_module'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

diff  --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 03cde33f31c16..5a8a9518bc510 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -nvptx-lower-args -S | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"

diff  --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index 70e7e12336e71..cda3e7743827c 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index b200fd794d540..ddf4c9fce1451 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index 55e9dad96c01c..86925578c352e 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
 ; extending variants in the backend, but the ldg/ldu selection code may pick

diff  --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 30313481deb0e..ec10ba7ef5550 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 ; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit
 ; registers in the backend, so these loads need special handling.

diff  --git a/llvm/test/CodeGen/NVPTX/bug41651.ll b/llvm/test/CodeGen/NVPTX/bug41651.ll
index 6039d39407c36..b093ab7b2c2a8 100644
--- a/llvm/test/CodeGen/NVPTX/bug41651.ll
+++ b/llvm/test/CodeGen/NVPTX/bug41651.ll
@@ -1,4 +1,6 @@
 ; RUN: llc -filetype=asm -o - %s | FileCheck %s
+; RUN: %if ptxas %{ llc -filetype=asm -o - %s | %ptxas-verify %}
+
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/bypass-div.ll b/llvm/test/CodeGen/NVPTX/bypass-div.ll
index bd98c9a5b0b10..3c8f5d0bc6c64 100644
--- a/llvm/test/CodeGen/NVPTX/bypass-div.ll
+++ b/llvm/test/CodeGen/NVPTX/bypass-div.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 ; 64-bit divides and rems should be split into a fast and slow path where
 ; the fast path uses a 32-bit operation.

diff  --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index c165e05706306..882704918a49c 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Checks how NVPTX lowers alloca buffers and their passing to functions.
 ;

diff  --git a/llvm/test/CodeGen/NVPTX/callchain.ll b/llvm/test/CodeGen/NVPTX/callchain.ll
index 60b118b6a1993..a6f08f1f21be5 100644
--- a/llvm/test/CodeGen/NVPTX/callchain.ll
+++ b/llvm/test/CodeGen/NVPTX/callchain.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx"
 

diff  --git a/llvm/test/CodeGen/NVPTX/calling-conv.ll b/llvm/test/CodeGen/NVPTX/calling-conv.ll
index 3b03442ad8bdc..962138b2c0bb2 100644
--- a/llvm/test/CodeGen/NVPTX/calling-conv.ll
+++ b/llvm/test/CodeGen/NVPTX/calling-conv.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 
 ;; Kernel function using ptx_kernel calling conv

diff  --git a/llvm/test/CodeGen/NVPTX/calls-with-phi.ll b/llvm/test/CodeGen/NVPTX/calls-with-phi.ll
index 6e010ea9adc34..dcc31af57cdd7 100644
--- a/llvm/test/CodeGen/NVPTX/calls-with-phi.ll
+++ b/llvm/test/CodeGen/NVPTX/calls-with-phi.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+
 ; Make sure the example doesn't crash with segfault
 
 ; CHECK: .visible .func ({{.*}}) loop

diff  --git a/llvm/test/CodeGen/NVPTX/combine-min-max.ll b/llvm/test/CodeGen/NVPTX/combine-min-max.ll
index b478192f1da71..e5dba45de0b4b 100644
--- a/llvm/test/CodeGen/NVPTX/combine-min-max.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-min-max.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O2 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O2 | %ptxas-verify %}
 
 ; *************************************
 ; * Cases with no min/max

diff  --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll
index e4e0601db59fd..3f82aec7563e0 100644
--- a/llvm/test/CodeGen/NVPTX/compare-int.ll
+++ b/llvm/test/CodeGen/NVPTX/compare-int.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ;; These tests should run for all targets
 

diff  --git a/llvm/test/CodeGen/NVPTX/constant-vectors.ll b/llvm/test/CodeGen/NVPTX/constant-vectors.ll
index 208c2d970f318..1cffb5afc19a0 100644
--- a/llvm/test/CodeGen/NVPTX/constant-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/constant-vectors.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-nvidia-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/convert-fp.ll b/llvm/test/CodeGen/NVPTX/convert-fp.ll
index fd28a4f7cc67b..7f9c40d7f26ab 100644
--- a/llvm/test/CodeGen/NVPTX/convert-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-fp.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define i16 @cvt_u16_f32(float %x) {
 ; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};

diff  --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
index 57a231629e000..38a2c6b361190 100644
--- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 
 ;; Integer conversions happen inplicitly by loading/storing the proper types

diff  --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
index 81893fdfe790f..6aac2dd18775e 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 
 ; CHECK-LABEL: cvt_rn_bf16x2_f32

diff  --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 13eef2440e69c..504eb0be8804c 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/ctpop.ll b/llvm/test/CodeGen/NVPTX/ctpop.ll
index 69a4f879a8d83..e89f2d6fcf27e 100644
--- a/llvm/test/CodeGen/NVPTX/ctpop.ll
+++ b/llvm/test/CodeGen/NVPTX/ctpop.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/cttz.ll b/llvm/test/CodeGen/NVPTX/cttz.ll
index 0bfe0139bcdf9..37e85349c42b6 100644
--- a/llvm/test/CodeGen/NVPTX/cttz.ll
+++ b/llvm/test/CodeGen/NVPTX/cttz.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/disable-opt.ll b/llvm/test/CodeGen/NVPTX/disable-opt.ll
index 15e4913c16952..0b36120f760f5 100644
--- a/llvm/test/CodeGen/NVPTX/disable-opt.ll
+++ b/llvm/test/CodeGen/NVPTX/disable-opt.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 | %ptxas-verify %}
 
 define void @foo(i32* %output) {
 ; CHECK-LABEL: .visible .func foo(

diff  --git a/llvm/test/CodeGen/NVPTX/div-ri.ll b/llvm/test/CodeGen/NVPTX/div-ri.ll
index 7f796e0239fce..c95350349c865 100644
--- a/llvm/test/CodeGen/NVPTX/div-ri.ll
+++ b/llvm/test/CodeGen/NVPTX/div-ri.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | %ptxas-verify %}
 
 define float @foo(float %a) {
 ; CHECK: div.approx.f32

diff  --git a/llvm/test/CodeGen/NVPTX/divrem-combine.ll b/llvm/test/CodeGen/NVPTX/divrem-combine.ll
index 64257af57e92c..856f20d95a170 100644
--- a/llvm/test/CodeGen/NVPTX/divrem-combine.ll
+++ b/llvm/test/CodeGen/NVPTX/divrem-combine.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -O2 < %s -march=nvptx -mcpu=sm_35 | FileCheck %s --check-prefix=O2 --check-prefix=CHECK
 ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_35 | FileCheck %s --check-prefix=O0 --check-prefix=CHECK
+; RUN: %if ptxas %{ llc -O2 < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 ; The following IR
 ;

diff  --git a/llvm/test/CodeGen/NVPTX/envreg.ll b/llvm/test/CodeGen/NVPTX/envreg.ll
index 8ab5816e68ed2..5bd4b0cfee67e 100644
--- a/llvm/test/CodeGen/NVPTX/envreg.ll
+++ b/llvm/test/CodeGen/NVPTX/envreg.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 
 declare i32 @llvm.nvvm.read.ptx.sreg.envreg0()

diff  --git a/llvm/test/CodeGen/NVPTX/extloadv.ll b/llvm/test/CodeGen/NVPTX/extloadv.ll
index 8c264ae093316..8c00077a2b545 100644
--- a/llvm/test/CodeGen/NVPTX/extloadv.ll
+++ b/llvm/test/CodeGen/NVPTX/extloadv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 define void @foo(float* nocapture readonly %x_value, double* nocapture %output) #0 {
   %1 = bitcast float* %x_value to <4 x float>*

diff  --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
index 24238809e4933..da11e9dff80db 100644
--- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
 
 declare half @llvm.nvvm.ex2.approx.f16(half)
 declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)

diff  --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 3601929e00e72..994b99da5a95a 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -2,20 +2,41 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
 ; ## Full FP16 with FTZ
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN:          -denormal-fp-math-f32=preserve-sign \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN:          -denormal-fp-math-f32=preserve-sign                           \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
 ; ## FP16 support explicitly disabled.
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
 ; RUN:           -verify-machineinstrs \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math   \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
 ; ## FP16 is not supported by hardware.
 ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
 ; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                               \
+; RUN:   llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs         \
+; RUN:   | %ptxas-verify -arch=sm_52                                              \
+; RUN: %}
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 

diff  --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index db2259535d602..44d83b967054e 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -2,15 +2,31 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
 ; ## FP16 support explicitly disabled.
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
 ; RUN:           -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math   \
+; RUN:           -verify-machineinstrs                                        \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
 ; ## FP16 is not supported by hardware.
 ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
 ; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                               \
+; RUN:   llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs         \
+; RUN:   | %ptxas-verify -arch=sm_52                                              \
+; RUN: %}
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 

diff  --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index a29d70a0b55bd..ceeef54a9295f 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)

diff  --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
index df86d476efdce..d4e55351b85c2 100644
--- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %}
 
 define ptx_device float @t1_f32(float %x, float %y, float %z,
                                 float %u, float %v) {

diff  --git a/llvm/test/CodeGen/NVPTX/fma-disable.ll b/llvm/test/CodeGen/NVPTX/fma-disable.ll
index bdd74017f9d2f..f693d45ff8a4c 100644
--- a/llvm/test/CodeGen/NVPTX/fma-disable.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-disable.ll
@@ -2,6 +2,10 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %}
 
 define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
 entry:

diff  --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 351f9b20dc0c2..2dd85ee989217 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | %ptxas-verify %}
 
 declare float @dummy_f32(float, float) #0
 declare double @dummy_f64(double, double) #0

diff  --git a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
index 745a5ec08f232..6d57e0eec2770 100644
--- a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx | FileCheck %s --check-prefixes=CHECK,CHECK-NONAN
 ; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK-NAN
+; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 
 ; ---- minimum ----
 

diff  --git a/llvm/test/CodeGen/NVPTX/fns.ll b/llvm/test/CodeGen/NVPTX/fns.ll
index 7673e43449c30..d67465ef3f2e8 100644
--- a/llvm/test/CodeGen/NVPTX/fns.ll
+++ b/llvm/test/CodeGen/NVPTX/fns.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare i32 @llvm.nvvm.fns(i32, i32, i32)
 

diff  --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll
index 3f68b188ba75a..ac751c13bb899 100644
--- a/llvm/test/CodeGen/NVPTX/fp-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-contract.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 target triple = "nvptx64-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/fp-literals.ll b/llvm/test/CodeGen/NVPTX/fp-literals.ll
index 755e0f9250a14..58313b7e47a87 100644
--- a/llvm/test/CodeGen/NVPTX/fp-literals.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-literals.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
 
 target triple = "nvptx64-unknown-cuda"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

diff  --git a/llvm/test/CodeGen/NVPTX/fp16.ll b/llvm/test/CodeGen/NVPTX/fp16.ll
index b85eed0f6c7fb..fc16755c93803 100644
--- a/llvm/test/CodeGen/NVPTX/fp16.ll
+++ b/llvm/test/CodeGen/NVPTX/fp16.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s | %ptxas-verify %}
 
 declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
 declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone

diff  --git a/llvm/test/CodeGen/NVPTX/function-align.ll b/llvm/test/CodeGen/NVPTX/function-align.ll
index e7abfb128f58f..b65fc12d8e929 100644
--- a/llvm/test/CodeGen/NVPTX/function-align.ll
+++ b/llvm/test/CodeGen/NVPTX/function-align.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-NOT: .align 2
 define ptx_device void @foo() align 2 {

diff  --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 5b29b219e9fb3..0ebec35922e43 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/global-addrspace.ll b/llvm/test/CodeGen/NVPTX/global-addrspace.ll
index 4da14c7ff4fe2..f3a711a931a73 100644
--- a/llvm/test/CodeGen/NVPTX/global-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/global-addrspace.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; PTX32: .visible .global .align 4 .u32 i;
 ; PTX32: .visible .const .align 4 .u32 j;

diff  --git a/llvm/test/CodeGen/NVPTX/global-ordering.ll b/llvm/test/CodeGen/NVPTX/global-ordering.ll
index 43394a79e9122..daea215ebd879 100644
--- a/llvm/test/CodeGen/NVPTX/global-ordering.ll
+++ b/llvm/test/CodeGen/NVPTX/global-ordering.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Make sure we emit these globals in def-use order
 

diff  --git a/llvm/test/CodeGen/NVPTX/global-variable-big.ll b/llvm/test/CodeGen/NVPTX/global-variable-big.ll
index 0c769a856080e..81171e8245d1e 100644
--- a/llvm/test/CodeGen/NVPTX/global-variable-big.ll
+++ b/llvm/test/CodeGen/NVPTX/global-variable-big.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/global-visibility.ll b/llvm/test/CodeGen/NVPTX/global-visibility.ll
index 90af2950fb409..ef88da7b896c1 100644
--- a/llvm/test/CodeGen/NVPTX/global-visibility.ll
+++ b/llvm/test/CodeGen/NVPTX/global-visibility.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; PTX does not support .hidden or .protected.
 ; Make sure we do not emit them.

diff  --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll
index 47434093c9617..b482843773bcd 100644
--- a/llvm/test/CodeGen/NVPTX/globals_init.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_init.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; Make sure the globals constant initializers are not prone to host endianess 
 ; issues.

diff  --git a/llvm/test/CodeGen/NVPTX/globals_lowering.ll b/llvm/test/CodeGen/NVPTX/globals_lowering.ll
index 7c5ae6c549c89..07fa9cabbfaf8 100644
--- a/llvm/test/CodeGen/NVPTX/globals_lowering.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_lowering.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static | FileCheck %s --check-prefix CHK
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -relocation-model=static | %ptxas-verify %}
 
 %MyStruct = type { i32, i32, float }
 @Gbl = internal addrspace(3) global [1024 x %MyStruct] zeroinitializer

diff  --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll
index 2b6ca615bfc28..33764718476f5 100644
--- a/llvm/test/CodeGen/NVPTX/half.ll
+++ b/llvm/test/CodeGen/NVPTX/half.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
 
 ; CHECK: .b8 half_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
 @"half_array" = addrspace(1) constant [4 x half]

diff  --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll
index 35d77b4b44d26..2f7ec7675b2b8 100644
--- a/llvm/test/CodeGen/NVPTX/i1-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-global.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
index 3979179399ee6..6c69e40832de2 100644
--- a/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-int-to-fp.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: foo
 ; CHECK: setp

diff  --git a/llvm/test/CodeGen/NVPTX/i1-param.ll b/llvm/test/CodeGen/NVPTX/i1-param.ll
index aac71960551f8..a759d27f1b1f2 100644
--- a/llvm/test/CodeGen/NVPTX/i1-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-param.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/i128-global.ll b/llvm/test/CodeGen/NVPTX/i128-global.ll
index f53575d4ddb36..cdebc65272f86 100644
--- a/llvm/test/CodeGen/NVPTX/i128-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-global.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 @G1 = global i128 1

diff  --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll
index 7cb603546aedf..6a514e74bd2fd 100644
--- a/llvm/test/CodeGen/NVPTX/i128-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-param.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func callee(
 ; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],

diff  --git a/llvm/test/CodeGen/NVPTX/i128-retval.ll b/llvm/test/CodeGen/NVPTX/i128-retval.ll
index 015b0199d835e..9a7fd8cc138b4 100644
--- a/llvm/test/CodeGen/NVPTX/i128-retval.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-retval.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee(
 define i128 @callee(i128) {

diff  --git a/llvm/test/CodeGen/NVPTX/i128-struct.ll b/llvm/test/CodeGen/NVPTX/i128-struct.ll
index c619be40f3694..cecfd4f6ce42a 100644
--- a/llvm/test/CodeGen/NVPTX/i128-struct.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-struct.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[32]) foo(
 define { i128, i128 } @foo(i64 %a, i32 %b) {

diff  --git a/llvm/test/CodeGen/NVPTX/i8-param.ll b/llvm/test/CodeGen/NVPTX/i8-param.ll
index c41da0eebd1f8..18d4a71b16607 100644
--- a/llvm/test/CodeGen/NVPTX/i8-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i8-param.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll
index 047325c85165a..f82dac213c465 100644
--- a/llvm/test/CodeGen/NVPTX/idioms.ll
+++ b/llvm/test/CodeGen/NVPTX/idioms.ll
@@ -2,6 +2,8 @@
 
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: abs_i16(
 define i16 @abs_i16(i16 %a) {

diff  --git a/llvm/test/CodeGen/NVPTX/imad.ll b/llvm/test/CodeGen/NVPTX/imad.ll
index 67421c7cac4b7..b3be772c5fa75 100644
--- a/llvm/test/CodeGen/NVPTX/imad.ll
+++ b/llvm/test/CodeGen/NVPTX/imad.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: imad
 define i32 @imad(i32 %a, i32 %b, i32 %c) {

diff  --git a/llvm/test/CodeGen/NVPTX/inline-asm.ll b/llvm/test/CodeGen/NVPTX/inline-asm.ll
index 6f0578d4cff41..aa27bf1532565 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 define float @test(float %x) {
 entry:

diff  --git a/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll b/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
index cb1c6e20bcac9..c054fe86cfc58 100644
--- a/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
+++ b/llvm/test/CodeGen/NVPTX/inlineasm-output-template.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=nvptx < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -march=nvptx < %s | %ptxas-verify %}
 
 ; Test that %c works with immediates
 ; CHECK-LABEL: test_inlineasm_c_output_template0

diff  --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
index e16786d70fbd6..b9fed160168d4 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -10,6 +10,8 @@
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \
 ; RUN:    -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \
 ; RUN:   | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define ptx_device i32 @test_tid_x() {
 ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;

diff  --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 7172210ec8b17..100465d3fdba0 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: test_fabsf(
 define float @test_fabsf(float %f) {

diff  --git a/llvm/test/CodeGen/NVPTX/isspacep.ll b/llvm/test/CodeGen/NVPTX/isspacep.ll
index 47fa7a6714df5..fe3741012ba30 100644
--- a/llvm/test/CodeGen/NVPTX/isspacep.ll
+++ b/llvm/test/CodeGen/NVPTX/isspacep.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 declare i1 @llvm.nvvm.isspacep.const(i8*) readnone noinline
 declare i1 @llvm.nvvm.isspacep.global(i8*) readnone noinline

diff  --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
index 4a2eb3978751c..475405ec48502 100644
--- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -1,6 +1,9 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
 
 
 ;; i8

diff  --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll
index 44cfe6551b99c..472f3e88efee6 100644
--- a/llvm/test/CodeGen/NVPTX/ld-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 
 ;; i8

diff  --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
index 1f8cee7c53f2c..8ed4406fe9b3d 100644
--- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
+++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
@@ -4,6 +4,8 @@
 # RUN: %python %s > %t.ll
 # RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll
 # RUN: llc < %t.ll -march=nvptx -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll
+# RUN: %if ptxas %{ llc < %t.ll -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
+# RUN: %if ptxas %{ llc < %t.ll -march=nvptx -mcpu=sm_30 | %ptxas-verify %}
 
 from __future__ import print_function
 

diff  --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index ec7a857ea86ce..0ef9317f4d0cb 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 ; Check that invariant loads from the global addrspace are lowered to
 ; ld.global.nc.

diff  --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index 4d082f6e9a58d..e9038412de380 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 declare <4 x float> @bar()
 

diff  --git a/llvm/test/CodeGen/NVPTX/ldu-i8.ll b/llvm/test/CodeGen/NVPTX/ldu-i8.ll
index 36c99b30425d8..88c10504036ba 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-i8.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 4bfd68c224284..ab7767cfcc09b 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)

diff  --git a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index ec96a493021aa..dd5d4d5702d6e 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll b/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
index 1703dbdf3a8ae..1847b5d774fa6 100644
--- a/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
+++ b/llvm/test/CodeGen/NVPTX/libcall-fulfilled.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
+
 ; Allow to make libcalls that are defined in the current module
 
 declare i8* @malloc(i64)

diff  --git a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
index 9fc98a45f59ac..e0044404c6626 100644
--- a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
+++ b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index a3fb172646af0..de61b9ef9deb0 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: plain
 define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr {

diff  --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index d93499b47f594..6783d7d3467cf 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefix=SM20 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck -check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"

diff  --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index 3be81400fd2ee..2150a9ed340db 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 ; Ensure we access the local stack properly
 

diff  --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index 80f9107472fb3..e6da3f42c7054 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | %ptxas-verify -arch=sm_35 %}
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
 ; llvm.mem* intrinsics get lowered to loops.

diff  --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 3db225ef0e75c..f32d3bad0e7f8 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"

diff  --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 078b3fea4c7a3..75c1cc55b7879 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefix IR
 ; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index cfba9102ea19b..95dc45a039914 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
 ; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 %struct.ham = type { [4 x i32] }
 

diff  --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 5632ddc83a0f7..6f33d2fad7a6e 100644
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/machine-sink.ll b/llvm/test/CodeGen/NVPTX/machine-sink.ll
index 65ba141c41d9d..e4b2ea83249a0 100644
--- a/llvm/test/CodeGen/NVPTX/machine-sink.ll
+++ b/llvm/test/CodeGen/NVPTX/machine-sink.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/managed.ll b/llvm/test/CodeGen/NVPTX/managed.ll
index 7dc02d6026621..283b1f908696f 100644
--- a/llvm/test/CodeGen/NVPTX/managed.ll
+++ b/llvm/test/CodeGen/NVPTX/managed.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 ; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s --check-prefix ERROR
 ; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30

diff  --git a/llvm/test/CodeGen/NVPTX/match.ll b/llvm/test/CodeGen/NVPTX/match.ll
index 4c94d46a7e847..edba9eed7ec25 100644
--- a/llvm/test/CodeGen/NVPTX/match.ll
+++ b/llvm/test/CodeGen/NVPTX/match.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
 
 declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32)
 declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64)

diff  --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll
index a6d1ee93c6ec6..2695d434c33c5 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify -arch=sm_53 %}
 
 declare half @llvm.nvvm.fma.rn.f16(half, half, half)
 declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half)

diff  --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
index e986e84233c77..34b9c08509326 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare i16 @llvm.nvvm.abs.bf16(i16)
 declare i32 @llvm.nvvm.abs.bf16x2(i32)

diff  --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
index 2d894ca78fadf..b745df484bab2 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s
+; RUN: %if ptxas-11.2 %{ llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %}
 
 declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half)
 declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half)

diff  --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index 3ab63a04a4674..efd8ed0a37b7a 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -1,6 +1,10 @@
 ; RUN: llc < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
 ; RUN: llc < %s -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK-F16
 ; RUN: llc < %s -mcpu=sm_80 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %}
+
 target triple = "nvptx64-nvidia-cuda"
 
 ; Checks that llvm intrinsics for math functions are correctly lowered to PTX.

diff  --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll
index 160c4030d26ca..ae97f96a2d170 100644
--- a/llvm/test/CodeGen/NVPTX/mbarrier.ll
+++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 
 declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
 declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)

diff  --git a/llvm/test/CodeGen/NVPTX/minmax-negative.ll b/llvm/test/CodeGen/NVPTX/minmax-negative.ll
index 73bd9e049e7a7..9567d98e81725 100644
--- a/llvm/test/CodeGen/NVPTX/minmax-negative.ll
+++ b/llvm/test/CodeGen/NVPTX/minmax-negative.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -O0 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -O0 | %ptxas-verify %}
 
 define i16 @test1(i16* %sur1) {
 ; CHECK-NOT: mov.u16 %rs{{[0-9]+}}, 32767

diff  --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index 036d9638ceac7..2b2ab8f70d084 100644
--- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/module-inline-asm.ll b/llvm/test/CodeGen/NVPTX/module-inline-asm.ll
index a2ce6a7f87b38..317542dba687d 100644
--- a/llvm/test/CodeGen/NVPTX/module-inline-asm.ll
+++ b/llvm/test/CodeGen/NVPTX/module-inline-asm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll
index 1ddf9739e2025..c4205691a0b4c 100644
--- a/llvm/test/CodeGen/NVPTX/mulwide.ll
+++ b/llvm/test/CodeGen/NVPTX/mulwide.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O3 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O0 | %ptxas-verify %}
 
 ; OPT-LABEL: @mulwide16
 ; NOOPT-LABEL: @mulwide16

diff  --git a/llvm/test/CodeGen/NVPTX/named-barriers.ll b/llvm/test/CodeGen/NVPTX/named-barriers.ll
index accc0fd6fef74..5c3af1cd2f352 100644
--- a/llvm/test/CodeGen/NVPTX/named-barriers.ll
+++ b/llvm/test/CodeGen/NVPTX/named-barriers.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Use bar.sync to arrive at a pre-computed barrier number and
 ; wait for all threads in CTA to also arrive:

diff  --git a/llvm/test/CodeGen/NVPTX/no-extra-parens.ll b/llvm/test/CodeGen/NVPTX/no-extra-parens.ll
index 22a89b8096306..b695c69cb0e34 100644
--- a/llvm/test/CodeGen/NVPTX/no-extra-parens.ll
+++ b/llvm/test/CodeGen/NVPTX/no-extra-parens.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; ptxas has no special meaning for '$' character, so it should be used
 ; without parens.

diff  --git a/llvm/test/CodeGen/NVPTX/nofunc.ll b/llvm/test/CodeGen/NVPTX/nofunc.ll
index e9dcffbdb7fae..9ae97993fd255 100644
--- a/llvm/test/CodeGen/NVPTX/nofunc.ll
+++ b/llvm/test/CodeGen/NVPTX/nofunc.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Test that we don't crash if we're compiling a module with function references,
 ; but without any functions in it.

diff  --git a/llvm/test/CodeGen/NVPTX/nounroll.ll b/llvm/test/CodeGen/NVPTX/nounroll.ll
index e80a4a21f161a..474c38f8d85ac 100644
--- a/llvm/test/CodeGen/NVPTX/nounroll.ll
+++ b/llvm/test/CodeGen/NVPTX/nounroll.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"

diff  --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
index d1eecc855fcb4..702f4c1840569 100644
--- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 

diff  --git a/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll b/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
index ef8f74020508b..8d28b9cab1e75 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-annotations-D120129.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %}
 ;
 ; NVPTXTargetLowering::getFunctionParamOptimizedAlign, which was introduces in
 ; D120129, contained a poorly designed assertion checking that a function with

diff  --git a/llvm/test/CodeGen/NVPTX/param-align.ll b/llvm/test/CodeGen/NVPTX/param-align.ll
index b5b27770c455c..40a523f819493 100644
--- a/llvm/test/CodeGen/NVPTX/param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/param-align.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ;;; Need 4-byte alignment on float* passed byval
 define ptx_device void @t1(float* byval(float) %x) {

diff  --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 099a26afb940b..a99b79596f9b3 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -1,5 +1,6 @@
 ; Verifies correctness of load/store of parameters and return values.
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify -arch=sm_35 %}
 
 %s_i1 = type { i1 }
 %s_i8 = type { i8 }

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index ecec6f0ba0a44..48841952c983d 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %}
 ;
 ; Check that parameters of a __device__ function with private or internal
 ; linkage called from a __global__ (kernel) function get increased alignment,

diff  --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
index 1d1e032b1160c..37ffa4e077cec 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx-unknown-unknown | %ptxas-verify %}
 ;
 ; Check that parameters of a __global__ (kernel) function do not get increased
 ; alignment, and no additional vectorization is performed on loads/stores with

diff  --git a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
index 824327ebcc74c..cef5286949ca5 100644
--- a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
+++ b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -march=nvptx -verify-machineinstrs < %s | %ptxas-verify %}
 
 ; Tests the following pattern:
 ; (X & 8) != 0 --> (X & 8) >> 3

diff  --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
index 934df30a3a7dc..a6ca645193441 100644
--- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 define ptx_kernel void @t1(i1* %a) {
 ; PTX32:      mov.u16 %rs{{[0-9]+}}, 0;

diff  --git a/llvm/test/CodeGen/NVPTX/pr16278.ll b/llvm/test/CodeGen/NVPTX/pr16278.ll
index a836eaf2e51f9..044f8b3a0af50 100644
--- a/llvm/test/CodeGen/NVPTX/pr16278.ll
+++ b/llvm/test/CodeGen/NVPTX/pr16278.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 @one_f = addrspace(4) global float 1.000000e+00, align 4
 

diff  --git a/llvm/test/CodeGen/NVPTX/pr17529.ll b/llvm/test/CodeGen/NVPTX/pr17529.ll
index d605c29fe5408..eb332bc8bcf7a 100644
--- a/llvm/test/CodeGen/NVPTX/pr17529.ll
+++ b/llvm/test/CodeGen/NVPTX/pr17529.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
index cb9e6dc3a45c2..b07b36c779300 100644
--- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
+++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 ; Check load from constant global variables.  These loads should be
 ; ld.global.nc (aka ldg).

diff  --git a/llvm/test/CodeGen/NVPTX/redux-sync.ll b/llvm/test/CodeGen/NVPTX/redux-sync.ll
index 43a304ea8c702..5934d0f0678b8 100644
--- a/llvm/test/CodeGen/NVPTX/redux-sync.ll
+++ b/llvm/test/CodeGen/NVPTX/redux-sync.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
 
 declare i32 @llvm.nvvm.redux.sync.umin(i32, i32)
 ; CHECK-LABEL: .func{{.*}}redux_sync_min_u32

diff  --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll
index 0432b67535c0f..222c479832cf1 100644
--- a/llvm/test/CodeGen/NVPTX/refl1.ll
+++ b/llvm/test/CodeGen/NVPTX/refl1.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-nvidia-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/reg-copy.ll b/llvm/test/CodeGen/NVPTX/reg-copy.ll
index 98ee49d39023f..2f096f95881d0 100644
--- a/llvm/test/CodeGen/NVPTX/reg-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-copy.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"

diff  --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll
index 25494fa67803a..41effcb8011b1 100644
--- a/llvm/test/CodeGen/NVPTX/reg-types.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-types.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT
 ; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func func()
 ; NO8BIT-LABEL: .visible .func func()

diff  --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index dfc8b4fd5fcb9..b80bee3d45d29 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
 ; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_35 | %ptxas-verify -arch=sm_35 %}
 
 
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index 1ba0dfa90e021..84982643099ab 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=nvptx | FileCheck %s
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}
 
 declare i64 @llvm.nvvm.rotate.b64(i64, i32)
 declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/sched1.ll b/llvm/test/CodeGen/NVPTX/sched1.ll
index fb01eb262adce..9eddc662cbbdc 100644
--- a/llvm/test/CodeGen/NVPTX/sched1.ll
+++ b/llvm/test/CodeGen/NVPTX/sched1.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; Ensure source scheduling is working
 

diff  --git a/llvm/test/CodeGen/NVPTX/sched2.ll b/llvm/test/CodeGen/NVPTX/sched2.ll
index 91ed77878f81c..d43d76b48391f 100644
--- a/llvm/test/CodeGen/NVPTX/sched2.ll
+++ b/llvm/test/CodeGen/NVPTX/sched2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 define void @foo(<2 x i32>* %a) {
 ; CHECK: .func foo

diff  --git a/llvm/test/CodeGen/NVPTX/sext-in-reg.ll b/llvm/test/CodeGen/NVPTX/sext-in-reg.ll
index b516dfaf39a07..4afa35d5d75bc 100644
--- a/llvm/test/CodeGen/NVPTX/sext-in-reg.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-in-reg.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/sext-params.ll b/llvm/test/CodeGen/NVPTX/sext-params.ll
index a559630f3591c..7b0fe7c6e381e 100644
--- a/llvm/test/CodeGen/NVPTX/sext-params.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-params.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/shfl-p.ll b/llvm/test/CodeGen/NVPTX/shfl-p.ll
index 45331c326fee8..ce6e7b79d16cf 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-p.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-p.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare {i32, i1} @llvm.nvvm.shfl.down.i32p(i32, i32, i32)
 declare {float, i1} @llvm.nvvm.shfl.down.f32p(float, i32, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
index 9ef7162170e47..5e0d6c5f02161 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)
 declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/shfl-sync.ll b/llvm/test/CodeGen/NVPTX/shfl-sync.ll
index 84a461f640217..5ea60711de1f9 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-sync.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-sync.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32)
 declare float @llvm.nvvm.shfl.sync.down.f32(float, i32, i32, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/shfl.ll b/llvm/test/CodeGen/NVPTX/shfl.ll
index e651676d3a9e0..5008d14a82324 100644
--- a/llvm/test/CodeGen/NVPTX/shfl.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
 declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)

diff  --git a/llvm/test/CodeGen/NVPTX/shift-parts.ll b/llvm/test/CodeGen/NVPTX/shift-parts.ll
index b4d408ff59727..794890175dc65 100644
--- a/llvm/test/CodeGen/NVPTX/shift-parts.ll
+++ b/llvm/test/CodeGen/NVPTX/shift-parts.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: shift_parts_left_128
 define void @shift_parts_left_128(i128* %val, i128* %amtptr) {

diff  --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll
index 8ff0b5da5bccc..f3be040b81d46 100644
--- a/llvm/test/CodeGen/NVPTX/simple-call.ll
+++ b/llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
-
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 ; CHECK: .func ({{.*}}) device_func
 define float @device_func(float %a) noinline {

diff  --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
index a7015e2406cb2..5069108c9c7d6 100644
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -1,5 +1,9 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
 ; RUN:   | FileCheck %s
+; RUN: %if ptxas %{                                                                   \
+; RUN:   llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
+; RUN:   | %ptxas-verify                                                              \
+; RUN: %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
index 6c5df1c27e60c..9840103e0be2c 100644
--- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
@@ -1,7 +1,9 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
 
 ;; i8
 ; ALL-LABEL: st_global_i8

diff  --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll
index 022f7ab214ca1..5e0e337df2367 100644
--- a/llvm/test/CodeGen/NVPTX/st-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/st-generic.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ;; i8
 

diff  --git a/llvm/test/CodeGen/NVPTX/store-retval.ll b/llvm/test/CodeGen/NVPTX/store-retval.ll
index 8efe126dac6d2..8f6749e980efe 100644
--- a/llvm/test/CodeGen/NVPTX/store-retval.ll
+++ b/llvm/test/CodeGen/NVPTX/store-retval.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s --mtriple=nvptx-unknown-unknown | FileCheck %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx-unknown-unknown | %ptxas-verify %}
 ;
 ; This is IR generated with clang using -O3 optimization level
 ; and nvptx-unknown-unknown target from the following C code.

diff  --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index fee61951bb56d..d35c8e5316d7c 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index 9eeb1c0fc70a4..784de11864276 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 

diff  --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 7a251a24346a2..c0b40f1d927bb 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -1,11 +1,13 @@
 # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll
 # RUN: llc %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll
+# RUN: %if ptxas %{ llc %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %}
 
 # We only need to run this second time for texture tests, because
 # there is a 
diff erence between unified and non-unified intrinsics.
 #
 # RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll
 # RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll
+# RUN: %if ptxas %{ llc %t-nvcl.ll -verify-machineinstrs -o - | %ptxas-verify %}
 
 # Verify that all instructions and intrinsics defined in TableGen
 # files are tested. The command may fail if the files are changed

diff  --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 6c77616e86721..bfe8c42e4a76a 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/surf-write.ll b/llvm/test/CodeGen/NVPTX/surf-write.ll
index d5180e6c5d468..77fc0e90609cb 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 

diff  --git a/llvm/test/CodeGen/NVPTX/symbol-naming.ll b/llvm/test/CodeGen/NVPTX/symbol-naming.ll
index 3f1caf927dc81..68046167e7c47 100644
--- a/llvm/test/CodeGen/NVPTX/symbol-naming.ll
+++ b/llvm/test/CodeGen/NVPTX/symbol-naming.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; Verify that the NVPTX target removes invalid symbol names prior to emitting
 ; PTX.

diff  --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index 5a8261ce15fe7..12cf06fd4d6d7 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 
 target triple = "nvptx-unknown-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index 8638a42dc0eff..dc750e69615aa 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-nvcl"
 

diff  --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
index 203d0973b7cd8..b685bcab6fd13 100644
--- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/tid-range.ll b/llvm/test/CodeGen/NVPTX/tid-range.ll
index 3dc4008810a1f..c4dd33960d44a 100644
--- a/llvm/test/CodeGen/NVPTX/tid-range.ll
+++ b/llvm/test/CodeGen/NVPTX/tid-range.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
+
 declare i32 @get_register()
 
 define i1 @test1() {

diff  --git a/llvm/test/CodeGen/NVPTX/tuple-literal.ll b/llvm/test/CodeGen/NVPTX/tuple-literal.ll
index 2b1f2c4b6680a..b0eed3ad8ba4b 100644
--- a/llvm/test/CodeGen/NVPTX/tuple-literal.ll
+++ b/llvm/test/CodeGen/NVPTX/tuple-literal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
 
 define ptx_device void @test_function({i8, i8}*) {
   ret void

diff  --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index bf26e5ff1bdbb..c8b64426f05f1 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 

diff  --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll
index a86ba1e29d5cd..f8dc9f3b693ef 100644
--- a/llvm/test/CodeGen/NVPTX/vec8.ll
+++ b/llvm/test/CodeGen/NVPTX/vec8.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll
index c6c8e73bf83ec..c8eb714c7e3ce 100644
--- a/llvm/test/CodeGen/NVPTX/vector-args.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-args.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 define float @foo(<2 x float> %a) {
 ; CHECK: .func (.param .b32 func_retval0) foo

diff  --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll
index d1ec8d25a107d..5d1192724362e 100644
--- a/llvm/test/CodeGen/NVPTX/vector-call.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-call.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
 
 target triple = "nvptx-unknown-cuda"
 

diff  --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll
index 2992b0e62c56a..4908394a1e3fc 100644
--- a/llvm/test/CodeGen/NVPTX/vector-compare.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
 
 ; This test makes sure that the result of vector compares are properly
 ; scalarized.  If codegen fails, then the type legalizer incorrectly

diff  --git a/llvm/test/CodeGen/NVPTX/vector-global.ll b/llvm/test/CodeGen/NVPTX/vector-global.ll
index a463bee3a4799..ffb3857c7a41e 100644
--- a/llvm/test/CodeGen/NVPTX/vector-global.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-global.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

diff  --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index e24848b99dafa..82238e98e8214 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; Even though general vector types are not supported in PTX, we can still
 ; optimize loads/stores with pseudo-vector instructions of the form:

diff  --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll
index 1e81031c685a6..0d4d45611ef41 100644
--- a/llvm/test/CodeGen/NVPTX/vector-select.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-select.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 %if ptxas %{ | %ptxas-verify %}
 
 ; This test makes sure that vector selects are scalarized by the type legalizer.
 ; If not, type legalization will fail.

diff  --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index 49418122da55c..5624d7e9565e6 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .visible .func foo1
 ; CHECK: st.v2.f32

diff  --git a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
index cffec02e52a0d..44f6bda0eb178 100644
--- a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
+++ b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
 target triple = "nvptx64-nvidia-cuda"
 
 ; CHECK-LABEL: test1

diff  --git a/llvm/test/CodeGen/NVPTX/vote.ll b/llvm/test/CodeGen/NVPTX/vote.ll
index 49f84c9916ec8..9913cb009f05e 100644
--- a/llvm/test/CodeGen/NVPTX/vote.ll
+++ b/llvm/test/CodeGen/NVPTX/vote.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 declare i1 @llvm.nvvm.vote.all(i1)
 ; CHECK-LABEL: .func{{.*}}vote_all

diff  --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll
index a64f9f48b26f3..ad2e12dfaea63 100644
--- a/llvm/test/CodeGen/NVPTX/weak-global.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-global.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: .weak .global .align 4 .u32 g
 @g = common addrspace(1) global i32 zeroinitializer

diff  --git a/llvm/test/CodeGen/NVPTX/weak-linkage.ll b/llvm/test/CodeGen/NVPTX/weak-linkage.ll
index 5df57b29249ec..8c97e4b6a0483 100644
--- a/llvm/test/CodeGen/NVPTX/weak-linkage.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-linkage.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK: // .weak foo
 ; CHECK: .weak .func foo

diff  --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 3b3d10947cac9..4df0434b21b95 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -9,6 +9,10 @@
 # RUN:           --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
 # RUN:           | FileCheck %t-ptx60-sm_70.ll
+# RUN: %if ptxas %{                                                       \
+# RUN:   llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
+# RUN:           | %ptxas-verify -arch=sm_70                              \
+# RUN: %}
 
 # Check all variants of instructions supported by PTX61 on SM70
 # RUN: %python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
@@ -18,6 +22,10 @@
 # RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
 # RUN:           | FileCheck %t-ptx61-sm_70.ll
+# RUN: %if ptxas-9.1 %{                                                   \
+# RUN:   llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
+# RUN:           | %ptxas-verify -arch=sm_70                              \
+# RUN: %}
 
 # Check all variants of instructions supported by PTX63 on SM72
 # RUN: %python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
@@ -27,6 +35,10 @@
 # RUN:           --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
 # RUN:           | FileCheck %t-ptx63-sm_72.ll
+# RUN: %if ptxas-10.0 %{                                                  \
+# RUN:   llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
+# RUN:           | %ptxas-verify -arch=sm_72                              \
+# RUN: %}
 
 # Check all variants of instructions supported by PTX63 on SM75
 # RUN: %python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
@@ -36,6 +48,10 @@
 # RUN:           --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
 # RUN:           | FileCheck %t-ptx63-sm_75.ll
+# RUN: %if ptxas-10.0 %{                                                  \
+# RUN:   llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
+# RUN:           | %ptxas-verify -arch=sm_75                              \
+# RUN: %}
 
 # Check all variants of instructions supported by PTX64 on SM70+
 # RUN: %python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
@@ -45,6 +61,10 @@
 # RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX
 # RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
 # RUN:           | FileCheck %t-ptx64-sm_70.ll
+# RUN: %if ptxas-10.1 %{                                                  \
+# RUN:   llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
+# RUN:           | %ptxas-verify -arch=sm_70                              \
+# RUN: %}
 
 # Check all variants of instructions supported by PTX65 on SM75+
 # RUN: %python %s --ptx=65 --gpu-arch=75 > %t-ptx65-sm_75.ll
@@ -54,6 +74,10 @@
 # RUN:           --check-prefixes=INTRINSICS
 # RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
 # RUN:           | FileCheck %t-ptx65-sm_75.ll
+# RUN: %if ptxas-10.2 %{                                                  \
+# RUN:   llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
+# RUN:           | %ptxas-verify -arch=sm_75                              \
+# RUN: %}
 
 # Check all variants of instructions supported by PTX71 on SM80+
 # RUN: %python %s --ptx=71 --gpu-arch=80 > %t-ptx71-sm_80.ll
@@ -63,6 +87,10 @@
 # RUN:           --check-prefixes=INTRINSICS
 # RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
 # RUN:           | FileCheck %t-ptx71-sm_80.ll
+# RUN: %if ptxas-11.1 %{                                                  \
+# RUN:   llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
+# RUN:           | %ptxas-verify -arch=sm_80                              \
+# RUN: %}
 
 from __future__ import print_function
 

diff  --git a/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll b/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
index bcfd987b4a661..0d6c9fad0b01c 100644
--- a/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
+++ b/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %if !ptxas-11.0 %{-arch=sm_30%} %}
 
 ; The zeroext attribute below should be silently ignored because
 ; we can pass a 32-bit integer across a function call without

diff  --git a/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll b/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll
index 38ccea85fba6e..74cc4466c9d2f 100644
--- a/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll
+++ b/llvm/test/DebugInfo/NVPTX/crash-missing-DISubprogram.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda %if ptxas %{ | %ptxas-verify %}
+
 ; Don't crash for a function w/o debug info that contains an instruction w/
 ; debug info.
 ; Reported as #51079

diff  --git a/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll b/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll
index 03500938032f4..3f6e4232ed3e3 100644
--- a/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll
+++ b/llvm/test/DebugInfo/NVPTX/cu-range-hole.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; CHECK: .target sm_{{[0-9]+}}, debug
 

diff  --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
index 6bfc3d1e94c48..fdcc74b26847e 100644
--- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
+++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; CHECK: .target sm_20, debug
 

diff  --git a/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll b/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
index e3509bb22b637..d3dcd38579384 100644
--- a/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
+++ b/llvm/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
@@ -1,4 +1,6 @@
 ; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s | %ptxas-verify %}
+
 ; Generated with -O1 from:
 ; int f1();
 ; void f2(int*);

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
index 73ec34787ad64..842344fbe44bf 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s | %ptxas-verify %}
 
 @GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0
 @SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-empty.ll b/llvm/test/DebugInfo/NVPTX/debug-empty.ll
index cae258815e4e0..e34c61059c0d6 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-empty.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-empty.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; CHECK: .target sm_{{[0-9]+$}}
 ; CHECK: .section .debug_loc { }

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll b/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll
index 200fcf657dd7e..37fbc83bc44f2 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-file-loc-only.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; // Bitcode in this test case is reduced version of compiled code below:
 ;extern "C" {

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll b/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll
index 9ab007741f994..2a5a3bc2bb541 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-file-loc.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; // Bitcode int this test case is reduced version of compiled code below:
 ;extern "C" {

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll
index 2edd807a8d823..d118308fc05c3 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-info.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; // Bitcode int this test case is reduced version of compiled code below:
 ;__device__ inline void res(float x, float y, float *res) { *res = x + y; }

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll b/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll
index c932ea0af460c..f951c5d050ef1 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-loc-offset.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s
+; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda < %s | %ptxas-verify %}
 
 ; CHECK: .target sm_{{[0-9]+}}, debug
 

diff  --git a/llvm/test/DebugInfo/NVPTX/debug-name-table.ll b/llvm/test/DebugInfo/NVPTX/debug-name-table.ll
index c19ddaeeb8f73..dbad768738c55 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-name-table.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-name-table.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s | FileCheck %s
+; RUN: %if ptxas-11.5 %{ llc -mtriple=nvptx64-nvidia-cuda -mattr=+ptx75 < %s | %ptxas-verify %}
 
 ; DICompileUnit without 'nameTableKind: None' results in
 ; debug_pubnames and debug_pubtypes sections in DWARF. These sections

diff  --git a/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll b/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll
index 272a59e756f4f..471e4a6d383f9 100644
--- a/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll
+++ b/llvm/test/DebugInfo/NVPTX/dwarf-file-dir.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck --check-prefix=CHECK-NODIR %s
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -dwarf-directory=1 | FileCheck --check-prefix=CHECK-DIR %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; CHECK-NODIR: .file   {{[0-9]+}} "/tmp/dbginfo/a{{/|\\\\}}a.cpp"
 ;
@@ -15,7 +16,7 @@ entry:
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2, nameTableKind: None)
 !1 = !DIFile(filename: "a.cpp", directory: "/tmp/dbginfo/a")
 !2 = !{}
 !4 = distinct !DISubprogram(name: "func", linkageName: "_Z4funcv", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !1, type: !6, retainedNodes: !2)

diff  --git a/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll b/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
index 63516f5972720..e2097d7f49b48 100644
--- a/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
+++ b/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
 
 ; Produced at -O0 from:
 ; struct {

diff  --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 5c62c8ab49222..7227c92479342 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -192,6 +192,53 @@ def get_asan_rtlib():
     ToolSubst('OrcV2CBindingsVeryLazy', unresolved='ignore'),
     ToolSubst('dxil-dis', unresolved='ignore')])
 
+# Find (major, minor) version of ptxas
+def ptxas_version(ptxas):
+    ptxas_cmd = subprocess.Popen([ptxas, '--version'], stdout=subprocess.PIPE)
+    ptxas_out = ptxas_cmd.stdout.read().decode('ascii')
+    ptxas_cmd.wait()
+    match = re.search('release (\d+)\.(\d+)', ptxas_out)
+    if match:
+        return (int(match.group(1)), int(match.group(2)))
+    print('couldn\'t determine ptxas version')
+    return None
+
+def enable_ptxas(ptxas_executable):
+    version = ptxas_version(ptxas_executable)
+    if version:
+        # ptxas is supposed to be backward compatible with previous
+        # versions, so add a feature for every known version prior to
+        # the current one.
+        ptxas_known_versions = [
+            (9, 0), (9, 1), (9, 2),
+            (10, 0), (10, 1), (10, 2),
+            (11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6),
+        ]
+
+        # ignore ptxas if its version is below the minimum supported
+        # version
+        min_version = ptxas_known_versions[0]
+        if version[0] < min_version[0] or version[1] < min_version[1]:
+            print(
+                'Warning: ptxas version {}.{} is not supported'.format(
+                    version[0], version[1]))
+            return
+
+        for known_major, known_minor in ptxas_known_versions:
+            if known_major <= version[0] and known_minor <= version[1]:
+                config.available_features.add(
+                    'ptxas-{}.{}'.format(known_major, known_minor))
+
+    config.available_features.add('ptxas')
+    tools.extend([ToolSubst('%ptxas', ptxas_executable),
+                  ToolSubst('%ptxas-verify', '{} -c -o /dev/null -'.format(
+                      ptxas_executable))])
+
+ptxas_executable = \
+    os.environ.get('LLVM_PTXAS_EXECUTABLE', None) or config.ptxas_executable
+if ptxas_executable:
+    enable_ptxas(ptxas_executable)
+
 llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)
 
 # Targets

diff  --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index b198f3d6dc3b1..266f93232ed61 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -23,6 +23,7 @@ config.have_ocamlopt = @HAVE_OCAMLOPT@
 config.ocaml_flags = "@OCAMLFLAGS@"
 config.include_go_tests = @LLVM_INCLUDE_GO_TESTS@
 config.go_executable = "@GO_EXECUTABLE@"
+config.ptxas_executable = "@PXTAS_EXECUTABLE@"
 config.enable_shared = @ENABLE_SHARED@
 config.enable_assertions = @ENABLE_ASSERTIONS@
 config.targets_to_build = "@TARGETS_TO_BUILD@"