[llvm] e29ba97 - [NVPTX] Auto-generate tests for surface and texture instructions
Andrew Savonichev via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 7 04:28:50 PST 2021
Author: Andrew Savonichev
Date: 2021-12-07T15:27:51+03:00
New Revision: e29ba97d236ca8c1ccd67e801cdc762b2764ae3d
URL: https://github.com/llvm/llvm-project/commit/e29ba97d236ca8c1ccd67e801cdc762b2764ae3d
DIFF: https://github.com/llvm/llvm-project/commit/e29ba97d236ca8c1ccd67e801cdc762b2764ae3d.diff
LOG: [NVPTX] Auto-generate tests for surface and texture instructions
The patch adds lit tests for SULD, SUST, TEX and TLD4 instructions as
a follow-up to D112232. A number of FIXME marks highlight possible
bugs or missed instruction variants.
Differential Revision: https://reviews.llvm.org/D114367
Added:
llvm/test/CodeGen/NVPTX/surf-tex.py
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
new file mode 100644
index 0000000000000..7a251a24346a2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -0,0 +1,1025 @@
+# RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll
+# RUN: llc %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll
+
+# We only need to run this a second time for texture tests, because
+# there is a difference between unified and non-unified intrinsics.
+#
+# RUN: %python %s --target=nvcl --tests=suld,sust,tex,tld4 --gen-list-append --gen-list=%t.list > %t-nvcl.ll
+# RUN: llc %t-nvcl.ll -verify-machineinstrs -o - | FileCheck %t-nvcl.ll
+
+# Verify that all instructions and intrinsics defined in TableGen
+# files are tested. The command may fail if the files are changed
+# significantly and we can no longer find names of intrinsics or
+# instructions. In that case we can replace this command with a
+# reference list.
+#
+# Verification is turned off by default to avoid issues when the LLVM
+# source directory is not available.
+#
+# RUN-DISABLED: %python %s --verify --gen-list=%t.list --llvm-tablegen=%S/../../../include/llvm/IR/IntrinsicsNVVM.td --inst-tablegen=%S/../../../lib/Target/NVPTX/NVPTXIntrinsics.td
+
+from __future__ import print_function
+
+import argparse
+import re
+import string
+import textwrap
+from itertools import product
+
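+# Map a PTX geometry suffix (e.g. "a2d") to the spelling used in NVVM
+# intrinsic names (e.g. "2d.array").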
+def get_llvm_geom(geom_ptx):
+ geom = {
+ "1d" : "1d",
+ "2d" : "2d",
+ "3d" : "3d",
+ "a1d" : "1d.array",
+ "a2d" : "2d.array",
+ "cube" : "cube",
+ "acube" : "cube.array"
+ }
+ return geom[geom_ptx]
+
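+# FileCheck pattern for the PTX register class that holds a value of
+# the given type.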
+def get_ptx_reg(ty):
+ reg = {
+ "b8" : "%rs{{[0-9]+}}",
+ "b16" : "%rs{{[0-9]+}}",
+ "b32" : "%r{{[0-9]+}}",
+ "b64" : "%rd{{[0-9]+}}",
+ "f32" : "%f{{[0-9]+}}",
+ "u32" : "%r{{[0-9]+}}",
+ "s32" : "%r{{[0-9]+}}"
+ }
+ return reg[ty]
+
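+# FileCheck pattern for a scalar register or a v2/v4 register tuple of
+# the given type.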
+def get_ptx_vec_reg(vec, ty):
+ vec_reg = {
+ "" : "{{{reg}}}",
+ "v2" : "{{{reg}, {reg}}}",
+ "v4" : "{{{reg}, {reg}, {reg}, {reg}}}"
+ }
+ return vec_reg[vec].format(reg=get_ptx_reg(ty))
+
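+# Map a PTX type (e.g. "u32") to the corresponding LLVM IR type.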
+def get_llvm_type(ty):
+ if ty[0] in ("b", "s", "u"):
+ return "i" + ty[1:]
+ if ty == "f16":
+ return "half"
+ if ty == "f32":
+ return "float"
+ raise RuntimeError("invalid type: " + ty)
+
+def get_llvm_vec_type(vec, ty_ptx):
+ ty = get_llvm_type(ty_ptx)
+
+ # i8 is passed as i16, same as in PTX
+ if ty == "i8":
+ ty = "i16"
+
+ vec_ty = {
+ "" : "{ty}",
+ "v2" : "{{ {ty}, {ty} }}",
+ "v4" : "{{ {ty}, {ty}, {ty}, {ty} }}"
+ }
+ return vec_ty[vec].format(ty=ty)
+
+def get_llvm_value(vec, ty_ptx):
+ ty = get_llvm_type(ty_ptx)
+
+ # i8 is passed as i16, same as in PTX
+ if ty == "i8":
+ ty = "i16"
+
+ value = {
+ "" : "{ty} %v1",
+ "v2" : "{ty} %v1, {ty} %v2",
+ "v4" : "{ty} %v1, {ty} %v2, {ty} %v3, {ty} %v4"
+ }
+ return value[vec].format(ty=ty)
+
+def get_llvm_value_type(vec, ty_ptx):
+ ty = get_llvm_type(ty_ptx)
+
+ # i8 is passed as i16, same as in PTX
+ if ty == "i8":
+ ty = "i16"
+
+ value = {
+ "" : "{ty}",
+ "v2" : "{ty}, {ty}",
+ "v4" : "{ty}, {ty}, {ty}, {ty}"
+ }
+ return value[vec].format(ty=ty)
+
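+# Print the target triple for the requested environment (cuda or nvcl).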
+def gen_triple(target):
+ if target == "cuda":
+ print("target triple = \"nvptx64-unknown-cuda\"\n")
+ elif target == "nvcl":
+ print("target triple = \"nvptx64-unknown-nvcl\"\n")
+ else:
+ raise RuntimeError("invalid target: " + target)
+
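+# Emit the global surface and texture handles (plus a sampler handle for
+# non-unified targets) together with their CHECK lines, and return the
+# !nvvm.annotations entries that describe them.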
+def gen_globals(target, surf_name, tex_name, sampler_name):
+ print("declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)")
+ print("; CHECK: .global .surfref {}".format(surf_name))
+ print("; CHECK: .global .texref {}".format(tex_name))
+ print("@{} = internal addrspace(1) global i64 0, align 8".format(surf_name))
+ print("@{} = internal addrspace(1) global i64 1, align 8".format(tex_name))
+ generated_metadata = [
+ "!{{i64 addrspace(1)* @{}, !\"surface\", i32 1}}".format(surf_name),
+ "!{{i64 addrspace(1)* @{}, !\"texture\", i32 1}}".format(tex_name),
+ ]
+
+ if not is_unified(target):
+ print("; CHECK: .global .samplerref {}".format(sampler_name))
+ print("@{} = internal addrspace(1) global i64 1, align 8".format(
+ sampler_name))
+ generated_metadata.append(
+ "!{{i64 addrspace(1)* @{}, !\"sampler\", i32 1}}".format(sampler_name))
+
+ return generated_metadata
+
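+# Print the collected annotations as the !nvvm.annotations metadata list.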
+def gen_metadata(metadata):
+ md_values = ["!{}".format(i) for i in range(len(metadata))]
+ print("!nvvm.annotations = !{{{values}}}".format(values=(", ".join(md_values))))
+ for i, md in enumerate(metadata):
+ print("!{} = {}".format(i, md))
+
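+# LLVM IR arguments for the surface coordinates of the given geometry.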
+def get_llvm_surface_access(geom_ptx):
+ access = {
+ "1d" : "i32 %x",
+ "2d" : "i32 %x, i32 %y",
+ "3d" : "i32 %x, i32 %y, i32 %z",
+ "a1d" : "i32 %l, i32 %x",
+ "a2d" : "i32 %l, i32 %x, i32 %y",
+ }
+ return access[geom_ptx]
+
+def get_llvm_surface_access_type(geom_ptx):
+ access_ty = {
+ "1d" : "i32",
+ "2d" : "i32, i32",
+ "3d" : "i32, i32, i32",
+ "a1d" : "i32, i32",
+ "a2d" : "i32, i32, i32",
+ }
+ return access_ty[geom_ptx]
+
+def get_ptx_surface_access(geom_ptx):
+ """
+ Operand b is a scalar or singleton tuple for 1d surfaces; is a
+ two-element vector for 2d surfaces; and is a four-element vector
+ for 3d surfaces, where the fourth element is ignored. Coordinate
+ elements are of type .s32.
+
+ For 1d surface arrays, operand b has type .v2.b32. The first
+ element is interpreted as an unsigned integer index (.u32) into
+ the surface array, and the second element is interpreted as a 1d
+ surface coordinate of type .s32.
+
+ For 2d surface arrays, operand b has type .v4.b32. The first
+ element is interpreted as an unsigned integer index (.u32) into
+ the surface array, and the next two elements are interpreted as 2d
+ surface coordinates of type .s32. The fourth element is ignored.
+ """
+ access_reg = {
+ "1d" : "{%r{{[0-9]}}}",
+ "2d" : "{%r{{[0-9]}}, %r{{[0-9]}}}",
+ "3d" : "{%r{{[0-9]}}, %r{{[0-9]}}, %r{{[0-9]}}, %r{{[0-9]}}}",
+ "a1d" : "{%r{{[0-9]}}, %r{{[0-9]}}}",
+ "a2d" : "{%r{{[0-9]}}, %r{{[0-9]}}, %r{{[0-9]}}, %r{{[0-9]}}}",
+ }
+ return access_reg[geom_ptx]
+
+def get_ptx_surface(target):
+ # With the 'cuda' environment the surface handle is copied with
+ # ld.param, so the instruction uses a register. For 'nvcl' the
+ # instruction uses the parameter directly.
+ if target == "cuda":
+ return "%rd{{[0-9]+}}"
+ elif target == "nvcl":
+ return "test_{{.*}}_param_0"
+ raise RuntimeError("invalid target: " + target)
+
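+# Build the "kernel" annotation for a generated surface test function,
+# plus the "rdwrimage" annotation where applicable.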
+def get_surface_metadata(target, fun_ty, fun_name, has_surface_param):
+ metadata = []
+
+ md_kernel = "!{{{fun_ty} @{fun_name}, !\"kernel\", i32 1}}".format(
+ fun_ty=fun_ty, fun_name=fun_name)
+ metadata.append(md_kernel)
+
+ if target == "cuda":
+ # When a parameter is lowered as a .surfref, it still has the
+ # corresponding ld.param.u64, which is illegal. Do not emit the
+ # metadata to keep the parameter as .b64 instead.
+ has_surface_param = False
+
+ if has_surface_param:
+ md_surface = "!{{{fun_ty} @{fun_name}, !\"rdwrimage\", i32 0}}".format(
+ fun_ty=fun_ty, fun_name=fun_name)
+ metadata.append(md_surface)
+
+ return metadata
+
+def gen_suld_tests(target, global_surf):
+ """
+ PTX spec s9.7.10.1. Surface Instructions:
+
+ suld.b.geom{.cop}.vec.dtype.clamp d, [a, b]; // unformatted
+
+ .geom = { .1d, .2d, .3d, .a1d, .a2d };
+ .cop = { .ca, .cg, .cs, .cv }; // cache operation
+ .vec = { none, .v2, .v4 };
+ .dtype = { .b8 , .b16, .b32, .b64 };
+ .clamp = { .trap, .clamp, .zero };
+ """
+
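+ # For example, with geom=1d, vec="", dtype=b32, clamp=trap on the cuda
+ # target the template below expands (roughly) to:
+ #
+ #   declare i32 @llvm.nvvm.suld.1d.i32.trap(i64 %s, i32 %x);
+ #   ; CHECK: suld.b.1d.b32.trap {%r{{[0-9]+}}}, [%rd{{[0-9]+}}, {%r{{[0-9]}}}]
+ #   define void @test_suld_1db32trap_param(i64 %s, i32* %ret, i32 %x) { ... }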
+ template = """
+ declare ${retty} @${intrinsic}(i64 %s, ${access});
+
+ ; CHECK-LABEL: .entry ${test_name}_param
+ ; CHECK: ${instruction} ${reg_ret}, [${reg_surf}, ${reg_access}]
+ ;
+ define void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
+ %val = tail call ${retty} @${intrinsic}(i64 %s, ${access})
+ store ${retty} %val, ${retty}* %ret
+ ret void
+ }
+ ; CHECK-LABEL: .entry ${test_name}_global
+ ; CHECK: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}]
+ ;
+ define void @${test_name}_global(${retty}* %ret, ${access}) {
+ %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
+ %val = tail call ${retty} @${intrinsic}(i64 %gs, ${access})
+ store ${retty} %val, ${retty}* %ret
+ ret void
+ }
+ """
+
+ generated_items = []
+ generated_metadata = []
+ # FIXME: "cop" is missing
+ for geom, vec, dtype, clamp in product(
+ ["1d", "2d", "3d", "a1d", "a2d"],
+ ["", "v2", "v4"],
+ ["b8" , "b16", "b32", "b64"],
+ ["trap", "clamp", "zero"]):
+
+ if vec == "v4" and dtype == "b64":
+ continue
+
+ test_name = "test_suld_" + geom + vec + dtype + clamp
+
+ params = {
+ "test_name" : test_name,
+
+ "intrinsic" : "llvm.nvvm.suld.{geom}.{dtype}.{clamp}".format(
+ geom=get_llvm_geom(geom),
+ dtype=(vec + get_llvm_type(dtype)),
+ clamp=clamp),
+ "retty" : get_llvm_vec_type(vec, dtype),
+ "access" : get_llvm_surface_access(geom),
+ "global_surf" : global_surf,
+
+ "instruction" : "suld.b.{geom}{vec}.{dtype}.{clamp}".format(
+ geom=geom,
+ vec=("" if vec == "" else "." + vec),
+ dtype=dtype,
+ clamp=clamp),
+ "reg_ret" : get_ptx_vec_reg(vec, dtype),
+ "reg_surf" : get_ptx_surface(target),
+ "reg_access" : get_ptx_surface_access(geom),
+ }
+ gen_test(template, params)
+ generated_items.append((params["intrinsic"], params["instruction"]))
+
+ fun_name = test_name + "_param"
+ fun_ty = "void (i64, {retty}*, {access_ty})*".format(
+ retty=params["retty"],
+ access_ty=get_llvm_surface_access_type(geom))
+ generated_metadata += get_surface_metadata(
+ target, fun_ty, fun_name, has_surface_param=True)
+
+ fun_name = test_name + "_global"
+ fun_ty = "void ({retty}*, {access_ty})*".format(
+ retty=params["retty"],
+ access_ty=get_llvm_surface_access_type(geom))
+ generated_metadata += get_surface_metadata(
+ target, fun_ty, fun_name, has_surface_param=False)
+
+ return generated_items, generated_metadata
+
+def gen_sust_tests(target, global_surf):
+ """
+ PTX spec s9.7.10.2. Surface Instructions
+
+ sust.b.{1d,2d,3d}{.cop}.vec.ctype.clamp [a, b], c; // unformatted
+ sust.p.{1d,2d,3d}.vec.b32.clamp [a, b], c; // formatted
+
+ sust.b.{a1d,a2d}{.cop}.vec.ctype.clamp [a, b], c; // unformatted
+
+ .cop = { .wb, .cg, .cs, .wt }; // cache operation
+ .vec = { none, .v2, .v4 };
+ .ctype = { .b8 , .b16, .b32, .b64 };
+ .clamp = { .trap, .clamp, .zero };
+ """
+
+ template = """
+ declare void @${intrinsic}(i64 %s, ${access}, ${value});
+
+ ; CHECK-LABEL: .entry ${test_name}_param
+ ; CHECK: ${instruction} [${reg_surf}, ${reg_access}], ${reg_value}
+ ;
+ define void @${test_name}_param(i64 %s, ${value}, ${access}) {
+ tail call void @${intrinsic}(i64 %s, ${access}, ${value})
+ ret void
+ }
+ ; CHECK-LABEL: .entry ${test_name}_global
+ ; CHECK: ${instruction} [${global_surf}, ${reg_access}], ${reg_value}
+ ;
+ define void @${test_name}_global(${value}, ${access}) {
+ %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
+ tail call void @${intrinsic}(i64 %gs, ${access}, ${value})
+ ret void
+ }
+ """
+
+ generated_items = []
+ generated_metadata = []
+ # FIXME: "cop" is missing
+ for fmt, geom, vec, ctype, clamp in product(
+ ["b", "p"],
+ ["1d", "2d", "3d", "a1d", "a2d"],
+ ["", "v2", "v4"],
+ ["b8" , "b16", "b32", "b64"],
+ ["trap", "clamp", "zero"]):
+
+ if fmt == "p" and geom[0] == "a":
+ continue
+ if fmt == "p" and ctype != "b32":
+ continue
+ if vec == "v4" and ctype == "b64":
+ continue
+
+ # FIXME: these intrinsics are missing, but at least one of them is
+ # listed in the PTX spec: sust.p.{1d,2d,3d}.vec.b32.clamp
+ if fmt == "p" and clamp != "trap":
+ continue
+
+ test_name = "test_sust_" + fmt + geom + vec + ctype + clamp
+
+ params = {
+ "test_name" : test_name,
+
+ "intrinsic" : "llvm.nvvm.sust.{fmt}.{geom}.{ctype}.{clamp}".format(
+ fmt=fmt,
+ geom=get_llvm_geom(geom),
+ ctype=(vec + get_llvm_type(ctype)),
+ clamp=clamp),
+ "access" : get_llvm_surface_access(geom),
+ "value" : get_llvm_value(vec, ctype),
+ "global_surf" : global_surf,
+
+ "instruction" : "sust.{fmt}.{geom}{vec}.{ctype}.{clamp}".format(
+ fmt=fmt,
+ geom=geom,
+ vec=("" if vec == "" else "." + vec),
+ ctype=ctype,
+ clamp=clamp),
+ "reg_value" : get_ptx_vec_reg(vec, ctype),
+ "reg_surf" : get_ptx_surface(target),
+ "reg_access" : get_ptx_surface_access(geom)
+ }
+ gen_test(template, params)
+ generated_items.append((params["intrinsic"], params["instruction"]))
+
+ fun_name = test_name + "_param"
+ fun_ty = "void (i64, {value_ty}, {access_ty})*".format(
+ value_ty=get_llvm_value_type(vec, ctype),
+ access_ty=get_llvm_surface_access_type(geom))
+ generated_metadata += get_surface_metadata(
+ target, fun_ty, fun_name, has_surface_param=True)
+
+ fun_name = test_name + "_global"
+ fun_ty = "void ({value_ty}, {access_ty})*".format(
+ value_ty=get_llvm_value_type(vec, ctype),
+ access_ty=get_llvm_surface_access_type(geom))
+ generated_metadata += get_surface_metadata(
+ target, fun_ty, fun_name, has_surface_param=False)
+
+ return generated_items, generated_metadata
+
+def is_unified(target):
+ """
+ PTX has two modes of operation. In the unified mode, texture and
+ sampler information is accessed through a single .texref handle. In
+ the independent mode, texture and sampler information each have their
+ own handle, allowing them to be defined separately and combined at the
+ site of usage in the program.
+
+ """
+ return target == "cuda"
+
+def get_llvm_texture_access(geom_ptx, ctype, mipmap):
+ geom_access = {
+ "1d" : "{ctype} %x",
+ "2d" : "{ctype} %x, {ctype} %y",
+ "3d" : "{ctype} %x, {ctype} %y, {ctype} %z",
+ "cube" : "{ctype} %s, {ctype} %t, {ctype} %r",
+ "a1d" : "i32 %l, {ctype} %x",
+ "a2d" : "i32 %l, {ctype} %x, {ctype} %y",
+ "acube" : "i32 %l, {ctype} %s, {ctype} %t, {ctype} %r",
+ }
+
+ access = geom_access[geom_ptx]
+
+ if mipmap == "level":
+ access += ", {ctype} %lvl"
+ elif mipmap == "grad":
+ if geom_ptx in ("1d", "a1d"):
+ access += ", {ctype} %dpdx1, {ctype} %dpdy1"
+ elif geom_ptx in ("2d", "a2d"):
+ access += (", {ctype} %dpdx1, {ctype} %dpdx2" +
+ ", {ctype} %dpdy1, {ctype} %dpdy2")
+ else:
+ access += (", {ctype} %dpdx1, {ctype} %dpdx2, {ctype} %dpdx3" +
+ ", {ctype} %dpdy1, {ctype} %dpdy2, {ctype} %dpdy3")
+
+ return access.format(ctype=get_llvm_type(ctype))
+
+def get_llvm_texture_access_type(geom_ptx, ctype, mipmap):
+ geom_access = {
+ "1d" : "{ctype}",
+ "2d" : "{ctype}, {ctype}",
+ "3d" : "{ctype}, {ctype}, {ctype}",
+ "cube" : "{ctype}, {ctype}, {ctype}",
+ "a1d" : "i32, {ctype}",
+ "a2d" : "i32, {ctype}, {ctype}",
+ "acube" : "i32, {ctype}, {ctype}, {ctype}",
+ }
+
+ access = geom_access[geom_ptx]
+
+ if mipmap == "level":
+ access += ", {ctype}"
+ elif mipmap == "grad":
+ if geom_ptx in ("1d", "a1d"):
+ access += ", {ctype}, {ctype}"
+ elif geom_ptx in ("2d", "a2d"):
+ access += (", {ctype}, {ctype}, {ctype}, {ctype}")
+ else:
+ access += (", {ctype}, {ctype}, {ctype}" +
+ ", {ctype}, {ctype}, {ctype}")
+
+ return access.format(ctype=get_llvm_type(ctype))
+
+def get_ptx_texture_access(geom_ptx, ctype):
+ access_reg = {
+ "1d" : "{{{ctype_reg}}}",
+ "2d" : "{{{ctype_reg}, {ctype_reg}}}",
+ "3d" : "{{{ctype_reg}, {ctype_reg}, {ctype_reg}, {ctype_reg}}}",
+ "a1d" : "{{{b32_reg}, {ctype_reg}}}",
+ "a2d" : "{{{b32_reg}, {ctype_reg}, {ctype_reg}, {ctype_reg}}}",
+ "cube" : "{{{f32_reg}, {f32_reg}, {f32_reg}, {f32_reg}}}",
+ "acube" : "{{{b32_reg}, {f32_reg}, {f32_reg}, {f32_reg}}}",
+ }
+ return access_reg[geom_ptx].format(ctype_reg=get_ptx_reg(ctype),
+ b32_reg=get_ptx_reg("b32"),
+ f32_reg=get_ptx_reg("f32"))
+
+def get_ptx_texture(target):
+ # With the 'cuda' environment the texture and sampler handles are
+ # copied with ld.param, so the instruction uses registers. For 'nvcl'
+ # the instruction uses the texture/sampler parameters directly.
+ if target == "cuda":
+ return "%rd{{[0-9]+}}"
+ elif target == "nvcl":
+ return "test_{{.*}}_param_0, test_{{.*}}_param_1"
+ raise RuntimeError("unknown target: " + target)
+
+def get_llvm_global_sampler(target, global_sampler):
+ if is_unified(target):
+ return "", ""
+ else:
+ sampler_handle = "i64 %gs,"
+ get_sampler_handle = (
+ "%gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64" +
+ "(i64 addrspace(1)* @{})".format(global_sampler))
+ return sampler_handle, get_sampler_handle
+
+def get_ptx_global_sampler(target, global_sampler):
+ if is_unified(target):
+ return ""
+ else:
+ return global_sampler + ","
+
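+# Build the "kernel" annotation for a generated texture test function,
+# plus the "rdoimage" and "sampler" annotations where applicable.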
+def get_texture_metadata(target, fun_ty, fun_name, has_texture_params):
+ metadata = []
+
+ md_kernel = "!{{{fun_ty} @{fun_name}, !\"kernel\", i32 1}}".format(
+ fun_ty=fun_ty, fun_name=fun_name)
+ metadata.append(md_kernel)
+
+ if target == "cuda":
+ # When a parameter is lowered as a .texref, it still has the
+ # corresponding ld.param.u64, which is illegal. Do not emit the
+ # metadata to keep the parameter as .b64 instead.
+ has_texture_params = False
+
+ if has_texture_params:
+ md_texture = "!{{{fun_ty} @{fun_name}, !\"rdoimage\", i32 0}}".format(
+ fun_ty=fun_ty, fun_name=fun_name)
+ metadata.append(md_texture)
+
+ if not is_unified(target):
+ md_sampler = "!{{{fun_ty} @{fun_name}, !\"sampler\", i32 1}}".format(
+ fun_ty=fun_ty, fun_name=fun_name)
+ metadata.append(md_sampler)
+
+ return metadata
+
+def gen_tex_tests(target, global_tex, global_sampler):
+ """
+ PTX spec s9.7.9.3. Texture Instructions
+
+ tex.geom.v4.dtype.ctype d, [a, c] {, e} {, f};
+ tex.geom.v4.dtype.ctype d[|p], [a, b, c] {, e} {, f}; // explicit sampler
+
+ tex.geom.v2.f16x2.ctype d[|p], [a, c] {, e} {, f};
+ tex.geom.v2.f16x2.ctype d[|p], [a, b, c] {, e} {, f}; // explicit sampler
+
+ // mipmaps
+ tex.base.geom.v4.dtype.ctype d[|p], [a, {b,} c] {, e} {, f};
+ tex.level.geom.v4.dtype.ctype d[|p], [a, {b,} c], lod {, e} {, f};
+ tex.grad.geom.v4.dtype.ctype d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};
+
+ tex.base.geom.v2.f16x2.ctype d[|p], [a, {b,} c] {, e} {, f};
+ tex.level.geom.v2.f16x2.ctype d[|p], [a, {b,} c], lod {, e} {, f};
+ tex.grad.geom.v2.f16x2.ctype d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};
+
+ .geom = { .1d, .2d, .3d, .a1d, .a2d, .cube, .acube, .2dms, .a2dms };
+ .dtype = { .u32, .s32, .f16, .f32 };
+ .ctype = { .s32, .f32 }; // .cube, .acube require .f32
+ // .2dms, .a2dms require .s32
+ """
+
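+ # For example, with geom=2d, vec=v4, dtype=f32, ctype=f32 on the cuda
+ # (unified) target the template below expands (roughly) to:
+ #
+ #   declare { float, float, float, float } @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %tex, float %x, float %y)
+ #   ; CHECK: tex.2d.v4.f32.f32 {%f{{[0-9]+}}, ...}, [%rd{{[0-9]+}}, {%f{{[0-9]+}}, %f{{[0-9]+}}}]
+ #   define void @test_tex_2dv4f32f32_param(i64 %tex, { float, float, float, float }* %ret, float %x, float %y) { ... }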
+ template = """
+ declare ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
+
+ ; CHECK-LABEL: .entry ${test_name}_param
+ ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
+ define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+ %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
+ store ${retty} %val, ${retty}* %ret
+ ret void
+ }
+ ; CHECK-LABEL: .entry ${test_name}_global
+ ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
+ define void @${test_name}_global(${retty}* %ret, ${access}) {
+ %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
+ ${get_sampler_handle}
+ %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
+ store ${retty} %val, ${retty}* %ret
+ ret void
+ }
+ """
+
+ generated_items = []
+ generated_metadata = []
+ for mipmap, geom, vec, dtype, ctype in product(
+ ["", "level", "grad"],
+ ["1d", "2d", "3d", "a1d", "a2d", "cube", "acube", "2dms", "a2dms"],
+ ["v2", "v4"],
+ ["u32", "s32", "f16", "f32"],
+ ["s32", "f32"]):
+
+ # FIXME: missing intrinsics.
+ # Multi-sample textures and multi-sample texture arrays
+ # introduced in PTX ISA version 3.2.
+ if geom in ("2dms", "a2dms"):
+ continue
+
+ # FIXME: missing intrinsics? no such restriction in the PTX spec
+ if ctype == "s32" and mipmap != "":
+ continue
+
+ # FIXME: missing intrinsics?
+ if ctype == "s32" and geom in ("cube", "acube"):
+ continue
+
+ # FIXME: missing intrinsics.
+ # Support for textures returning f16 and f16x2 data introduced in
+ # PTX ISA version 4.2.
+ if vec == "v2" or dtype == "f16":
+ continue
+
+ # FIXME: missing intrinsics.
+ # Support for tex.grad.{cube, acube} introduced in PTX ISA version
+ # 4.3.
+ if mipmap == "grad" and geom in ("cube", "acube"):
+ continue
+
+ # The instruction returns a two-element vector for destination
+ # type f16x2. For all other destination types, the instruction
+ # returns a four-element vector. Coordinates may be given in
+ # either signed 32-bit integer or 32-bit floating point form.
+ if vec == "v2" and dtype != "f16":
+ continue
+
+ sampler_handle, get_sampler_handle = get_llvm_global_sampler(
+ target, global_sampler)
+
+ test_name = "test_tex_" + "".join((mipmap, geom, vec, dtype, ctype))
+ params = {
+ "test_name" : test_name,
+ "intrinsic" :
+ "llvm.nvvm.tex{unified}.{geom}{mipmap}.{vec}{dtype}.{ctype}".format(
+ unified=(".unified" if is_unified(target) else ""),
+ geom=get_llvm_geom(geom),
+ mipmap=("" if mipmap == "" else "." + mipmap),
+ vec=vec,
+ dtype=dtype,
+ ctype=ctype),
+ "global_tex": global_tex,
+ "retty" : get_llvm_vec_type(vec, dtype),
+ "sampler" : sampler_handle,
+ "access" : get_llvm_texture_access(geom, ctype, mipmap),
+ "get_sampler_handle" : get_sampler_handle,
+
+ "instruction" : "tex{mipmap}.{geom}.{vec}.{dtype}.{ctype}".format(
+ mipmap=("" if mipmap == "" else "." + mipmap),
+ geom=geom,
+ vec=vec,
+ dtype=dtype,
+ ctype=ctype),
+ "ptx_ret" : get_ptx_vec_reg(vec, dtype),
+ "ptx_tex" : get_ptx_texture(target),
+ "ptx_access" : get_ptx_texture_access(geom, ctype),
+ "ptx_global_sampler" : get_ptx_global_sampler(target, global_sampler),
+ }
+ gen_test(template, params)
+ generated_items.append((params["intrinsic"], params["instruction"]))
+
+ fun_name = test_name + "_param"
+ fun_ty = "void (i64, {sampler} {retty}*, {access_ty})*".format(
+ sampler=("" if is_unified(target) else "i64,"),
+ retty=params["retty"],
+ access_ty=get_llvm_texture_access_type(geom, ctype, mipmap))
+ generated_metadata += get_texture_metadata(
+ target, fun_ty, fun_name, has_texture_params=True)
+
+ fun_name = test_name + "_global"
+ fun_ty = "void ({retty}*, {access_ty})*".format(
+ retty=params["retty"],
+ access_ty=get_llvm_texture_access_type(geom, ctype, mipmap))
+ generated_metadata += get_texture_metadata(
+ target, fun_ty, fun_name, has_texture_params=False)
+
+ return generated_items, generated_metadata
+
+def get_llvm_tld4_access(geom):
+ """
+ For 2D textures, operand c specifies coordinates as a two-element,
+ 32-bit floating-point vector.
+
+ For 2d texture arrays operand c is a four element, 32-bit
+ vector. The first element in operand c is interpreted as an unsigned
+ integer index (.u32) into the texture array, and the next two
+ elements are interpreted as 32-bit floating point coordinates of 2d
+ texture. The fourth element is ignored.
+
+ For cubemap textures, operand c specifies four-element vector which
+ comprises three floating-point coordinates (s, t, r) and a fourth
+ padding argument which is ignored.
+
+ [For cube arrays] The first element in operand c is interpreted as
+ an unsigned integer index (.u32) into the cubemap texture array, and
+ the remaining three elements are interpreted as floating-point
+ cubemap coordinates (s, t, r), used to lookup in the selected
+ cubemap.
+ """
+ geom_to_access = {
+ "2d" : "float %x, float %y",
+ "a2d" : "i32 %l, float %x, float %y",
+ "cube" : "float %s, float %t, float %r",
+ "acube" : "i32 %l, float %s, float %t, float %r"
+ }
+ return geom_to_access[geom]
+
+def get_llvm_tld4_access_type(geom):
+ geom_to_access = {
+ "2d" : "float, float",
+ "a2d" : "i32, float, float",
+ "cube" : "float, float, float",
+ "acube" : "i32, float, float, float"
+ }
+ return geom_to_access[geom]
+
+def get_ptx_tld4_access(geom):
+ geom_to_access = {
+ "2d" : "{%f{{[0-9]+}}, %f{{[0-9]+}}}",
+ "a2d" : "{%r{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}",
+ "cube" : "{%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}",
+ "acube" : "{%r{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}"
+ }
+ return geom_to_access[geom]
+
+def gen_tld4_tests(target, global_tex, global_sampler):
+ """
+ PTX spec s9.7.9.4. Texture Instructions: tld4
+ Perform a texture fetch of the 4-texel bilerp footprint.
+
+ tld4.comp.2d.v4.dtype.f32 d[|p], [a, c] {, e} {, f};
+ tld4.comp.geom.v4.dtype.f32 d[|p], [a, b, c] {, e} {, f}; // explicit sampler
+
+ .comp = { .r, .g, .b, .a };
+ .geom = { .2d, .a2d, .cube, .acube };
+ .dtype = { .u32, .s32, .f32 };
+ """
+
+ template = """
+ declare ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
+
+ ; CHECK-LABEL: .entry ${test_name}_param
+ ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
+ define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+ %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
+ store ${retty} %val, ${retty}* %ret
+ ret void
+ }
+ ; CHECK-LABEL: .entry ${test_name}_global
+ ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
+ define void @${test_name}_global(${retty}* %ret, ${access}) {
+ %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
+ ${get_sampler_handle}
+ %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
+ store ${retty} %val, ${retty}* %ret
+ ret void
+ }
+ """
+
+ generated_items = []
+ generated_metadata = []
+ for comp, geom, dtype in product(
+ ["r", "g", "b", "a"],
+ ["2d", "a2d", "cube", "acube"],
+ ["u32", "s32", "f32"]):
+
+ # FIXME: missing intrinsics.
+ # tld4.{a2d,cube,acube} introduced in PTX ISA version 4.3.
+ if geom in ("a2d", "cube", "acube"):
+ continue
+
+ sampler_handle, get_sampler_handle = get_llvm_global_sampler(
+ target, global_sampler)
+
+ test_name = "test_tld4_" + "".join((comp, geom, dtype))
+ params = {
+ "test_name" : test_name,
+ "intrinsic" :
+ "llvm.nvvm.tld4{unified}.{comp}.{geom}.v4{dtype}.f32".format(
+ unified=(".unified" if is_unified(target) else ""),
+ comp=comp,
+ geom=get_llvm_geom(geom),
+ dtype=dtype),
+ "global_tex" : global_tex,
+ "retty" : get_llvm_vec_type("v4", dtype),
+ "sampler" : sampler_handle,
+ "access" : get_llvm_tld4_access(geom),
+ "get_sampler_handle" : get_sampler_handle,
+
+ "instruction" : "tld4.{comp}.{geom}.v4.{dtype}.f32".format(
+ comp=comp, geom=geom, dtype=dtype),
+ "ptx_ret" : get_ptx_vec_reg("v4", dtype),
+ "ptx_tex" : get_ptx_texture(target),
+ "ptx_access" : get_ptx_tld4_access(geom),
+ "ptx_global_sampler" : get_ptx_global_sampler(target, global_sampler),
+ }
+ gen_test(template, params)
+ generated_items.append((params["intrinsic"], params["instruction"]))
+
+ fun_name = test_name + "_param"
+ fun_ty = "void (i64, {sampler} {retty}*, {access_ty})*".format(
+ sampler=("" if is_unified(target) else "i64,"),
+ retty=params["retty"],
+ access_ty=get_llvm_tld4_access_type(geom))
+ generated_metadata += get_texture_metadata(
+ target, fun_ty, fun_name, has_texture_params=True)
+
+ fun_name = test_name + "_global"
+ fun_ty = "void ({retty}*, {access_ty})*".format(
+ retty=params["retty"],
+ access_ty=get_llvm_tld4_access_type(geom))
+ generated_metadata += get_texture_metadata(
+ target, fun_ty, fun_name, has_texture_params=False)
+
+ return generated_items, generated_metadata
+
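+# Instantiate a test template with the given parameters and print it; in
+# debug mode the parameters themselves are printed first as comments.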
+def gen_test(template, params):
+ if debug:
+ print()
+ for param, value in params.items():
+ print(";; {}: {}".format(param, value))
+
+ print(string.Template(textwrap.dedent(template)).substitute(params))
+
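+# Generate the requested test groups for the target and return the
+# (intrinsic, instruction) pairs they cover.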
+def gen_tests(target, tests):
+ gen_triple(target)
+
+ items = []
+ metadata = []
+
+ global_surf = "gsurf"
+ global_tex = "gtex"
+ global_sampler = "gsam"
+ metadata += gen_globals(target, global_surf, global_tex, global_sampler)
+
+ if "suld" in tests:
+ suld_items, suld_md = gen_suld_tests(target, global_surf)
+ items += suld_items
+ metadata += suld_md
+ if "sust" in tests:
+ sust_items, sust_md = gen_sust_tests(target, global_surf)
+ items += sust_items
+ metadata += sust_md
+ if "tex" in tests:
+ tex_items, tex_md = gen_tex_tests(target, global_tex, global_sampler)
+ items += tex_items
+ metadata += tex_md
+ if "tld4" in tests:
+ tld4_items, tld4_md = gen_tld4_tests(target, global_tex, global_sampler)
+ items += tld4_items
+ metadata += tld4_md
+
+ gen_metadata(metadata)
+ return items
+
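+# Record the (intrinsic, instruction) pairs covered by the generated tests
+# so that a later --verify run can compare them against TableGen.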
+def write_gen_list(filename, append, items):
+ with open(filename, ("a" if append else "w")) as f:
+ for intrinsic, instruction in items:
+ f.write("{} {}\n".format(intrinsic, instruction))
+
+def read_gen_list(filename):
+ intrinsics = set()
+ instructions = set()
+ with open(filename) as f:
+ for line in f:
+ intrinsic, instruction = line.split()
+ intrinsics.add(intrinsic)
+ instructions.add(instruction)
+ return (intrinsics, instructions)
+
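+# Collect all names captured by the regex from a TableGen file.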
+def read_td_list(filename, regex):
+ td_list = set()
+ with open(filename) as f:
+ for line in f:
+ match = re.search(regex, line)
+ if match:
+ td_list.add(match.group(1))
+
+ # Arbitrary value - we should find quite a lot of instructions
+ if len(td_list) < 30:
+ raise RuntimeError("found only {} instructions in {}".format(
+ len(td_list), filename))
+
+ return td_list
+
+def verify_inst_tablegen(path_td, gen_instr):
+ """
+ Verify that all instructions defined in NVPTXIntrinsics.td are
+ tested.
+ """
+
+ td_instr = read_td_list(path_td, "\"((suld|sust|tex|tld4)\\..*)\"")
+
+ gen_instr.update({
+ # FIXME: spec does not list any sust.p variants other than b32
+ "sust.p.1d.b8.trap",
+ "sust.p.1d.b16.trap",
+ "sust.p.1d.v2.b8.trap",
+ "sust.p.1d.v2.b16.trap",
+ "sust.p.1d.v4.b8.trap",
+ "sust.p.1d.v4.b16.trap",
+ "sust.p.a1d.b8.trap",
+ "sust.p.a1d.b16.trap",
+ "sust.p.a1d.v2.b8.trap",
+ "sust.p.a1d.v2.b16.trap",
+ "sust.p.a1d.v4.b8.trap",
+ "sust.p.a1d.v4.b16.trap",
+ "sust.p.2d.b8.trap",
+ "sust.p.2d.b16.trap",
+ "sust.p.2d.v2.b8.trap",
+ "sust.p.2d.v2.b16.trap",
+ "sust.p.2d.v4.b8.trap",
+ "sust.p.2d.v4.b16.trap",
+ "sust.p.a2d.b8.trap",
+ "sust.p.a2d.b16.trap",
+ "sust.p.a2d.v2.b8.trap",
+ "sust.p.a2d.v2.b16.trap",
+ "sust.p.a2d.v4.b8.trap",
+ "sust.p.a2d.v4.b16.trap",
+ "sust.p.3d.b8.trap",
+ "sust.p.3d.b16.trap",
+ "sust.p.3d.v2.b8.trap",
+ "sust.p.3d.v2.b16.trap",
+ "sust.p.3d.v4.b8.trap",
+ "sust.p.3d.v4.b16.trap",
+
+ # FIXME: sust.p is also not supported for arrays
+ "sust.p.a1d.b32.trap",
+ "sust.p.a1d.v2.b32.trap",
+ "sust.p.a1d.v4.b32.trap",
+ "sust.p.a2d.b32.trap",
+ "sust.p.a2d.v2.b32.trap",
+ "sust.p.a2d.v4.b32.trap",
+ })
+
+ td_instr = list(td_instr)
+ td_instr.sort()
+ gen_instr = list(gen_instr)
+ gen_instr.sort()
+ for i, td in enumerate(td_instr):
+ if i == len(gen_instr) or td != gen_instr[i]:
+ raise RuntimeError(
+ "{} is present in tablegen, but not tested.\n".format(td))
+
+def verify_llvm_tablegen(path_td, gen_intr):
+ """
+ Verify that all intrinsics defined in IntrinsicsNVVM.td are
+ tested.
+ """
+
+ td_intr = read_td_list(
+ path_td, "\"(llvm\\.nvvm\\.(suld|sust|tex|tld4)\\..*)\"")
+
+ gen_intr.update({
+ # FIXME: spec does not list any sust.p variants other than b32
+ "llvm.nvvm.sust.p.1d.i8.trap",
+ "llvm.nvvm.sust.p.1d.i16.trap",
+ "llvm.nvvm.sust.p.1d.v2i8.trap",
+ "llvm.nvvm.sust.p.1d.v2i16.trap",
+ "llvm.nvvm.sust.p.1d.v4i8.trap",
+ "llvm.nvvm.sust.p.1d.v4i16.trap",
+ "llvm.nvvm.sust.p.1d.array.i8.trap",
+ "llvm.nvvm.sust.p.1d.array.i16.trap",
+ "llvm.nvvm.sust.p.1d.array.v2i8.trap",
+ "llvm.nvvm.sust.p.1d.array.v2i16.trap",
+ "llvm.nvvm.sust.p.1d.array.v4i8.trap",
+ "llvm.nvvm.sust.p.1d.array.v4i16.trap",
+ "llvm.nvvm.sust.p.2d.i8.trap",
+ "llvm.nvvm.sust.p.2d.i16.trap",
+ "llvm.nvvm.sust.p.2d.v2i8.trap",
+ "llvm.nvvm.sust.p.2d.v2i16.trap",
+ "llvm.nvvm.sust.p.2d.v4i8.trap",
+ "llvm.nvvm.sust.p.2d.v4i16.trap",
+ "llvm.nvvm.sust.p.2d.array.i8.trap",
+ "llvm.nvvm.sust.p.2d.array.i16.trap",
+ "llvm.nvvm.sust.p.2d.array.v2i8.trap",
+ "llvm.nvvm.sust.p.2d.array.v2i16.trap",
+ "llvm.nvvm.sust.p.2d.array.v4i8.trap",
+ "llvm.nvvm.sust.p.2d.array.v4i16.trap",
+ "llvm.nvvm.sust.p.3d.i8.trap",
+ "llvm.nvvm.sust.p.3d.i16.trap",
+ "llvm.nvvm.sust.p.3d.v2i8.trap",
+ "llvm.nvvm.sust.p.3d.v2i16.trap",
+ "llvm.nvvm.sust.p.3d.v4i8.trap",
+ "llvm.nvvm.sust.p.3d.v4i16.trap",
+
+ # FIXME: sust.p is also not supported for arrays
+ "llvm.nvvm.sust.p.1d.array.i32.trap",
+ "llvm.nvvm.sust.p.1d.array.v2i32.trap",
+ "llvm.nvvm.sust.p.1d.array.v4i32.trap",
+ "llvm.nvvm.sust.p.2d.array.i32.trap",
+ "llvm.nvvm.sust.p.2d.array.v2i32.trap",
+ "llvm.nvvm.sust.p.2d.array.v4i32.trap"
+ })
+
+ td_intr = list(td_intr)
+ td_intr.sort()
+ gen_intr = list(gen_intr)
+ gen_intr.sort()
+ for i, td in enumerate(td_intr):
+ if i == len(gen_intr) or td != gen_intr[i]:
+ raise RuntimeError(
+ "{} is present in tablegen, but not tested.\n".format(td))
+
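+# Command-line entry point: by default generate tests to stdout; with
+# --verify, compare a previously generated list against the TableGen files.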
+parser = argparse.ArgumentParser()
+parser.add_argument("--debug", action="store_true")
+parser.add_argument("--tests", type=str)
+parser.add_argument("--target", type=str)
+parser.add_argument("--gen-list", dest="gen_list", type=str)
+parser.add_argument("--gen-list-append", dest="gen_list_append",
+ action="store_true")
+parser.add_argument("--verify", action="store_true")
+parser.add_argument("--llvm-tablegen", dest="llvm_td", type=str)
+parser.add_argument("--inst-tablegen", dest="inst_td", type=str)
+
+args = parser.parse_args()
+debug = args.debug
+
+if args.verify:
+ intrinsics, instructions = read_gen_list(args.gen_list)
+ verify_inst_tablegen(args.inst_td, instructions)
+ verify_llvm_tablegen(args.llvm_td, intrinsics)
+else:
+ items = gen_tests(args.target, args.tests.split(","))
+ if args.gen_list:
+ write_gen_list(args.gen_list, args.gen_list_append, items)