[clang] [AMDGPU][NFC] Change AMDGPU builtins to use ExtVector (PR #176033)

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Wed Jan 14 12:53:15 PST 2026


https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/176033

Summary:
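The AMDGPU builtin prototypes currently use GNU vector types (`_Vector`)
rather than OpenCL extended vector types (`_ExtVector`), which is strange.
This patch switches them to `_ExtVector`.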


>From e431aa2d82c9fc85166dd4af37d5dac5fc8b1b42 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 14 Jan 2026 14:48:03 -0600
Subject: [PATCH] [AMDGPU][NFC] Change AMDGPU builtins to use ExtVector

Summary:
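The AMDGPU builtin prototypes currently use GNU vector types (`_Vector`)
rather than OpenCL extended vector types (`_ExtVector`), which is
strange. This patch switches them to `_ExtVector`.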
---
 clang/include/clang/Basic/BuiltinsAMDGPU.td | 940 ++++++++++----------
 1 file changed, 470 insertions(+), 470 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index b7d1236549eee..12ffad305e7c0 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -163,26 +163,26 @@ def __builtin_amdgcn_sad_hi_u8 : AMDGPUBuiltin<"unsigned int(unsigned int, unsig
 def __builtin_amdgcn_sad_u16 : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const], "sad-insts">;
 def __builtin_amdgcn_qsad_pk_u16_u8 : AMDGPUBuiltin<"uint64_t(uint64_t, unsigned int, uint64_t)", [Const], "qsad-insts">;
 def __builtin_amdgcn_mqsad_pk_u16_u8 : AMDGPUBuiltin<"uint64_t(uint64_t, unsigned int, uint64_t)", [Const]>;
-def __builtin_amdgcn_mqsad_u32_u8 : AMDGPUBuiltin<"_Vector<4, unsigned int>(uint64_t, unsigned int, _Vector<4, unsigned int>)", [Const]>;
+def __builtin_amdgcn_mqsad_u32_u8 : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(uint64_t, unsigned int, _ExtVector<4, unsigned int>)", [Const]>;
 
 def __builtin_amdgcn_make_buffer_rsrc : AMDGPUBuiltin<"__amdgpu_buffer_rsrc_t(void *, short, int64_t, int)", [Const]>;
 def __builtin_amdgcn_raw_buffer_store_b8 : AMDGPUBuiltin<"void(unsigned char, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 def __builtin_amdgcn_raw_buffer_store_b16 : AMDGPUBuiltin<"void(unsigned short, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 def __builtin_amdgcn_raw_buffer_store_b32 : AMDGPUBuiltin<"void(unsigned int, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
-def __builtin_amdgcn_raw_buffer_store_b64 : AMDGPUBuiltin<"void(_Vector<2, unsigned int>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
-def __builtin_amdgcn_raw_buffer_store_b96 : AMDGPUBuiltin<"void(_Vector<3, unsigned int>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
-def __builtin_amdgcn_raw_buffer_store_b128 : AMDGPUBuiltin<"void(_Vector<4, unsigned int>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_store_b64 : AMDGPUBuiltin<"void(_ExtVector<2, unsigned int>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_store_b96 : AMDGPUBuiltin<"void(_ExtVector<3, unsigned int>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_store_b128 : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 def __builtin_amdgcn_raw_buffer_load_b8 : AMDGPUBuiltin<"unsigned char(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 def __builtin_amdgcn_raw_buffer_load_b16 : AMDGPUBuiltin<"unsigned short(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 def __builtin_amdgcn_raw_buffer_load_b32 : AMDGPUBuiltin<"unsigned int(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
-def __builtin_amdgcn_raw_buffer_load_b64 : AMDGPUBuiltin<"_Vector<2, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
-def __builtin_amdgcn_raw_buffer_load_b96 : AMDGPUBuiltin<"_Vector<3, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
-def __builtin_amdgcn_raw_buffer_load_b128 : AMDGPUBuiltin<"_Vector<4, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_load_b64 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_load_b96 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
+def __builtin_amdgcn_raw_buffer_load_b128 : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(__amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 
 def __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32 : AMDGPUBuiltin<"int(int, __amdgpu_buffer_rsrc_t, int, int, _Constant int)">;
 
 def __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32 : AMDGPUBuiltin<"float(float, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-fadd-rtn-insts">;
-def __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-buffer-global-pk-add-f16-insts">;
+def __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16>, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-buffer-global-pk-add-f16-insts">;
 
 def __builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32 : AMDGPUBuiltin<"float(float, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-fmin-fmax-global-f32">;
 def __builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32 : AMDGPUBuiltin<"float(float, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-fmin-fmax-global-f32">;
@@ -270,7 +270,7 @@ def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [C
 
 def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
 def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">;
-def __builtin_amdgcn_global_atomic_fadd_v2f16 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16 address_space<1> *>, _Vector<2, _Float16>)", [CustomTypeChecking], "atomic-buffer-global-pk-add-f16-insts">;
+def __builtin_amdgcn_global_atomic_fadd_v2f16 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16 address_space<1> *>, _ExtVector<2, _Float16>)", [CustomTypeChecking], "atomic-buffer-global-pk-add-f16-insts">;
 def __builtin_amdgcn_global_atomic_fmin_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
 def __builtin_amdgcn_global_atomic_fmax_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">;
 
@@ -282,11 +282,11 @@ def __builtin_amdgcn_ds_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_s
 def __builtin_amdgcn_ds_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<3> *, float)", [], "gfx8-insts">;
 
 def __builtin_amdgcn_flat_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<0> *, float)", [], "gfx940-insts">;
-def __builtin_amdgcn_flat_atomic_fadd_v2f16 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16 address_space<0> *>, _Vector<2, _Float16>)", [CustomTypeChecking], "atomic-flat-pk-add-16-insts">;
-def __builtin_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short address_space<0> *>, _Vector<2, short>)", [CustomTypeChecking], "atomic-flat-pk-add-16-insts">;
-def __builtin_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short address_space<1> *>, _Vector<2, short>)", [CustomTypeChecking], "atomic-global-pk-add-bf16-inst">;
-def __builtin_amdgcn_ds_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short address_space<3> *>, _Vector<2, short>)", [CustomTypeChecking], "atomic-ds-pk-add-16-insts">;
-def __builtin_amdgcn_ds_atomic_fadd_v2f16 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16 address_space<3> *>, _Vector<2, _Float16>)", [CustomTypeChecking], "atomic-ds-pk-add-16-insts">;
+def __builtin_amdgcn_flat_atomic_fadd_v2f16 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16 address_space<0> *>, _ExtVector<2, _Float16>)", [CustomTypeChecking], "atomic-flat-pk-add-16-insts">;
+def __builtin_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short address_space<0> *>, _ExtVector<2, short>)", [CustomTypeChecking], "atomic-flat-pk-add-16-insts">;
+def __builtin_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short address_space<1> *>, _ExtVector<2, short>)", [CustomTypeChecking], "atomic-global-pk-add-bf16-inst">;
+def __builtin_amdgcn_ds_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short address_space<3> *>, _ExtVector<2, short>)", [CustomTypeChecking], "atomic-ds-pk-add-16-insts">;
+def __builtin_amdgcn_ds_atomic_fadd_v2f16 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16 address_space<3> *>, _ExtVector<2, _Float16>)", [CustomTypeChecking], "atomic-ds-pk-add-16-insts">;
 def __builtin_amdgcn_load_to_lds : AMDGPUBuiltin<"void(void *, void address_space<3> *, _Constant unsigned int, _Constant int, _Constant unsigned int)", [], "vmem-to-lds-load-insts">;
 def __builtin_amdgcn_global_load_lds : AMDGPUBuiltin<"void(void address_space<1> *, void address_space<3> *, _Constant unsigned int, _Constant int, _Constant unsigned int)", [], "vmem-to-lds-load-insts">;
 
@@ -294,12 +294,12 @@ def __builtin_amdgcn_global_load_lds : AMDGPUBuiltin<"void(void address_space<1>
 // Deep learning builtins.
 //===----------------------------------------------------------------------===//
 
-def __builtin_amdgcn_fdot2 : AMDGPUBuiltin<"float(_Vector<2, _Float16>, _Vector<2, _Float16>, float, _Constant bool)", [Const], "dot10-insts">;
-def __builtin_amdgcn_fdot2_f16_f16 : AMDGPUBuiltin<"_Float16(_Vector<2, _Float16>, _Vector<2, _Float16>, _Float16)", [Const], "dot9-insts">;
-def __builtin_amdgcn_fdot2_bf16_bf16 : AMDGPUBuiltin<"short(_Vector<2, short>, _Vector<2, short>, short)", [Const], "dot9-insts">;
-def __builtin_amdgcn_fdot2_f32_bf16 : AMDGPUBuiltin<"float(_Vector<2, short>, _Vector<2, short>, float, _Constant bool)", [Const], "dot12-insts">;
-def __builtin_amdgcn_sdot2 : AMDGPUBuiltin<"int(_Vector<2, short>, _Vector<2, short>, int, _Constant bool)", [Const], "dot2-insts">;
-def __builtin_amdgcn_udot2 : AMDGPUBuiltin<"unsigned int(_Vector<2, unsigned short>, _Vector<2, unsigned short>, unsigned int, _Constant bool)", [Const], "dot2-insts">;
+def __builtin_amdgcn_fdot2 : AMDGPUBuiltin<"float(_ExtVector<2, _Float16>, _ExtVector<2, _Float16>, float, _Constant bool)", [Const], "dot10-insts">;
+def __builtin_amdgcn_fdot2_f16_f16 : AMDGPUBuiltin<"_Float16(_ExtVector<2, _Float16>, _ExtVector<2, _Float16>, _Float16)", [Const], "dot9-insts">;
+def __builtin_amdgcn_fdot2_bf16_bf16 : AMDGPUBuiltin<"short(_ExtVector<2, short>, _ExtVector<2, short>, short)", [Const], "dot9-insts">;
+def __builtin_amdgcn_fdot2_f32_bf16 : AMDGPUBuiltin<"float(_ExtVector<2, short>, _ExtVector<2, short>, float, _Constant bool)", [Const], "dot12-insts">;
+def __builtin_amdgcn_sdot2 : AMDGPUBuiltin<"int(_ExtVector<2, short>, _ExtVector<2, short>, int, _Constant bool)", [Const], "dot2-insts">;
+def __builtin_amdgcn_udot2 : AMDGPUBuiltin<"unsigned int(_ExtVector<2, unsigned short>, _ExtVector<2, unsigned short>, unsigned int, _Constant bool)", [Const], "dot2-insts">;
 def __builtin_amdgcn_sdot4 : AMDGPUBuiltin<"int(int, int, int, _Constant bool)", [Const], "dot1-insts">;
 def __builtin_amdgcn_udot4 : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int, _Constant bool)", [Const], "dot7-insts">;
 def __builtin_amdgcn_sudot4 : AMDGPUBuiltin<"int(_Constant bool, int, _Constant bool, int, int, _Constant bool)", [Const], "dot8-insts">;
@@ -310,7 +310,7 @@ def __builtin_amdgcn_dot4_f32_fp8_bf8 : AMDGPUBuiltin<"float(unsigned int, unsig
 def __builtin_amdgcn_dot4_f32_bf8_fp8 : AMDGPUBuiltin<"float(unsigned int, unsigned int, float)", [Const], "dot11-insts">;
 def __builtin_amdgcn_dot4_f32_fp8_fp8 : AMDGPUBuiltin<"float(unsigned int, unsigned int, float)", [Const], "dot11-insts">;
 def __builtin_amdgcn_dot4_f32_bf8_bf8 : AMDGPUBuiltin<"float(unsigned int, unsigned int, float)", [Const], "dot11-insts">;
-def __builtin_amdgcn_fdot2c_f32_bf16 : AMDGPUBuiltin<"float(_Vector<2, __bf16>, _Vector<2, __bf16>, float, _Constant bool)", [Const], "dot13-insts">;
+def __builtin_amdgcn_fdot2c_f32_bf16 : AMDGPUBuiltin<"float(_ExtVector<2, __bf16>, _ExtVector<2, __bf16>, float, _Constant bool)", [Const], "dot13-insts">;
 
 //===----------------------------------------------------------------------===//
 // GFX10+ only builtins.
@@ -326,10 +326,10 @@ def __builtin_amdgcn_s_ttracedata_imm : AMDGPUBuiltin<"void(_Constant short)", [
 // Postfix l indicates the 1st argument is i64.
 // Postfix h indicates the 4/5-th arguments are half4.
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_image_bvh_intersect_ray : AMDGPUBuiltin<"_Vector<4, unsigned int>(unsigned int, float, _Vector<4, float>, _Vector<4, float>, _Vector<4, float>, _Vector<4, unsigned int>)", [Const], "gfx10-insts">;
-def __builtin_amdgcn_image_bvh_intersect_ray_h : AMDGPUBuiltin<"_Vector<4, unsigned int>(unsigned int, float, _Vector<4, float>, _Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<4, unsigned int>)", [Const], "gfx10-insts">;
-def __builtin_amdgcn_image_bvh_intersect_ray_l : AMDGPUBuiltin<"_Vector<4, unsigned int>(uint64_t, float, _Vector<4, float>, _Vector<4, float>, _Vector<4, float>, _Vector<4, unsigned int>)", [Const], "gfx10-insts">;
-def __builtin_amdgcn_image_bvh_intersect_ray_lh : AMDGPUBuiltin<"_Vector<4, unsigned int>(uint64_t, float, _Vector<4, float>, _Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<4, unsigned int>)", [Const], "gfx10-insts">;
+def __builtin_amdgcn_image_bvh_intersect_ray : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(unsigned int, float, _ExtVector<4, float>, _ExtVector<4, float>, _ExtVector<4, float>, _ExtVector<4, unsigned int>)", [Const], "gfx10-insts">;
+def __builtin_amdgcn_image_bvh_intersect_ray_h : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(unsigned int, float, _ExtVector<4, float>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, unsigned int>)", [Const], "gfx10-insts">;
+def __builtin_amdgcn_image_bvh_intersect_ray_l : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(uint64_t, float, _ExtVector<4, float>, _ExtVector<4, float>, _ExtVector<4, float>, _ExtVector<4, unsigned int>)", [Const], "gfx10-insts">;
+def __builtin_amdgcn_image_bvh_intersect_ray_lh : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(uint64_t, float, _ExtVector<4, float>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, unsigned int>)", [Const], "gfx10-insts">;
 
 
 //===----------------------------------------------------------------------===//
@@ -345,28 +345,28 @@ def __builtin_amdgcn_s_wait_event_export_ready : AMDGPUBuiltin<"void()", [], "gf
 // Postfix w32 indicates the builtin requires wavefront size of 32.
 // Postfix w64 indicates the builtin requires wavefront size of 64.
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<16, short>, _Vector<16, short>, _Vector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<4, int>, _Constant bool, _Vector<4, int>, _Vector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<2, int>, _Constant bool, _Vector<2, int>, _Vector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<16, short>, _Vector<16, short>, _Vector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_Vector<8, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<16, short>, _Vector<16, short>, _Vector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_Vector<8, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<16, short>, _Vector<16, short>, _Vector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, _Vector<4, int>, _Constant bool, _Vector<4, int>, _Vector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, _Vector<2, int>, _Constant bool, _Vector<2, int>, _Vector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
 
 def __builtin_amdgcn_s_sendmsg_rtn : AMDGPUBuiltin<"unsigned int(_Constant unsigned int)", [], "gfx11-insts">;
 def __builtin_amdgcn_s_sendmsg_rtnl : AMDGPUBuiltin<"uint64_t(_Constant unsigned int)", [], "gfx11-insts">;
 
-def __builtin_amdgcn_ds_bvh_stack_rtn : AMDGPUBuiltin<"_Vector<2, unsigned int>(unsigned int, unsigned int, _Vector<4, unsigned int>, _Constant int)", [], "gfx11-insts">;
+def __builtin_amdgcn_ds_bvh_stack_rtn : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(unsigned int, unsigned int, _ExtVector<4, unsigned int>, _Constant int)", [], "gfx11-insts">;
 
 //===----------------------------------------------------------------------===//
 // Special builtins.
@@ -430,67 +430,67 @@ def __builtin_r600_recipsqrt_ieeef : AMDGPUBuiltin<"float(float)", [Const]>;
 // MFMA builtins.
 //===----------------------------------------------------------------------===//
 
-def __builtin_amdgcn_mfma_f32_32x32x1f32 : AMDGPUBuiltin<"_Vector<32, float>(float, float, _Vector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x1f32 : AMDGPUBuiltin<"_Vector<16, float>(float, float, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_4x4x1f32 : AMDGPUBuiltin<"_Vector<4, float>(float, float, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x2f32 : AMDGPUBuiltin<"_Vector<16, float>(float, float, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x4f32 : AMDGPUBuiltin<"_Vector<4, float>(float, float, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x4f16 : AMDGPUBuiltin<"_Vector<32, float>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x4f16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_4x4x4f16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x8f16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x16f16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_i32_32x32x4i8 : AMDGPUBuiltin<"_Vector<32, int>(int, int, _Vector<32, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_i32_16x16x4i8 : AMDGPUBuiltin<"_Vector<16, int>(int, int, _Vector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_i32_4x4x4i8 : AMDGPUBuiltin<"_Vector<4, int>(int, int, _Vector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_i32_32x32x8i8 : AMDGPUBuiltin<"_Vector<16, int>(int, int, _Vector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_i32_16x16x16i8 : AMDGPUBuiltin<"_Vector<4, int>(int, int, _Vector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUBuiltin<"_Vector<32, float>(_Vector<2, short>, _Vector<2, short>, _Vector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, short>, _Vector<2, short>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, short>, _Vector<2, short>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, short>, _Vector<2, short>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, short>, _Vector<2, short>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-
-def __builtin_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUBuiltin<"_Vector<32, float>(_Vector<4, short>, _Vector<4, short>, _Vector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, short>, _Vector<4, short>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, short>, _Vector<4, short>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, short>, _Vector<4, short>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, short>, _Vector<4, short>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f64_16x16x4f64 : AMDGPUBuiltin<"_Vector<4, double>(double, double, _Vector<4, double>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x1f32 : AMDGPUBuiltin<"_ExtVector<32, float>(float, float, _ExtVector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x1f32 : AMDGPUBuiltin<"_ExtVector<16, float>(float, float, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_4x4x1f32 : AMDGPUBuiltin<"_ExtVector<4, float>(float, float, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x2f32 : AMDGPUBuiltin<"_ExtVector<16, float>(float, float, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x4f32 : AMDGPUBuiltin<"_ExtVector<4, float>(float, float, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x4f16 : AMDGPUBuiltin<"_ExtVector<32, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x4f16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_4x4x4f16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x8f16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x16f16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_i32_32x32x4i8 : AMDGPUBuiltin<"_ExtVector<32, int>(int, int, _ExtVector<32, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_i32_16x16x4i8 : AMDGPUBuiltin<"_ExtVector<16, int>(int, int, _ExtVector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_i32_4x4x4i8 : AMDGPUBuiltin<"_ExtVector<4, int>(int, int, _ExtVector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_i32_32x32x8i8 : AMDGPUBuiltin<"_ExtVector<16, int>(int, int, _ExtVector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_i32_16x16x16i8 : AMDGPUBuiltin<"_ExtVector<4, int>(int, int, _ExtVector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUBuiltin<"_ExtVector<32, float>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+
+def __builtin_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUBuiltin<"_ExtVector<32, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<32, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f64_16x16x4f64 : AMDGPUBuiltin<"_ExtVector<4, double>(double, double, _ExtVector<4, double>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
 def __builtin_amdgcn_mfma_f64_4x4x4f64 : AMDGPUBuiltin<"double(double, double, double, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
 
-def __builtin_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUBuiltin<"_Vector<4, int>(int64_t, int64_t, _Vector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUBuiltin<"_Vector<16, int>(int64_t, int64_t, _Vector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, float>, _Vector<2, float>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, float>, _Vector<2, float>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8 : AMDGPUBuiltin<"_Vector<4, float>(int64_t, int64_t, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8 : AMDGPUBuiltin<"_Vector<4, float>(int64_t, int64_t, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8 : AMDGPUBuiltin<"_Vector<4, float>(int64_t, int64_t, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8 : AMDGPUBuiltin<"_Vector<4, float>(int64_t, int64_t, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8 : AMDGPUBuiltin<"_Vector<16, float>(int64_t, int64_t, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8 : AMDGPUBuiltin<"_Vector<16, float>(int64_t, int64_t, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8 : AMDGPUBuiltin<"_Vector<16, float>(int64_t, int64_t, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8 : AMDGPUBuiltin<"_Vector<16, float>(int64_t, int64_t, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, _Float16>, _Vector<8, _Float16>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, _Float16>, _Vector<8, _Float16>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, short>, _Vector<8, short>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, short>, _Vector<8, short>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUBuiltin<"_Vector<4, int>(_Vector<2, int>, _Vector<4, int>, _Vector<4, int>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUBuiltin<"_Vector<16, int>(_Vector<2, int>, _Vector<4, int>, _Vector<16, int>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, int>, _Vector<4, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, int>, _Vector<4, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, int>, _Vector<4, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<2, int>, _Vector<4, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, int>, _Vector<4, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, int>, _Vector<4, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, int>, _Vector<4, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<2, int>, _Vector<4, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUBuiltin<"_ExtVector<4, int>(int64_t, int64_t, _ExtVector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUBuiltin<"_ExtVector<16, int>(int64_t, int64_t, _ExtVector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, float>, _ExtVector<2, float>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, float>, _ExtVector<2, float>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<4, float>(int64_t, int64_t, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<4, float>(int64_t, int64_t, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<4, float>(int64_t, int64_t, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<4, float>(int64_t, int64_t, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<16, float>(int64_t, int64_t, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<16, float>(int64_t, int64_t, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<16, float>(int64_t, int64_t, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<16, float>(int64_t, int64_t, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<4, int>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUBuiltin<"_ExtVector<16, int>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<16, int>, int, _Constant int, _Constant int)", [Const], "mai-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "fp8-insts">;
 
 def __builtin_amdgcn_cvt_f32_bf8 : AMDGPUBuiltin<"float(int, _Constant int)", [Const], "fp8-conversion-insts">;
 def __builtin_amdgcn_cvt_f32_fp8 : AMDGPUBuiltin<"float(int, _Constant int)", [Const], "fp8-conversion-insts">;
 def __builtin_amdgcn_cvt_f32_fp8_e5m3 : AMDGPUBuiltin<"float(int, _Constant int)", [Const], "fp8e5m3-insts">;
-def __builtin_amdgcn_cvt_pk_f32_bf8 : AMDGPUBuiltin<"_Vector<2, float>(int, _Constant bool)", [Const], "fp8-conversion-insts">;
-def __builtin_amdgcn_cvt_pk_f32_fp8 : AMDGPUBuiltin<"_Vector<2, float>(int, _Constant bool)", [Const], "fp8-conversion-insts">;
+def __builtin_amdgcn_cvt_pk_f32_bf8 : AMDGPUBuiltin<"_ExtVector<2, float>(int, _Constant bool)", [Const], "fp8-conversion-insts">;
+def __builtin_amdgcn_cvt_pk_f32_fp8 : AMDGPUBuiltin<"_ExtVector<2, float>(int, _Constant bool)", [Const], "fp8-conversion-insts">;
 def __builtin_amdgcn_cvt_pk_bf8_f32 : AMDGPUBuiltin<"int(float, float, int, _Constant bool)", [Const], "fp8-conversion-insts">;
 def __builtin_amdgcn_cvt_pk_fp8_f32 : AMDGPUBuiltin<"int(float, float, int, _Constant bool)", [Const], "fp8-conversion-insts">;
 def __builtin_amdgcn_cvt_sr_bf8_f32 : AMDGPUBuiltin<"int(float, int, int, _Constant int)", [Const], "fp8-conversion-insts">;
@@ -499,46 +499,46 @@ def __builtin_amdgcn_cvt_sr_fp8_f32 : AMDGPUBuiltin<"int(float, int, int, _Const
 //===----------------------------------------------------------------------===//
 // GFX950 only builtins.
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<8, int32_t>, _Vector<8, int32_t>, _Vector<4, float>, _Constant int, _Constant int, _Constant int, int, _Constant int, int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<8, int32_t>, _Vector<8, int32_t>, _Vector<16, float>, _Constant int, _Constant int, _Constant int, int, _Constant int, int)", [Const], "gfx950-insts">;
-
-def __builtin_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUBuiltin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUBuiltin<"_Vector<16, int>(_Vector<4, int>, _Vector<4, int>, _Vector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-
-def __builtin_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<8, _Float16>, _Vector<16, _Float16>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x32_f16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<8, _Float16>, _Vector<16, _Float16>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x64_bf16 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<8, __bf16>, _Vector<16, __bf16>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<8, __bf16>, _Vector<16, __bf16>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_i32_16x16x128_i8 : AMDGPUBuiltin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, int>, _Vector<4, int>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUBuiltin<"_Vector<16, int>(_Vector<4, int>, _Vector<8, int>, _Vector<16, int>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, int>, _Vector<8, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, int>, _Vector<8, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, int>, _Vector<8, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, int>, _Vector<8, int>, _Vector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x64_bf8_bf8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, int>, _Vector<8, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, int>, _Vector<8, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, int>, _Vector<8, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<4, int>, _Vector<8, int>, _Vector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
-
-def __builtin_amdgcn_permlane16_swap : AMDGPUBuiltin<"_Vector<2, unsigned int>(unsigned int, unsigned int, _Constant bool, _Constant bool)", [Const], "permlane16-swap">;
-def __builtin_amdgcn_permlane32_swap : AMDGPUBuiltin<"_Vector<2, unsigned int>(unsigned int, unsigned int, _Constant bool, _Constant bool)", [Const], "permlane32-swap">;
-
-def __builtin_amdgcn_ds_read_tr4_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<3> *>)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_ds_read_tr6_b96_v3i32 : AMDGPUBuiltin<"_Vector<3, int>(_Vector<3, int address_space<3> *>)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_ds_read_tr8_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<3> *>)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_ds_read_tr16_b64_v4i16 : AMDGPUBuiltin<"_Vector<4, short>(_Vector<4, short address_space<3> *>)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_Vector<4, __fp16>(_Vector<4, __fp16 address_space<3> *>)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_ds_read_tr16_b64_v4bf16 : AMDGPUBuiltin<"_Vector<4, __bf16>(_Vector<4, __bf16 address_space<3> *>)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<8, int32_t>, _ExtVector<8, int32_t>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int, int, _Constant int, int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<8, int32_t>, _ExtVector<8, int32_t>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int, int, _Constant int, int)", [Const], "gfx950-insts">;
+
+def __builtin_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<8, __bf16>, _ExtVector<8, __bf16>, _ExtVector<4, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<8, __bf16>, _ExtVector<8, __bf16>, _ExtVector<16, float>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUBuiltin<"_ExtVector<16, int>(_ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<16, int>, _Constant int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+
+def __builtin_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x32_f16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x64_bf16 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<8, __bf16>, _ExtVector<16, __bf16>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<8, __bf16>, _ExtVector<16, __bf16>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_i32_16x16x128_i8 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUBuiltin<"_ExtVector<16, int>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<16, int>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<16, float>, int, _Constant int, _Constant int)", [Const], "gfx950-insts">;
+
+def __builtin_amdgcn_permlane16_swap : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(unsigned int, unsigned int, _Constant bool, _Constant bool)", [Const], "permlane16-swap">;
+def __builtin_amdgcn_permlane32_swap : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(unsigned int, unsigned int, _Constant bool, _Constant bool)", [Const], "permlane32-swap">;
+
+def __builtin_amdgcn_ds_read_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<3> *>)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_ds_read_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int address_space<3> *>)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_ds_read_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<3> *>)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_ds_read_tr16_b64_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short address_space<3> *>)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16 address_space<3> *>)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_ds_read_tr16_b64_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16 address_space<3> *>)", [Const], "gfx950-insts">;
 
 def __builtin_amdgcn_ashr_pk_i8_i32 : AMDGPUBuiltin<"unsigned short(unsigned int, unsigned int, unsigned int)", [Const], "ashr-pk-insts">;
 def __builtin_amdgcn_ashr_pk_u8_i32 : AMDGPUBuiltin<"unsigned short(unsigned int, unsigned int, unsigned int)", [Const], "ashr-pk-insts">;
 
-def __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<16, float>, _Vector<16, float>, float)", [Const], "gfx950-insts">;
-def __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<16, float>, _Vector<16, float>, float)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<16, float>, _ExtVector<16, float>, float)", [Const], "gfx950-insts">;
+def __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<16, float>, _ExtVector<16, float>, float)", [Const], "gfx950-insts">;
 
 //===----------------------------------------------------------------------===//
 // GFX12+ only builtins.
@@ -559,28 +559,28 @@ def __builtin_amdgcn_s_get_named_barrier_state : AMDGPUBuiltin<"unsigned int(voi
 def __builtin_amdgcn_s_prefetch_data : AMDGPUBuiltin<"void(void const *, unsigned int)", [Const], "gfx12-insts">;
 def __builtin_amdgcn_s_buffer_prefetch_data : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, _Constant int, unsigned int)", [Const], "gfx12-insts">;
 
-def __builtin_amdgcn_global_load_tr_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr_b128_v8i16 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<8, short address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, __fp16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr_b128_v8bf16 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Vector<8, __bf16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize32">;
 def __builtin_amdgcn_global_load_tr_b64_i32 : AMDGPUBuiltin<"int(int address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_global_load_tr_b128_v4i16 : AMDGPUBuiltin<"_Vector<4, short>(_Vector<4, short address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_Vector<4, __fp16>(_Vector<4, __fp16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_global_load_tr_b128_v4bf16 : AMDGPUBuiltin<"_Vector<4, __bf16>(_Vector<4, __bf16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_global_load_tr_b128_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_global_load_tr_b128_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16 address_space<1> *>)", [Const], "gfx12-insts,wavefrontsize64">;
 
 def __builtin_amdgcn_ds_bpermute_fi_b32 : AMDGPUBuiltin<"int(int, int)", [Const], "gfx12-insts">;
 
 // For the following two builtins, the second and third return values of the
 // intrinsics are returned through the last two pointer-type function arguments.
-def __builtin_amdgcn_image_bvh8_intersect_ray : AMDGPUBuiltin<"_Vector<10, unsigned int>(uint64_t, float, unsigned char, _Vector<3, float>, _Vector<3, float>, unsigned int, _Vector<4, unsigned int>, _Vector<3, float *>, _Vector<3, float *>)", [Const], "gfx12-insts">;
-def __builtin_amdgcn_image_bvh_dual_intersect_ray : AMDGPUBuiltin<"_Vector<10, unsigned int>(uint64_t, float, unsigned char, _Vector<3, float>, _Vector<3, float>, _Vector<2, unsigned int>, _Vector<4, unsigned int>, _Vector<3, float *>, _Vector<3, float *>)", [Const], "gfx12-insts">;
+def __builtin_amdgcn_image_bvh8_intersect_ray : AMDGPUBuiltin<"_ExtVector<10, unsigned int>(uint64_t, float, unsigned char, _ExtVector<3, float>, _ExtVector<3, float>, unsigned int, _ExtVector<4, unsigned int>, _ExtVector<3, float *>, _ExtVector<3, float *>)", [Const], "gfx12-insts">;
+def __builtin_amdgcn_image_bvh_dual_intersect_ray : AMDGPUBuiltin<"_ExtVector<10, unsigned int>(uint64_t, float, unsigned char, _ExtVector<3, float>, _ExtVector<3, float>, _ExtVector<2, unsigned int>, _ExtVector<4, unsigned int>, _ExtVector<3, float *>, _ExtVector<3, float *>)", [Const], "gfx12-insts">;
 
-def __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn : AMDGPUBuiltin<"_Vector<2, unsigned int>(unsigned int, unsigned int, _Vector<4, unsigned int>, _Constant int)", [], "gfx11-insts">;
-def __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn : AMDGPUBuiltin<"_Vector<2, unsigned int>(unsigned int, unsigned int, _Vector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
+def __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(unsigned int, unsigned int, _ExtVector<4, unsigned int>, _Constant int)", [], "gfx11-insts">;
+def __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(unsigned int, unsigned int, _ExtVector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
 
 // The intrinsic returns {i64, i32}, the builtin returns <2 x i64>.
 // The second return value of the intrinsic is zext'ed.
-def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_Vector<2, uint64_t>(unsigned int, unsigned int, _Vector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
+def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_ExtVector<2, uint64_t>(unsigned int, unsigned int, _ExtVector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
 
 //===----------------------------------------------------------------------===//
 // WMMA builtins.
@@ -592,94 +592,94 @@ def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_Vector<2, uin
 // Therefore, we add an "_gfx12" suffix to distinguish them from the existing
 // builtins.
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, short>, _Vector<8, short>, _Vector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<2, int>, _Constant bool, _Vector<2, int>, _Vector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, int, _Constant bool, int, _Vector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
 // These are gfx12-only, but for consistency with the other WMMA variants we're
 // keeping the "_gfx12" suffix.
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<2, int>, _Vector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<2, int>, _Vector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<2, int>, _Vector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<2, int>, _Vector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<2, int>, _Constant bool, _Vector<2, int>, _Vector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, short>, _Vector<4, short>, _Vector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, _Float16>(_Vector<4, _Float16>, _Vector<4, _Float16>, _Vector<4, _Float16>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<4, short>, _Vector<4, short>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, int, _Constant bool, int, _Vector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, int, _Constant bool, int, _Vector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
 // These are gfx12-only, but for consistency with the other WMMA variants we're
 // keeping the "_gfx12" suffix.
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, float>(int, int, _Vector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, float>(int, int, _Vector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, float>(int, int, _Vector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, float>(int, int, _Vector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, int, _Constant bool, int, _Vector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-
-def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, __fp16>, _Vector<16, __fp16>, _Vector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, short>, _Vector<16, short>, _Vector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, __fp16>, _Vector<16, __fp16>, _Vector<8, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<8, short>, _Vector<16, short>, _Vector<8, short>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<2, int>, _Constant bool, _Vector<4, int>, _Vector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, int, _Constant bool, _Vector<2, int>, _Vector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<2, int>, _Constant bool, _Vector<4, int>, _Vector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<4, int>, _Vector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<4, int>, _Vector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<4, int>, _Vector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, int>, _Vector<4, int>, _Vector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, __fp16>, _Vector<8, __fp16>, _Vector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_Vector<4, float>(_Vector<4, short>, _Vector<8, short>, _Vector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_Vector<4, __fp16>(_Vector<4, __fp16>, _Vector<8, __fp16>, _Vector<4, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<8, short>, _Vector<4, short>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, int, _Constant bool, _Vector<2, int>, _Vector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, int, _Constant bool, int, _Vector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_Vector<4, int>(_Constant bool, int, _Constant bool, _Vector<2, int>, _Vector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_Vector<4, float>(int, _Vector<2, int>, _Vector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_Vector<4, float>(int, _Vector<2, int>, _Vector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_Vector<4, float>(int, _Vector<2, int>, _Vector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_Vector<4, float>(int, _Vector<2, int>, _Vector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
+
+def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
 
 def __builtin_amdgcn_prng_b32 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "prng-inst">;
-def __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, __bf16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, __bf16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_f16_fp8 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16>, int, float, _Constant int, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_f16_bf8 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16>, int, float, _Constant int, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, __bf16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, __bf16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_f16_fp8 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16>, int, float, _Constant int, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_f16_bf8 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16>, int, float, _Constant int, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_f32_fp8 : AMDGPUBuiltin<"float(int, float, _Constant int)", [Const], "fp8-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_f32_bf8 : AMDGPUBuiltin<"float(int, float, _Constant int)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_fp8_f32 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, float, float, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_bf8_f32 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, float, float, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_f32_fp8 : AMDGPUBuiltin<"_Vector<2, float>(unsigned int, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_f32_bf8 : AMDGPUBuiltin<"_Vector<2, float>(unsigned int, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_fp8_f16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, _Float16>, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, __bf16>, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_bf8_f16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, _Float16>, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, __bf16>, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_f32_fp4 : AMDGPUBuiltin<"_Vector<2, float>(unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_fp8_f32 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, float, float, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_bf8_f32 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, float, float, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_f32_fp8 : AMDGPUBuiltin<"_ExtVector<2, float>(unsigned int, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_f32_bf8 : AMDGPUBuiltin<"_ExtVector<2, float>(unsigned int, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_fp8_f16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, _ExtVector<2, _Float16>, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, _ExtVector<2, __bf16>, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_bf8_f16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, _ExtVector<2, _Float16>, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, _ExtVector<2, __bf16>, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_f32_fp4 : AMDGPUBuiltin<"_ExtVector<2, float>(unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_pk_fp4_f32 : AMDGPUBuiltin<"unsigned int(unsigned int, float, float, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_f16_fp4 : AMDGPUBuiltin<"_Vector<2, _Float16>(unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4 : AMDGPUBuiltin<"_Vector<2, __bf16>(unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6 : AMDGPUBuiltin<"_Vector<32, float>(_Vector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6 : AMDGPUBuiltin<"_Vector<32, float>(_Vector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6 : AMDGPUBuiltin<"_Vector<32, _Float16>(_Vector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6 : AMDGPUBuiltin<"_Vector<32, __bf16>(_Vector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6 : AMDGPUBuiltin<"_Vector<32, _Float16>(_Vector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6 : AMDGPUBuiltin<"_Vector<32, __bf16>(_Vector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 : AMDGPUBuiltin<"_Vector<2, _Float16>(unsigned int, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 : AMDGPUBuiltin<"_Vector<2, __bf16>(unsigned int, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 : AMDGPUBuiltin<"_Vector<2, _Float16>(unsigned int, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 : AMDGPUBuiltin<"_Vector<2, __bf16>(unsigned int, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_fp4_f16 : AMDGPUBuiltin<"unsigned int(unsigned int, _Vector<2, _Float16>, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16 : AMDGPUBuiltin<"unsigned int(unsigned int, _Vector<2, __bf16>, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16 : AMDGPUBuiltin<"unsigned int(unsigned int, _Vector<2, _Float16>, unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16 : AMDGPUBuiltin<"unsigned int(unsigned int, _Vector<2, __bf16>, unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32 : AMDGPUBuiltin<"unsigned int(unsigned int, _Vector<2, float>, unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_f16_fp4 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4 : AMDGPUBuiltin<"_ExtVector<2, __bf16>(unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6 : AMDGPUBuiltin<"_ExtVector<32, float>(_ExtVector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6 : AMDGPUBuiltin<"_ExtVector<32, float>(_ExtVector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6 : AMDGPUBuiltin<"_ExtVector<32, _Float16>(_ExtVector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6 : AMDGPUBuiltin<"_ExtVector<32, __bf16>(_ExtVector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6 : AMDGPUBuiltin<"_ExtVector<32, _Float16>(_ExtVector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6 : AMDGPUBuiltin<"_ExtVector<32, __bf16>(_ExtVector<6, unsigned int>, float)", [Const], "fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(unsigned int, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 : AMDGPUBuiltin<"_ExtVector<2, __bf16>(unsigned int, float, _Constant bool)", [Const], "fp8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(unsigned int, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 : AMDGPUBuiltin<"_ExtVector<2, __bf16>(unsigned int, float, _Constant bool)", [Const], "bf8-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_fp4_f16 : AMDGPUBuiltin<"unsigned int(unsigned int, _ExtVector<2, _Float16>, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16 : AMDGPUBuiltin<"unsigned int(unsigned int, _ExtVector<2, __bf16>, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16 : AMDGPUBuiltin<"unsigned int(unsigned int, _ExtVector<2, _Float16>, unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16 : AMDGPUBuiltin<"unsigned int(unsigned int, _ExtVector<2, __bf16>, unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32 : AMDGPUBuiltin<"unsigned int(unsigned int, _ExtVector<2, float>, unsigned int, float, _Constant int)", [Const], "fp4-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16 : AMDGPUBuiltin<"int(int, __bf16, unsigned int, float, _Constant int)", [Const], "bf8-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_sr_bf8_f16 : AMDGPUBuiltin<"int(int, _Float16, unsigned int, float, _Constant int)", [Const], "bf8-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_sr_bf8_f32 : AMDGPUBuiltin<"int(int, float, unsigned int, float, _Constant int)", [Const], "bf8-cvt-scale-insts">;
@@ -687,17 +687,17 @@ def __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16 : AMDGPUBuiltin<"int(int, __bf16,
 def __builtin_amdgcn_cvt_scalef32_sr_fp8_f16 : AMDGPUBuiltin<"int(int, _Float16, unsigned int, float, _Constant int)", [Const], "fp8-cvt-scale-insts">;
 def __builtin_amdgcn_cvt_scalef32_sr_fp8_f32 : AMDGPUBuiltin<"int(int, float, unsigned int, float, _Constant int)", [Const], "fp8-cvt-scale-insts">;
 
-def __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, __bf16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, _Float16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, float>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, __bf16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, _Float16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUBuiltin<"_Vector<6, unsigned int>(_Vector<32, float>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, __bf16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, float>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, __bf16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, float>, unsigned int, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
 def __builtin_amdgcn_bitop3_b32 : AMDGPUBuiltin<"int(int, int, int, _Constant unsigned int)", [Const], "bitop3-insts">;
 def __builtin_amdgcn_bitop3_b16 : AMDGPUBuiltin<"short(short, short, short, _Constant unsigned int)", [Const], "bitop3-insts">;
 
-def __builtin_amdgcn_cvt_sr_bf16_f32 : AMDGPUBuiltin<"_Vector<2, __bf16>(_Vector<2, __bf16>, float, unsigned int, _Constant bool)", [Const], "f32-to-f16bf16-cvt-sr-insts">;
-def __builtin_amdgcn_cvt_sr_f16_f32 : AMDGPUBuiltin<"_Vector<2, _Float16>(_Vector<2, _Float16>, float, unsigned int, _Constant bool)", [Const], "f32-to-f16bf16-cvt-sr-insts">;
+def __builtin_amdgcn_cvt_sr_bf16_f32 : AMDGPUBuiltin<"_ExtVector<2, __bf16>(_ExtVector<2, __bf16>, float, unsigned int, _Constant bool)", [Const], "f32-to-f16bf16-cvt-sr-insts">;
+def __builtin_amdgcn_cvt_sr_f16_f32 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16>, float, unsigned int, _Constant bool)", [Const], "f32-to-f16bf16-cvt-sr-insts">;
 
 //===----------------------------------------------------------------------===//
 // GFX1250+ only builtins.
@@ -708,46 +708,46 @@ def __builtin_amdgcn_flat_prefetch : AMDGPUBuiltin<"void(void const address_spac
 def __builtin_amdgcn_global_prefetch : AMDGPUBuiltin<"void(void const address_space<1> *, _Constant int)", [Const], "vmem-pref-insts">;
 
 def __builtin_amdgcn_global_load_monitor_b32 : AMDGPUBuiltin<"int(int address_space<1> *, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_global_load_monitor_b64 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<1> *>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_global_load_monitor_b128 : AMDGPUBuiltin<"_Vector<4, int>(_Vector<4, int address_space<1> *>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_global_load_monitor_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<1> *>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_global_load_monitor_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int address_space<1> *>, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_flat_load_monitor_b32 : AMDGPUBuiltin<"int(int address_space<0> *, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_flat_load_monitor_b64 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<0> *>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_flat_load_monitor_b128 : AMDGPUBuiltin<"_Vector<4, int>(_Vector<4, int address_space<0> *>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_flat_load_monitor_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<0> *>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_flat_load_monitor_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int address_space<0> *>, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_cluster_load_b32 : AMDGPUBuiltin<"int(int address_space<1> *, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
-def __builtin_amdgcn_cluster_load_b64 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<1> *>, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
-def __builtin_amdgcn_cluster_load_b128 : AMDGPUBuiltin<"_Vector<4, int>(_Vector<4, int address_space<1> *>, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
+def __builtin_amdgcn_cluster_load_b64 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<1> *>, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
+def __builtin_amdgcn_cluster_load_b128 : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int address_space<1> *>, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
 def __builtin_amdgcn_cluster_load_async_to_lds_b8 : AMDGPUBuiltin<"void(char address_space<1> *, char address_space<3> *, _Constant int, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
 def __builtin_amdgcn_cluster_load_async_to_lds_b32 : AMDGPUBuiltin<"void(int address_space<1> *, int address_space<3> *, _Constant int, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
-def __builtin_amdgcn_cluster_load_async_to_lds_b64 : AMDGPUBuiltin<"void(_Vector<2, int address_space<1> *>, _Vector<2, int address_space<3> *>, _Constant int, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
-def __builtin_amdgcn_cluster_load_async_to_lds_b128 : AMDGPUBuiltin<"void(_Vector<4, int address_space<1> *>, _Vector<4, int address_space<3> *>, _Constant int, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
+def __builtin_amdgcn_cluster_load_async_to_lds_b64 : AMDGPUBuiltin<"void(_ExtVector<2, int address_space<1> *>, _ExtVector<2, int address_space<3> *>, _Constant int, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
+def __builtin_amdgcn_cluster_load_async_to_lds_b128 : AMDGPUBuiltin<"void(_ExtVector<4, int address_space<1> *>, _ExtVector<4, int address_space<3> *>, _Constant int, _Constant int, int)", [Const], "mcast-load-insts,wavefrontsize32">;
 def __builtin_amdgcn_global_load_async_to_lds_b8 : AMDGPUBuiltin<"void(char address_space<1> *, char address_space<3> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_global_load_async_to_lds_b32 : AMDGPUBuiltin<"void(int address_space<1> *, int address_space<3> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_global_load_async_to_lds_b64 : AMDGPUBuiltin<"void(_Vector<2, int address_space<1> *>, _Vector<2, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_global_load_async_to_lds_b128 : AMDGPUBuiltin<"void(_Vector<4, int address_space<1> *>, _Vector<4, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_global_load_async_to_lds_b64 : AMDGPUBuiltin<"void(_ExtVector<2, int address_space<1> *>, _ExtVector<2, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_global_load_async_to_lds_b128 : AMDGPUBuiltin<"void(_ExtVector<4, int address_space<1> *>, _ExtVector<4, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_global_store_async_from_lds_b8 : AMDGPUBuiltin<"void(char address_space<1> *, char address_space<3> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_global_store_async_from_lds_b32 : AMDGPUBuiltin<"void(int address_space<1> *, int address_space<3> *, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_global_store_async_from_lds_b64 : AMDGPUBuiltin<"void(_Vector<2, int address_space<1> *>, _Vector<2, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Vector<4, int address_space<1> *>, _Vector<4, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_global_store_async_from_lds_b64 : AMDGPUBuiltin<"void(_ExtVector<2, int address_space<1> *>, _ExtVector<2, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_ExtVector<4, int address_space<1> *>, _ExtVector<4, int address_space<3> *>, _Constant int, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">;
 
-def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_Vector<4, int>, _Vector<8, int>, _Vector<4, int>, _Vector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_load_to_lds_d2 : AMDGPUBuiltin<"void(_Vector<4, int>, _Vector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_Vector<4, int>, _Vector<8, int>, _Vector<4, int>, _Vector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds_d2 : AMDGPUBuiltin<"void(_Vector<4, int>, _Vector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
-
-def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<1> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_Vector<3, int>(_Vector<3, int address_space<1> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<8, short address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, __fp16 address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_global_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Vector<8, __bf16 address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_ds_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<3> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
-def __builtin_amdgcn_ds_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_ds_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_Vector<3, int>(_Vector<3, int address_space<3> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
-def __builtin_amdgcn_ds_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_Vector<8, short>(_Vector<8, short address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, __fp16 address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_ds_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Vector<8, __bf16 address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_load_to_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_store_from_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+
+def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<1> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int address_space<1> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16 address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_global_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16 address_space<1> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_ds_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<3> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
+def __builtin_amdgcn_ds_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_ds_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int address_space<3> *>)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
+def __builtin_amdgcn_ds_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16 address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_ds_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16 address_space<3> *>)", [Const], "gfx1250-insts,wavefrontsize32">;
 
 def __builtin_amdgcn_s_setprio_inc_wg : AMDGPUBuiltin<"void(_Constant short)", [], "setprio-inc-wg-inst">;
 def __builtin_amdgcn_s_monitor_sleep : AMDGPUBuiltin<"void(_Constant short)", [], "gfx1250-insts">;
@@ -767,61 +767,61 @@ def __builtin_amdgcn_exp2_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-
 def __builtin_amdgcn_sin_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">;
 def __builtin_amdgcn_cos_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">;
 
-def __builtin_amdgcn_cvt_sr_pk_bf16_f32 : AMDGPUBuiltin<"_Vector<2, __bf16>(float, float, int)", [Const], "bf16-cvt-insts">;
-def __builtin_amdgcn_cvt_sr_pk_f16_f32 : AMDGPUBuiltin<"_Vector<2, __fp16>(float, float, int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_sr_pk_bf16_f32 : AMDGPUBuiltin<"_ExtVector<2, __bf16>(float, float, int)", [Const], "bf16-cvt-insts">;
+def __builtin_amdgcn_cvt_sr_pk_f16_f32 : AMDGPUBuiltin<"_ExtVector<2, __fp16>(float, float, int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_cvt_f16_fp8 : AMDGPUBuiltin<"__fp16(int, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_cvt_f16_bf8 : AMDGPUBuiltin<"__fp16(int, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_pk_f16_fp8 : AMDGPUBuiltin<"_Vector<2, __fp16>(short)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_pk_f16_bf8 : AMDGPUBuiltin<"_Vector<2, __fp16>(short)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_pk_fp8_f16 : AMDGPUBuiltin<"short(_Vector<2, __fp16>)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_pk_bf8_f16 : AMDGPUBuiltin<"short(_Vector<2, __fp16>)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_pk_f16_fp8 : AMDGPUBuiltin<"_ExtVector<2, __fp16>(short)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_pk_f16_bf8 : AMDGPUBuiltin<"_ExtVector<2, __fp16>(short)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_pk_fp8_f16 : AMDGPUBuiltin<"short(_ExtVector<2, __fp16>)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_pk_bf8_f16 : AMDGPUBuiltin<"short(_ExtVector<2, __fp16>)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_cvt_sr_fp8_f16 : AMDGPUBuiltin<"int(__fp16, int, unsigned int, _Constant int)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_cvt_sr_bf8_f16 : AMDGPUBuiltin<"int(__fp16, int, unsigned int, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_f16_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_bf16_fp8 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Vector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_f16_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_bf16_bf8 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Vector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_f16_fp4 : AMDGPUBuiltin<"_Vector<8, __fp16>(unsigned int, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_bf16_fp4 : AMDGPUBuiltin<"_Vector<8, __bf16>(unsigned int, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUBuiltin<"_Vector<8, float>(unsigned int, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk16_f16_fp6 : AMDGPUBuiltin<"_Vector<16, __fp16>(_Vector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk16_bf16_fp6 : AMDGPUBuiltin<"_Vector<16, __bf16>(_Vector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk16_f16_bf6 : AMDGPUBuiltin<"_Vector<16, __fp16>(_Vector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk16_bf16_bf6 : AMDGPUBuiltin<"_Vector<16, __bf16>(_Vector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk16_f32_fp6 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scale_pk16_f32_bf6 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __bf16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __bf16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __fp16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __fp16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, float>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, float>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUBuiltin<"unsigned int(_Vector<8, float>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUBuiltin<"unsigned int(_Vector<8, __fp16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUBuiltin<"unsigned int(_Vector<8, __bf16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk16_fp6_f32 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, float>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk16_bf6_f32 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, float>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk16_fp6_f16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __fp16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk16_bf6_f16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __fp16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __bf16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __bf16>, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, float>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUBuiltin<"_Vector<2, unsigned int>(_Vector<8, float>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUBuiltin<"unsigned int(_Vector<8, float>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUBuiltin<"unsigned int(_Vector<8, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUBuiltin<"unsigned int(_Vector<8, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, float>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUBuiltin<"_Vector<3, unsigned int>(_Vector<16, float>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_f16_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_bf16_fp8 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_f16_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_bf16_bf8 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_f16_fp4 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(unsigned int, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_bf16_fp4 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(unsigned int, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUBuiltin<"_ExtVector<8, float>(unsigned int, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk16_f16_fp6 : AMDGPUBuiltin<"_ExtVector<16, __fp16>(_ExtVector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk16_bf16_fp6 : AMDGPUBuiltin<"_ExtVector<16, __bf16>(_ExtVector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk16_f16_bf6 : AMDGPUBuiltin<"_ExtVector<16, __fp16>(_ExtVector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk16_bf16_bf6 : AMDGPUBuiltin<"_ExtVector<16, __bf16>(_ExtVector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk16_f32_fp6 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scale_pk16_f32_bf6 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<3, unsigned int>, unsigned int, _Constant unsigned int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __bf16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __bf16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __fp16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __fp16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, float>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, float>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUBuiltin<"unsigned int(_ExtVector<8, float>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUBuiltin<"unsigned int(_ExtVector<8, __fp16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUBuiltin<"unsigned int(_ExtVector<8, __bf16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk16_fp6_f32 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, float>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk16_bf6_f32 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, float>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk16_fp6_f16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __fp16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk16_bf6_f16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __fp16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __bf16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __bf16>, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, float>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(_ExtVector<8, float>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUBuiltin<"unsigned int(_ExtVector<8, float>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUBuiltin<"unsigned int(_ExtVector<8, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUBuiltin<"unsigned int(_ExtVector<8, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, float>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __bf16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, __fp16>, unsigned int, float)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(_ExtVector<16, float>, unsigned int, float)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_cvt_pk_fp8_f32_e5m3 : AMDGPUBuiltin<"int(float, float, int, _Constant bool)", [Const], "fp8e5m3-insts">;
 def __builtin_amdgcn_cvt_sr_fp8_f32_e5m3 : AMDGPUBuiltin<"int(float, int, int, _Constant int)", [Const], "fp8e5m3-insts">;
 def __builtin_amdgcn_sat_pk4_i4_i8 : AMDGPUBuiltin<"unsigned short(unsigned int)", [Const], "gfx1250-insts">;
@@ -833,185 +833,185 @@ def __builtin_amdgcn_permlane_down : AMDGPUBuiltin<"int(int, int, int)", [Const]
 def __builtin_amdgcn_permlane_xor : AMDGPUBuiltin<"int(int, int, int)", [Const], "gfx1250-insts,wavefrontsize32">;
 def __builtin_amdgcn_permlane_idx_gen : AMDGPUBuiltin<"int(int, int)", [Const], "gfx1250-insts,wavefrontsize32">;
 
-def __builtin_amdgcn_perm_pk16_b4_u4 : AMDGPUBuiltin<"_Vector<2, unsigned int>(unsigned int, unsigned int, _Vector<2, unsigned int>)", [Const], "tensor-cvt-lut-insts">;
-def __builtin_amdgcn_perm_pk16_b6_u4 : AMDGPUBuiltin<"_Vector<3, unsigned int>(unsigned int, unsigned long int, _Vector<2, unsigned int>)", [Const], "tensor-cvt-lut-insts">;
-def __builtin_amdgcn_perm_pk16_b8_u4 : AMDGPUBuiltin<"_Vector<4, unsigned int>(unsigned long int, unsigned long int, _Vector<2, unsigned int>)", [Const], "tensor-cvt-lut-insts">;
+def __builtin_amdgcn_perm_pk16_b4_u4 : AMDGPUBuiltin<"_ExtVector<2, unsigned int>(unsigned int, unsigned int, _ExtVector<2, unsigned int>)", [Const], "tensor-cvt-lut-insts">;
+def __builtin_amdgcn_perm_pk16_b6_u4 : AMDGPUBuiltin<"_ExtVector<3, unsigned int>(unsigned int, unsigned long int, _ExtVector<2, unsigned int>)", [Const], "tensor-cvt-lut-insts">;
+def __builtin_amdgcn_perm_pk16_b8_u4 : AMDGPUBuiltin<"_ExtVector<4, unsigned int>(unsigned long int, unsigned long int, _ExtVector<2, unsigned int>)", [Const], "tensor-cvt-lut-insts">;
 
 def __builtin_amdgcn_add_max_i32 : AMDGPUBuiltin<"int(int, int, int, _Constant bool)", [Const], "add-min-max-insts">;
 def __builtin_amdgcn_add_max_u32 : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int, _Constant bool)", [Const], "add-min-max-insts">;
 def __builtin_amdgcn_add_min_i32 : AMDGPUBuiltin<"int(int, int, int, _Constant bool)", [Const], "add-min-max-insts">;
 def __builtin_amdgcn_add_min_u32 : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int, _Constant bool)", [Const], "add-min-max-insts">;
-def __builtin_amdgcn_pk_add_max_i16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>, _Vector<2, short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
-def __builtin_amdgcn_pk_add_max_u16 : AMDGPUBuiltin<"_Vector<2, unsigned short>(_Vector<2, unsigned short>, _Vector<2, unsigned short>, _Vector<2, unsigned short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
-def __builtin_amdgcn_pk_add_min_i16 : AMDGPUBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>, _Vector<2, short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
-def __builtin_amdgcn_pk_add_min_u16 : AMDGPUBuiltin<"_Vector<2, unsigned short>(_Vector<2, unsigned short>, _Vector<2, unsigned short>, _Vector<2, unsigned short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
+def __builtin_amdgcn_pk_add_max_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<2, short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
+def __builtin_amdgcn_pk_add_max_u16 : AMDGPUBuiltin<"_ExtVector<2, unsigned short>(_ExtVector<2, unsigned short>, _ExtVector<2, unsigned short>, _ExtVector<2, unsigned short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
+def __builtin_amdgcn_pk_add_min_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short>, _ExtVector<2, short>, _ExtVector<2, short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
+def __builtin_amdgcn_pk_add_min_u16 : AMDGPUBuiltin<"_ExtVector<2, unsigned short>(_ExtVector<2, unsigned short>, _ExtVector<2, unsigned short>, _ExtVector<2, unsigned short>, _Constant bool)", [Const], "pk-add-min-max-insts">;
 
 // GFX1250 WMMA builtins
-def __builtin_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUBuiltin<"_Vector<8, float>(_Constant bool, _Vector<2, float>, _Constant bool, _Vector<2, float>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUBuiltin<"_Vector<8, float>(_Constant bool, _Vector<16, __bf16>, _Constant bool, _Vector<16, __bf16>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Constant bool, _Vector<16, __bf16>, _Constant bool, _Vector<16, __bf16>, _Constant short, _Vector<8, __bf16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Constant bool, _Vector<16, __bf16>, _Constant bool, _Vector<16, __bf16>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<8, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<8, int>, _Constant bool, _Vector<8, int>, _Vector<8, int>, _Constant bool, _Constant bool, ...)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_Vector<8, float>(_Constant int, _Vector<16, int>, _Constant int, _Vector<16, int>, _Constant short, _Vector<8, float>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<16, int>, _Vector<16, int>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_Vector<8, float>(_Constant int, _Vector<16, int>, _Constant int, _Vector<16, int>, _Constant short, _Vector<8, float>, _Constant int, _Constant int, int, _Constant int, _Constant int, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_Vector<8, float>(_Constant int, _Vector<16, int>, _Constant int, _Vector<16, int>, _Constant short, _Vector<8, float>, _Constant int, _Constant int, long int, _Constant int, _Constant int, long int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUBuiltin<"_Vector<8, float>(_Constant bool, _Vector<16, __fp16>, _Constant bool, _Vector<16, __fp16>, _Constant short, _Vector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Constant bool, _Vector<16, __fp16>, _Constant bool, _Vector<16, __fp16>, _Constant short, _Vector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<16, int>, _Vector<8, int>, _Constant short, _Vector<16, float>)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<16, int>, _Vector<8, int>, _Constant short, _Vector<16, float>, _Constant int, _Constant int, int, _Constant int, _Constant int, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUBuiltin<"_Vector<16, float>(_Vector<16, int>, _Vector<8, int>, _Constant short, _Vector<16, float>, _Constant int, _Constant int, long int, _Constant int, _Constant int, long int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUBuiltin<"_Vector<8, float>(_Constant bool, _Vector<16, __bf16>, _Constant bool, _Vector<32, __bf16>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_bf16_16x16x64_bf16 : AMDGPUBuiltin<"_Vector<8, __bf16>(_Constant bool, _Vector<16, __bf16>, _Constant bool, _Vector<32, __bf16>, _Vector<8, __bf16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16 : AMDGPUBuiltin<"_Vector<8, float>(_Constant bool, _Vector<16, __bf16>, _Constant bool, _Vector<32, __bf16>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<16, int>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<16, int>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<16, int>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_Vector<8, float>(_Vector<8, int>, _Vector<16, int>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<16, int>, _Vector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<16, int>, _Vector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<16, int>, _Vector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Vector<8, int>, _Vector<16, int>, _Vector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUBuiltin<"_Vector<8, int>(_Constant bool, _Vector<8, int>, _Constant bool, _Vector<16, int>, _Vector<8, int>, int, _Constant bool, _Constant bool, ...)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_Vector<8, float>(_Constant bool, _Vector<16, __fp16>, _Constant bool, _Vector<32, __fp16>, _Vector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUBuiltin<"_Vector<8, __fp16>(_Constant bool, _Vector<16, __fp16>, _Constant bool, _Vector<32, __fp16>, _Vector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<2, float>, _Constant bool, _ExtVector<2, float>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __bf16>, _Constant bool, _ExtVector<16, __bf16>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_Constant bool, _ExtVector<16, __bf16>, _Constant bool, _ExtVector<16, __bf16>, _Constant short, _ExtVector<8, __bf16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_Constant bool, _ExtVector<16, __bf16>, _Constant bool, _ExtVector<16, __bf16>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<8, int>, _Constant bool, _ExtVector<8, int>, _ExtVector<8, int>, _Constant bool, _Constant bool, ...)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant int, _ExtVector<16, int>, _Constant int, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant int, _ExtVector<16, int>, _Constant int, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant int, _Constant int, int, _Constant int, _Constant int, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant int, _ExtVector<16, int>, _Constant int, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant int, _Constant int, long int, _Constant int, _Constant int, long int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<16, __fp16>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<16, __fp16>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<16, int>, _ExtVector<8, int>, _Constant short, _ExtVector<16, float>)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<16, int>, _ExtVector<8, int>, _Constant short, _ExtVector<16, float>, _Constant int, _Constant int, int, _Constant int, _Constant int, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<16, int>, _ExtVector<8, int>, _Constant short, _ExtVector<16, float>, _Constant int, _Constant int, long int, _Constant int, _Constant int, long int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __bf16>, _Constant bool, _ExtVector<32, __bf16>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_bf16_16x16x64_bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_Constant bool, _ExtVector<16, __bf16>, _Constant bool, _ExtVector<32, __bf16>, _ExtVector<8, __bf16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __bf16>, _Constant bool, _ExtVector<32, __bf16>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<8, int>, _Constant bool, _ExtVector<16, int>, _ExtVector<8, int>, int, _Constant bool, _Constant bool, ...)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<32, __fp16>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<32, __fp16>, _ExtVector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">;
 
 // GFX12.5 128B cooperative atomics
 def __builtin_amdgcn_cooperative_atomic_load_32x4B : AMDGPUBuiltin<"int(int *, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
 def __builtin_amdgcn_cooperative_atomic_store_32x4B : AMDGPUBuiltin<"void(int *, int, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
 
-def __builtin_amdgcn_cooperative_atomic_load_16x8B : AMDGPUBuiltin<"_Vector<2, int>(_Vector<2, int *>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_cooperative_atomic_store_16x8B : AMDGPUBuiltin<"void(_Vector<2, int *>, _Vector<2, int>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_cooperative_atomic_load_16x8B : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int *>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_cooperative_atomic_store_16x8B : AMDGPUBuiltin<"void(_ExtVector<2, int *>, _ExtVector<2, int>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
 
-def __builtin_amdgcn_cooperative_atomic_load_8x16B : AMDGPUBuiltin<"_Vector<4, int>(_Vector<4, int *>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
-def __builtin_amdgcn_cooperative_atomic_store_8x16B : AMDGPUBuiltin<"void(_Vector<4, int *>, _Vector<4, int>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_cooperative_atomic_load_8x16B : AMDGPUBuiltin<"_ExtVector<4, int>(_ExtVector<4, int *>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
+def __builtin_amdgcn_cooperative_atomic_store_8x16B : AMDGPUBuiltin<"void(_ExtVector<4, int *>, _ExtVector<4, int>, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
 
 //===----------------------------------------------------------------------===//
 // Image builtins
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_image_load_1d_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_1d_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_1darray_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_1darray_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_1d_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_1d_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_1darray_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_1darray_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_load_2d_f32_i32 : AMDGPUBuiltin<"float(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_2d_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_2d_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_2d_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_2d_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_load_2darray_f32_i32 : AMDGPUBuiltin<"float(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_2darray_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_2darray_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_3d_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_3d_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_cube_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_cube_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_1d_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_1d_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_1darray_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_1darray_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_2darray_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_2darray_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_3d_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_3d_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_cube_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_cube_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_1d_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_1d_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_1darray_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_1darray_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_load_mip_2d_f32_i32 : AMDGPUBuiltin<"float(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_2d_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_2d_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_2d_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_2d_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_load_mip_2darray_f32_i32 : AMDGPUBuiltin<"float(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_2darray_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_2darray_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_3d_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_3d_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_cube_v4f32_i32 : AMDGPUBuiltin<"_Vector<4, float>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_load_mip_cube_v4f16_i32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_1d_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_1d_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_1darray_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_1darray_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_2darray_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_2darray_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_3d_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_3d_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_cube_v4f32_i32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_load_mip_cube_v4f16_i32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_1d_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_1d_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_1darray_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_1darray_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_store_2d_f32_i32 : AMDGPUBuiltin<"void(float, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_2d_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_2d_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_2d_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_2d_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_store_2darray_f32_i32 : AMDGPUBuiltin<"void(float, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_2darray_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_2darray_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_3d_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_3d_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_cube_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_cube_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_1d_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_1d_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_1darray_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_1darray_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_2darray_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_2darray_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_3d_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_3d_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_cube_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_cube_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_1d_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_1d_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_1darray_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_1darray_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_store_mip_2d_f32_i32 : AMDGPUBuiltin<"void(float, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_2d_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_2d_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_2d_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_2d_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
 def __builtin_amdgcn_image_store_mip_2darray_f32_i32 : AMDGPUBuiltin<"void(float, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_2darray_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_2darray_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_3d_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_3d_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_cube_v4f32_i32 : AMDGPUBuiltin<"void(_Vector<4, float>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_store_mip_cube_v4f16_i32 : AMDGPUBuiltin<"void(_Vector<4, _Float16>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_1d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_1d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_1darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_1darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_2d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_2d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_2darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_2darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_3d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_3d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_cube_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_cube_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "image-insts">;
-def __builtin_amdgcn_image_sample_lz_1d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_1d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_2d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_2d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_3d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_3d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_cube_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_lz_cube_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_1d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_1d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_1darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_1darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_2d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_2d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_2darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_2darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_3d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_3d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_cube_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_l_cube_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_1d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_1d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_1darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_1darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_2d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_2d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_2darray_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_2darray_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_3d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, float, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_sample_d_3d_v4f16_f32 : AMDGPUBuiltin<"_Vector<4, _Float16>(int, float, float, float, float, float, float, float, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
-def __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32 : AMDGPUBuiltin<"_Vector<4, float>(int, float, float, __amdgpu_texture_t, _Vector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_store_mip_2darray_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_2darray_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_3d_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_3d_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_cube_v4f32_i32 : AMDGPUBuiltin<"void(_ExtVector<4, float>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_store_mip_cube_v4f16_i32 : AMDGPUBuiltin<"void(_ExtVector<4, _Float16>, int, int, int, int, int, __amdgpu_texture_t, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_1d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_1d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_1darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_1darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_2d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_2d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_2darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_2darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_3d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_3d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_cube_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_cube_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "image-insts">;
+def __builtin_amdgcn_image_sample_lz_1d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_1d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_1darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_2d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_2d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_2darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_3d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_3d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_cube_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_lz_cube_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_1d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_1d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_1darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_1darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_2d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_2d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_2darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_2darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_3d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_3d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_cube_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_l_cube_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_1d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_1d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_1darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_1darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_2d_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_2d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_2d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_2darray_f32_f32 : AMDGPUBuiltin<"float(int, float, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_2darray_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_2darray_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_3d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, float, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_sample_d_3d_v4f16_f32 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(int, float, float, float, float, float, float, float, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;
+def __builtin_amdgcn_image_gather4_lz_2d_v4f32_f32 : AMDGPUBuiltin<"_ExtVector<4, float>(int, float, float, __amdgpu_texture_t, _ExtVector<4, int>, bool, int, int)", [Const], "extended-image-insts">;



More information about the cfe-commits mailing list