[llvm] [AMDGPU] Extend promotion of alloca to vectors (PR #127973)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 20 01:01:25 PST 2025


https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/127973

* Add multi dimensional array support
* Make maximum vector size tunable
* Make ratio of VGPRs used for vector promotion tunable

>From 2afc3ac5f2e55f8fa16081562bf09f9fe2b402ca Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 20 Feb 2025 16:42:30 +0900
Subject: [PATCH] [AMDGPU] Extend promotion of alloca to vectors

* Add multi dimensional array support
* Make maximum vector size tunable
* Make ratio of VGPRs used for vector promotion tunable
---
 llvm/docs/AMDGPUUsage.rst                     | 334 +++++++++---------
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 111 ++++--
 .../CodeGen/AMDGPU/amdgpu.private-memory.ll   |   9 +-
 .../test/CodeGen/AMDGPU/array-ptr-calc-i32.ll |   8 +-
 .../AMDGPU/promote-alloca-max-elements.ll     | 236 +++++++++++++
 .../CodeGen/AMDGPU/promote-alloca-memset.ll   |  12 +-
 .../CodeGen/AMDGPU/promote-alloca-multidim.ll | 292 +++++++++++++++
 .../CodeGen/AMDGPU/promote-alloca-no-opts.ll  |   4 +-
 .../AMDGPU/promote-alloca-vgpr-ratio.ll       | 276 +++++++++++++++
 9 files changed, 1072 insertions(+), 210 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index d580be1eb8cfc..734434641b4bd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1546,180 +1546,184 @@ The AMDGPU backend supports the following LLVM IR attributes.
   .. table:: AMDGPU LLVM IR Attributes
      :name: amdgpu-llvm-ir-attributes-table
 
-     ============================================ ==========================================================
-     LLVM Attribute                               Description
-     ============================================ ==========================================================
-     "amdgpu-flat-work-group-size"="min,max"      Specify the minimum and maximum flat work group sizes that
-                                                  will be specified when the kernel is dispatched. Generated
-                                                  by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
-                                                  The IR implied default value is 1,1024. Clang may emit this attribute
-                                                  with more restrictive bounds depending on language defaults.
-                                                  If the actual block or workgroup size exceeds the limit at any point during
-                                                  the execution, the behavior is undefined. For example, even if there is
-                                                  only one active thread but the thread local id exceeds the limit, the
-                                                  behavior is undefined.
-
-     "amdgpu-implicitarg-num-bytes"="n"           Number of kernel argument bytes to add to the kernel
-                                                  argument block size for the implicit arguments. This
-                                                  varies by OS and language (for OpenCL see
-                                                  :ref:`opencl-kernel-implicit-arguments-appended-for-amdhsa-os-table`).
-     "amdgpu-num-sgpr"="n"                        Specifies the number of SGPRs to use. Generated by
-                                                  the ``amdgpu_num_sgpr`` CLANG attribute [CLANG-ATTR]_.
-     "amdgpu-num-vgpr"="n"                        Specifies the number of VGPRs to use. Generated by the
-                                                  ``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_.
-     "amdgpu-waves-per-eu"="m,n"                  Specify the minimum and maximum number of waves per
-                                                  execution unit. Generated by the ``amdgpu_waves_per_eu``
-                                                  CLANG attribute [CLANG-ATTR]_. This is an optimization hint,
-                                                  and the backend may not be able to satisfy the request. If
-                                                  the specified range is incompatible with the function's
-                                                  "amdgpu-flat-work-group-size" value, the implied occupancy
-                                                  bounds by the workgroup size takes precedence.
-
-     "amdgpu-ieee" true/false.                    GFX6-GFX11 Only
-                                                  Specify whether the function expects the IEEE field of the
-                                                  mode register to be set on entry. Overrides the default for
-                                                  the calling convention.
-     "amdgpu-dx10-clamp" true/false.              GFX6-GFX11 Only
-                                                  Specify whether the function expects the DX10_CLAMP field of
-                                                  the mode register to be set on entry. Overrides the default
-                                                  for the calling convention.
-
-     "amdgpu-no-workitem-id-x"                    Indicates the function does not depend on the value of the
-                                                  llvm.amdgcn.workitem.id.x intrinsic. If a function is marked with this
-                                                  attribute, or reached through a call site marked with this attribute, and
-                                                  that intrinsic is called, the behavior of the program is undefined. (Whole-program
-                                                  undefined behavior is used here because, for example, the absence of a required workitem
-                                                  ID in the preloaded register set can mean that all other preloaded registers
-                                                  are earlier than the compilation assumed they would be.) The backend can
-                                                  generally infer this during code generation, so typically there is no
-                                                  benefit to frontends marking functions with this.
-
-     "amdgpu-no-workitem-id-y"                    The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.workitem.id.y intrinsic.
-
-     "amdgpu-no-workitem-id-z"                    The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.workitem.id.z intrinsic.
-
-     "amdgpu-no-workgroup-id-x"                   The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.workgroup.id.x intrinsic.
-
-     "amdgpu-no-workgroup-id-y"                   The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.workgroup.id.y intrinsic.
-
-     "amdgpu-no-workgroup-id-z"                   The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.workgroup.id.z intrinsic.
-
-     "amdgpu-no-dispatch-ptr"                     The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.dispatch.ptr intrinsic.
-
-     "amdgpu-no-implicitarg-ptr"                  The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.implicitarg.ptr intrinsic.
-
-     "amdgpu-no-dispatch-id"                      The same as amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.dispatch.id intrinsic.
-
-     "amdgpu-no-queue-ptr"                        Similar to amdgpu-no-workitem-id-x, except for the
-                                                  llvm.amdgcn.queue.ptr intrinsic. Note that unlike the other ABI hint
-                                                  attributes, the queue pointer may be required in situations where the
-                                                  intrinsic call does not directly appear in the program. Some subtargets
-                                                  require the queue pointer for to handle some addrspacecasts, as well
-                                                  as the llvm.amdgcn.is.shared, llvm.amdgcn.is.private, llvm.trap, and
-                                                  llvm.debug intrinsics.
-
-     "amdgpu-no-hostcall-ptr"                     Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
-                                                  kernel argument that holds the pointer to the hostcall buffer. If this
-                                                  attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
-
-     "amdgpu-no-heap-ptr"                         Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
-                                                  kernel argument that holds the pointer to an initialized memory buffer
-                                                  that conforms to the requirements of the malloc/free device library V1
-                                                  version implementation. If this attribute is absent, then the
-                                                  amdgpu-no-implicitarg-ptr is also removed.
-
-     "amdgpu-no-multigrid-sync-arg"               Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
-                                                  kernel argument that holds the multigrid synchronization pointer. If this
-                                                  attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
-
-     "amdgpu-no-default-queue"                    Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
-                                                  kernel argument that holds the default queue pointer. If this
-                                                  attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
-
-     "amdgpu-no-completion-action"                Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
-                                                  kernel argument that holds the completion action pointer. If this
-                                                  attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
-
-     "amdgpu-lds-size"="min[,max]"                Min is the minimum number of bytes that will be allocated in the Local
-                                                  Data Store at address zero. Variables are allocated within this frame
-                                                  using absolute symbol metadata, primarily by the AMDGPULowerModuleLDS
-                                                  pass. Optional max is the maximum number of bytes that will be allocated.
-                                                  Note that min==max indicates that no further variables can be added to
-                                                  the frame. This is an internal detail of how LDS variables are lowered,
-                                                  language front ends should not set this attribute.
-
-     "amdgpu-gds-size"                            Bytes expected to be allocated at the start of GDS memory at entry.
-
-     "amdgpu-git-ptr-high"                        The hard-wired high half of the address of the global information table
-                                                  for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
-                                                  current hardware only allows a 16 bit value.
-
-     "amdgpu-32bit-address-high-bits"             Assumed high 32-bits for 32-bit address spaces which are really truncated
-                                                  64-bit addresses (i.e., addrspace(6))
-
-     "amdgpu-color-export"                        Indicates shader exports color information if set to 1.
-                                                  Defaults to 1 for :ref:`amdgpu_ps <amdgpu-cc>`, and 0 for other calling
-                                                  conventions. Determines the necessity and type of null exports when a shader
-                                                  terminates early by killing lanes.
-
-     "amdgpu-depth-export"                        Indicates shader exports depth information if set to 1. Determines the
-                                                  necessity and type of null exports when a shader terminates early by killing
-                                                  lanes. A depth-only shader will export to depth channel when no null export
-                                                  target is available (GFX11+).
-
-     "InitialPSInputAddr"                         Set the initial value of the `spi_ps_input_addr` register for
-                                                  :ref:`amdgpu_ps <amdgpu-cc>` shaders. Any bits enabled by this value will
-                                                  be enabled in the final register value.
-
-     "amdgpu-wave-priority-threshold"             VALU instruction count threshold for adjusting wave priority. If exceeded,
-                                                  temporarily raise the wave priority at the start of the shader function
-                                                  until its last VMEM instructions to allow younger waves to issue their VMEM
-                                                  instructions as well.
+     =============================================== ==========================================================
+     LLVM Attribute                                  Description
+     =============================================== ==========================================================
+     "amdgpu-flat-work-group-size"="min,max"         Specify the minimum and maximum flat work group sizes that
+                                                     will be specified when the kernel is dispatched. Generated
+                                                     by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
+                                                     The IR implied default value is 1,1024. Clang may emit this attribute
+                                                     with more restrictive bounds depending on language defaults.
+                                                     If the actual block or workgroup size exceeds the limit at any point during
+                                                     the execution, the behavior is undefined. For example, even if there is
+                                                     only one active thread but the thread local id exceeds the limit, the
+                                                     behavior is undefined.
+
+     "amdgpu-implicitarg-num-bytes"="n"              Number of kernel argument bytes to add to the kernel
+                                                     argument block size for the implicit arguments. This
+                                                     varies by OS and language (for OpenCL see
+                                                     :ref:`opencl-kernel-implicit-arguments-appended-for-amdhsa-os-table`).
+     "amdgpu-num-sgpr"="n"                           Specifies the number of SGPRs to use. Generated by
+                                                     the ``amdgpu_num_sgpr`` CLANG attribute [CLANG-ATTR]_.
+     "amdgpu-num-vgpr"="n"                           Specifies the number of VGPRs to use. Generated by the
+                                                     ``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_.
+     "amdgpu-waves-per-eu"="m,n"                     Specify the minimum and maximum number of waves per
+                                                     execution unit. Generated by the ``amdgpu_waves_per_eu``
+                                                     CLANG attribute [CLANG-ATTR]_. This is an optimization hint,
+                                                     and the backend may not be able to satisfy the request. If
+                                                     the specified range is incompatible with the function's
+                                                     "amdgpu-flat-work-group-size" value, the implied occupancy
+                                                     bounds by the workgroup size takes precedence.
+
+     "amdgpu-ieee" true/false.                       GFX6-GFX11 Only
+                                                     Specify whether the function expects the IEEE field of the
+                                                     mode register to be set on entry. Overrides the default for
+                                                     the calling convention.
+     "amdgpu-dx10-clamp" true/false.                 GFX6-GFX11 Only
+                                                     Specify whether the function expects the DX10_CLAMP field of
+                                                     the mode register to be set on entry. Overrides the default
+                                                     for the calling convention.
+
+     "amdgpu-no-workitem-id-x"                       Indicates the function does not depend on the value of the
+                                                     llvm.amdgcn.workitem.id.x intrinsic. If a function is marked with this
+                                                     attribute, or reached through a call site marked with this attribute, and
+                                                     that intrinsic is called, the behavior of the program is undefined. (Whole-program
+                                                     undefined behavior is used here because, for example, the absence of a required workitem
+                                                     ID in the preloaded register set can mean that all other preloaded registers
+                                                     are earlier than the compilation assumed they would be.) The backend can
+                                                     generally infer this during code generation, so typically there is no
+                                                     benefit to frontends marking functions with this.
+
+     "amdgpu-no-workitem-id-y"                       The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.workitem.id.y intrinsic.
+
+     "amdgpu-no-workitem-id-z"                       The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.workitem.id.z intrinsic.
+
+     "amdgpu-no-workgroup-id-x"                      The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.workgroup.id.x intrinsic.
+
+     "amdgpu-no-workgroup-id-y"                      The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.workgroup.id.y intrinsic.
+
+     "amdgpu-no-workgroup-id-z"                      The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.workgroup.id.z intrinsic.
+
+     "amdgpu-no-dispatch-ptr"                        The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.dispatch.ptr intrinsic.
+
+     "amdgpu-no-implicitarg-ptr"                     The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.implicitarg.ptr intrinsic.
+
+     "amdgpu-no-dispatch-id"                         The same as amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.dispatch.id intrinsic.
+
+     "amdgpu-no-queue-ptr"                           Similar to amdgpu-no-workitem-id-x, except for the
+                                                     llvm.amdgcn.queue.ptr intrinsic. Note that unlike the other ABI hint
+                                                     attributes, the queue pointer may be required in situations where the
+                                                     intrinsic call does not directly appear in the program. Some subtargets
+                                                     require the queue pointer to handle some addrspacecasts, as well
+                                                     as the llvm.amdgcn.is.shared, llvm.amdgcn.is.private, llvm.trap, and
+                                                     llvm.debug intrinsics.
+
+     "amdgpu-no-hostcall-ptr"                        Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
+                                                     kernel argument that holds the pointer to the hostcall buffer. If this
+                                                     attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
+
+     "amdgpu-no-heap-ptr"                            Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
+                                                     kernel argument that holds the pointer to an initialized memory buffer
+                                                     that conforms to the requirements of the malloc/free device library V1
+                                                     version implementation. If this attribute is absent, then the
+                                                     amdgpu-no-implicitarg-ptr is also removed.
+
+     "amdgpu-no-multigrid-sync-arg"                  Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
+                                                     kernel argument that holds the multigrid synchronization pointer. If this
+                                                     attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
+
+     "amdgpu-no-default-queue"                       Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
+                                                     kernel argument that holds the default queue pointer. If this
+                                                     attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
+
+     "amdgpu-no-completion-action"                   Similar to amdgpu-no-implicitarg-ptr, except specific to the implicit
+                                                     kernel argument that holds the completion action pointer. If this
+                                                     attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
+
+     "amdgpu-lds-size"="min[,max]"                   Min is the minimum number of bytes that will be allocated in the Local
+                                                     Data Store at address zero. Variables are allocated within this frame
+                                                     using absolute symbol metadata, primarily by the AMDGPULowerModuleLDS
+                                                     pass. Optional max is the maximum number of bytes that will be allocated.
+                                                     Note that min==max indicates that no further variables can be added to
+                                                     the frame. This is an internal detail of how LDS variables are lowered,
+                                                     language front ends should not set this attribute.
+
+     "amdgpu-gds-size"                               Bytes expected to be allocated at the start of GDS memory at entry.
+
+     "amdgpu-git-ptr-high"                           The hard-wired high half of the address of the global information table
+                                                     for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
+                                                     current hardware only allows a 16 bit value.
+
+     "amdgpu-32bit-address-high-bits"                Assumed high 32-bits for 32-bit address spaces which are really truncated
+                                                     64-bit addresses (i.e., addrspace(6))
+
+     "amdgpu-color-export"                           Indicates shader exports color information if set to 1.
+                                                     Defaults to 1 for :ref:`amdgpu_ps <amdgpu-cc>`, and 0 for other calling
+                                                     conventions. Determines the necessity and type of null exports when a shader
+                                                     terminates early by killing lanes.
+
+     "amdgpu-depth-export"                           Indicates shader exports depth information if set to 1. Determines the
+                                                     necessity and type of null exports when a shader terminates early by killing
+                                                     lanes. A depth-only shader will export to depth channel when no null export
+                                                     target is available (GFX11+).
 
-     "amdgpu-memory-bound"                        Set internally by backend
+     "InitialPSInputAddr"                            Set the initial value of the `spi_ps_input_addr` register for
+                                                     :ref:`amdgpu_ps <amdgpu-cc>` shaders. Any bits enabled by this value will
+                                                     be enabled in the final register value.
 
-     "amdgpu-wave-limiter"                        Set internally by backend
+     "amdgpu-wave-priority-threshold"                VALU instruction count threshold for adjusting wave priority. If exceeded,
+                                                     temporarily raise the wave priority at the start of the shader function
+                                                     until its last VMEM instructions to allow younger waves to issue their VMEM
+                                                     instructions as well.
 
-     "amdgpu-unroll-threshold"                    Set base cost threshold preference for loop unrolling within this function,
-                                                  default is 300. Actual threshold may be varied by per-loop metadata or
-                                                  reduced by heuristics.
+     "amdgpu-memory-bound"                           Set internally by backend
 
-     "amdgpu-max-num-workgroups"="x,y,z"          Specify the maximum number of work groups for the kernel dispatch in the
-                                                  X, Y, and Z dimensions. Each number must be >= 1. Generated by the
-                                                  ``amdgpu_max_num_work_groups`` CLANG attribute [CLANG-ATTR]_. Clang only
-                                                  emits this attribute when all the three numbers are >= 1.
+     "amdgpu-wave-limiter"                           Set internally by backend
 
-     "amdgpu-no-agpr"                             Indicates the function will not require allocating AGPRs. This is only
-                                                  relevant on subtargets with AGPRs. The behavior is undefined if a
-                                                  function which requires AGPRs is reached through any function marked
-                                                  with this attribute.
+     "amdgpu-unroll-threshold"                       Set base cost threshold preference for loop unrolling within this function,
+                                                     default is 300. Actual threshold may be varied by per-loop metadata or
+                                                     reduced by heuristics.
 
-     "amdgpu-hidden-argument"                     This attribute is used internally by the backend to mark function arguments
-                                                  as hidden. Hidden arguments are managed by the compiler and are not part of
-                                                  the explicit arguments supplied by the user.
+     "amdgpu-max-num-workgroups"="x,y,z"             Specify the maximum number of work groups for the kernel dispatch in the
+                                                     X, Y, and Z dimensions. Each number must be >= 1. Generated by the
+                                                     ``amdgpu_max_num_work_groups`` CLANG attribute [CLANG-ATTR]_. Clang only
+                                                     emits this attribute when all the three numbers are >= 1.
 
-     "amdgpu-sgpr-hazard-wait"                    Disabled SGPR hazard wait insertion if set to 0.
-                                                  Exists for testing performance impact of SGPR hazard waits only.
+     "amdgpu-no-agpr"                                Indicates the function will not require allocating AGPRs. This is only
+                                                     relevant on subtargets with AGPRs. The behavior is undefined if a
+                                                     function which requires AGPRs is reached through any function marked
+                                                     with this attribute.
 
-     "amdgpu-sgpr-hazard-boundary-cull"           Enable insertion of SGPR hazard cull sequences at function call boundaries.
-                                                  Cull sequence reduces future hazard waits, but has a performance cost.
+     "amdgpu-hidden-argument"                        This attribute is used internally by the backend to mark function arguments
+                                                     as hidden. Hidden arguments are managed by the compiler and are not part of
+                                                     the explicit arguments supplied by the user.
 
-     "amdgpu-sgpr-hazard-mem-wait-cull"           Enable insertion of SGPR hazard cull sequences before memory waits.
-                                                  Cull sequence reduces future hazard waits, but has a performance cost.
-                                                  Attempt to amortize cost by overlapping with memory accesses.
-
-     "amdgpu-sgpr-hazard-mem-wait-cull-threshold" Sets the number of active SGPR hazards that must be present before
-                                                  inserting a cull sequence at a memory wait.
-
-     ============================================ ==========================================================
+     "amdgpu-sgpr-hazard-wait"                       Disables SGPR hazard wait insertion if set to 0.
+                                                     Exists for testing performance impact of SGPR hazard waits only.
+
+     "amdgpu-sgpr-hazard-boundary-cull"              Enable insertion of SGPR hazard cull sequences at function call boundaries.
+                                                     Cull sequence reduces future hazard waits, but has a performance cost.
+
+     "amdgpu-sgpr-hazard-mem-wait-cull"              Enable insertion of SGPR hazard cull sequences before memory waits.
+                                                     Cull sequence reduces future hazard waits, but has a performance cost.
+                                                     Attempt to amortize cost by overlapping with memory accesses.
+
+     "amdgpu-sgpr-hazard-mem-wait-cull-threshold"    Sets the number of active SGPR hazards that must be present before
+                                                     inserting a cull sequence at a memory wait.
+
+     "amdgpu-promote-alloca-to-vector-max-elements"  Maximum vector size (in elements) to create when promoting alloca.
+
+     "amdgpu-promote-alloca-to-vector-vgpr-ratio"    Ratio of VGPRs to budget for promoting alloca to vectors.
+
+     =============================================== ==========================================================
 
 Calling Conventions
 ===================
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 69ddb384e1a40..aec00a28c1619 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -66,6 +66,18 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
+static cl::opt<unsigned> PromoteAllocaToVectorMaxElements(
+    "amdgpu-promote-alloca-to-vector-max-elements",
+    cl::desc("Maximum vector size (in elements) to use when promoting alloca"),
+    cl::init(16));
+
+// Use up to 1/4 of available register budget for vectorization.
+// FIXME: Increase the limit for whole function budgets? Perhaps x2?
+static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
+    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
+    cl::init(4));
+
 static cl::opt<unsigned>
     LoopUserWeight("promote-alloca-vector-loop-user-weight",
                    cl::desc("The bonus weight of users of allocas within loop "
@@ -310,12 +322,17 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
 
   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
 
-  // Use up to 1/4 of available register budget for vectorization.
-  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+  const unsigned VGPRRatio =
+      PromoteAllocaToVectorVGPRRatio.getNumOccurrences()
+          ? PromoteAllocaToVectorVGPRRatio
+          : F.getFnAttributeAsParsedInteger(
+                "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+                PromoteAllocaToVectorVGPRRatio);
+
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      4;
+      VGPRRatio;
 
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
@@ -398,7 +415,8 @@ calculateVectorIndex(Value *Ptr,
 }
 
 static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL) {
+                               Type *VecElemTy, const DataLayout &DL,
+                               SmallVector<Instruction *> &NewInsts) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
@@ -412,22 +430,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (VarOffsets.size() > 1)
     return nullptr;
 
-  if (VarOffsets.size() == 1) {
-    // Only handle cases where we don't need to insert extra arithmetic
-    // instructions.
-    const auto &VarOffset = VarOffsets.front();
-    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
-      return nullptr;
-    return VarOffset.first;
-  }
-
   APInt Quot;
   uint64_t Rem;
   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
   if (Rem != 0)
     return nullptr;
 
-  return ConstantInt::get(GEP->getContext(), Quot);
+  ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), Quot);
+  if (VarOffsets.size() == 0)
+    return ConstIndex;
+
+  IRBuilder<> Builder(GEP);
+
+  const auto &VarOffset = VarOffsets.front();
+  APInt::udivrem(VarOffset.second, VecElemSize, Quot, Rem);
+  if (Rem != 0 || Quot.isZero())
+    return nullptr;
+
+  Value *Offset = VarOffset.first;
+  if (!Quot.isOne()) {
+    ConstantInt *ConstMul = ConstantInt::get(GEP->getContext(), Quot);
+    Offset = Builder.CreateMul(Offset, ConstMul);
+    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
+      NewInsts.push_back(NewInst);
+  }
+  if (ConstOffset.isZero())
+    return Offset;
+
+  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
+    NewInsts.push_back(NewInst);
+  return IndexAdd;
 }
 
 /// Promotes a single user of the alloca to a vector form.
@@ -735,23 +768,48 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
-        ArrayTy->getNumElements() > 0)
-      VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
-                                      ArrayTy->getNumElements());
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
+
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+      // Expand vector if required to match padding of inner type,
+      // i.e. odd size subvectors.
+      // Storage size of new vector must match that of alloca for correct
+      // behaviour of byte offsets and GEP computation.
+      if (NumElems * ElementSize != AllocaSize)
+        NumElems = AllocaSize / ElementSize;
+      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+    }
   }
 
-  // FIXME: There is no reason why we can't support larger arrays, we
-  // are just being conservative for now.
-  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
-  // equivalent. Potentially these could also be promoted but we don't currently
-  // handle this case
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
 
-  if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
+  const unsigned MaxElements =
+      PromoteAllocaToVectorMaxElements.getNumOccurrences()
+          ? PromoteAllocaToVectorMaxElements
+          : Alloca.getParent()->getParent()->getFnAttributeAsParsedInteger(
+                "amdgpu-promote-alloca-to-vector-max-elements",
+                PromoteAllocaToVectorMaxElements);
+
+  if (VectorTy->getNumElements() > MaxElements ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
     return false;
@@ -761,11 +819,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
+  SmallVector<Instruction *> NewGEPInsts;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
     LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                       << "    " << *Inst << "\n");
+    for (auto *Inst : reverse(NewGEPInsts))
+      Inst->eraseFromParent();
     return false;
   };
 
@@ -815,7 +876,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 82832277b1aba..a663d451cad35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}no_overlap:
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
+; SI-ALLOCA: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
 define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1, addrspace(5)
@@ -281,6 +281,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}char_array_array:
 define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]], addrspace(5)
@@ -294,6 +295,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}i32_array_array:
 define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]], addrspace(5)
@@ -306,6 +308,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}i64_array_array:
 define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]], addrspace(5)
@@ -319,7 +322,7 @@ entry:
 }
 
 %struct.pair32 = type { i32, i32 }
-
+; FUNC-LABEL: {{^}}struct_array_array:
 define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
@@ -333,6 +336,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}struct_pair32_array:
 define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32], addrspace(5)
@@ -346,6 +350,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}select_private:
 define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index ac8782e1e542f..e1bbc243344b0 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -22,13 +22,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-ALLOCA: s_barrier
 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ;
-; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
-; alloca to a vector.  It currently fails because it does not know how
-; to interpret:
-; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
-
-; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
-; SI-PROMOTE: ds_write_b32 [[PTRREG]]
+; SI-PROMOTE: LDSByteSize: 0
 define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16, addrspace(5)
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
new file mode 100644
index 0000000000000..7f1cf728bce7e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+
+define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX24-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
+; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX32-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; BASE-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; BASE-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; BASE-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; BASE-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; BASE-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; BASE-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; BASE-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; BASE-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; BASE-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX24-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX24-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX24-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX24-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX24-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX24-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX24-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX24-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
+; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements(
+; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
+; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX32-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [32 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+  %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX24-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX24-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX24-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX24-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX24-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX24-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX24-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX24-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
+; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
+; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX32-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [32 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+  %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="24" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="32" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index 74651e10c7809..bb51c1d27b336 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -110,10 +110,7 @@ define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) {
 
 define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) {
 ; CHECK-LABEL: @memset_array_of_array_ptr_alloca(
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
@@ -125,14 +122,11 @@ define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) {
 
 define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) {
 ; CHECK-LABEL: @memset_array_of_vec_ptr_alloca(
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5)
-  call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 64, i1 false)
   %load = load i64, ptr addrspace(5) %alloca
   store i64 %load, ptr %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
new file mode 100644
index 0000000000000..a0af3f7442114
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s
+
+define amdgpu_kernel void @i32_2d_load_store(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i32_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    store i32 3, ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i32]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i32 0, ptr addrspace(5) %gep.00
+  store i32 1, ptr addrspace(5) %gep.01
+  store i32 2, ptr addrspace(5) %gep.02
+  store i32 3, ptr addrspace(5) %gep.10
+  store i32 4, ptr addrspace(5) %gep.11
+  store i32 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    store i64 3, ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i64 0, ptr addrspace(5) %gep.00
+  store i64 1, ptr addrspace(5) %gep.01
+  store i64 2, ptr addrspace(5) %gep.02
+  store i64 3, ptr addrspace(5) %gep.10
+  store i64 4, ptr addrspace(5) %gep.11
+  store i64 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load i64, ptr addrspace(5) %gep
+  store i64 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_2d_alloca_store_partial(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
+; CHECK-LABEL: define amdgpu_kernel void @i32_2d_alloca_store_partial(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[DUMMY_LDS:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SEL2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [4 x i32]], align 4, addrspace(5)
+  %gep = getelementptr inbounds <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca, align 4
+  %load = load float, ptr addrspace(5) %gep, align 4
+  store float %load, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @i64_2d_load_store_cast(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_cast(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[SEL2]]
+; CHECK-NEXT:    store i64 [[TMP1]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i64 0, ptr addrspace(5) %gep.00
+  store i64 1, ptr addrspace(5) %gep.01
+  store i64 2, ptr addrspace(5) %gep.02
+  store i64 3, ptr addrspace(5) %gep.10
+  store i64 4, ptr addrspace(5) %gep.11
+  store i64 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [6 x i64], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i64, ptr addrspace(5) %gep
+  store i64 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_1(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_1(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> <i64 3, i64 4, i64 5>, i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_2(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_2(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> <i64 3, i64 4, i64 5>, i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_3(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_4(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_4(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_3d_load_store(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i32_3d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>, i32 [[SEL2]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [2 x [3 x i32]]], align 16, addrspace(5)
+  %gep.000 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 0
+  %gep.001 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 1
+  %gep.002 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 2
+  %gep.010 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 0
+  %gep.011 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 1
+  %gep.012 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 2
+  %gep.100 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 0
+  %gep.101 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 1
+  %gep.102 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 2
+  %gep.110 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 0
+  %gep.111 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 1
+  %gep.112 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 2
+  store i32 0, ptr addrspace(5) %gep.000
+  store i32 1, ptr addrspace(5) %gep.001
+  store i32 2, ptr addrspace(5) %gep.002
+  store i32 3, ptr addrspace(5) %gep.010
+  store i32 4, ptr addrspace(5) %gep.011
+  store i32 5, ptr addrspace(5) %gep.012
+  store i32 6, ptr addrspace(5) %gep.100
+  store i32 7, ptr addrspace(5) %gep.101
+  store i32 8, ptr addrspace(5) %gep.102
+  store i32 9, ptr addrspace(5) %gep.110
+  store i32 10, ptr addrspace(5) %gep.111
+  store i32 11, ptr addrspace(5) %gep.112
+  %gep = getelementptr inbounds [12 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 28b923243b6db..ac28d9c20e2fa 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
-; RUN: llc -O1 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector=1 -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector=1 -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
 ; NOOPTS: .amdhsa_group_segment_fixed_size 0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
new file mode 100644
index 0000000000000..ee30f761144ce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-vgpr-ratio=2 < %s | FileCheck --check-prefix=BASE %s --check-prefix=RATIO2
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-vgpr-ratio=8 < %s | FileCheck --check-prefix=BASE %s --check-prefix=RATIO8
+
+define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_24_elements(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_24_elements(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_16_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_16_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_16_elements(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_16_elements(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [16 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 64, i1 false)
+  %gep.0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 15
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+; @i32_16_elements_attrib: a [16 x i32] private (addrspace(5)) alloca that is
+; memset to zero, has constant stores at indices 0 and 15, and is then read
+; at a dynamic index (%sel2) derived from the workitem IDs.
+; Attribute group #2 (declared at the bottom of this file) sets
+; "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" on this kernel.
+; What the autogenerated checks show per run prefix:
+;   DEFAULT, RATIO8 - the alloca is NOT promoted: memset, stores, and the
+;                     dynamic load all survive.
+;   RATIO2          - the alloca IS promoted to a <16 x i32> vector and the
+;                     whole access pattern constant-folds to one
+;                     extractelement from <42, 0, ..., 0, 43>.
+; NOTE(review): the RUN lines selecting these prefixes are outside this
+; excerpt — presumably they override the vgpr-ratio on the command line;
+; confirm against the full test file.
+define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_16_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_16_elements_attrib(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_16_elements_attrib(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1 ; dynamic index, value in {0, 1, 2}
+  %alloca = alloca [16 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 64, i1 false) ; zero all 64 bytes
+  %gep.0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 15
+  store i32 42, ptr addrspace(5) %gep.0 ; first element
+  store i32 43, ptr addrspace(5) %gep.1 ; last element
+  %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep ; dynamic read; promotion must handle the variable index
+  store i32 %load, ptr %out
+  ret void
+}
+
+; Intrinsics used by the kernels above.
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
+
+; Attribute groups forming the test matrix for the new tunables added by this
+; patch (max promotable elements, and the VGPR ratio spent on promotion):
+; #0: raise the promotion element cap to 24; fix occupancy at 4 waves/EU.
+attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" }
+; #1: same as #0, plus an explicit VGPR ratio of 2 for alloca promotion.
+attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
+; #2: same as #0, plus an explicit VGPR ratio of 8 for alloca promotion.
+attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; BASE: {{.*}}



More information about the llvm-commits mailing list