[llvm] 1f9dcd2 - [AMDGPU] Update generated checks. NFC.

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 18 02:49:10 PDT 2021


Author: Jay Foad
Date: 2021-06-18T10:49:02+01:00
New Revision: 1f9dcd2b7384780cb3e4ae34e58799ef2816192b

URL: https://github.com/llvm/llvm-project/commit/1f9dcd2b7384780cb3e4ae34e58799ef2816192b
DIFF: https://github.com/llvm/llvm-project/commit/1f9dcd2b7384780cb3e4ae34e58799ef2816192b.diff

LOG: [AMDGPU] Update generated checks. NFC.
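
Note on the change below: the regenerated CHECK lines differ only in that each
volatile store now carries an explicit alignment. The input IR in the tests is
unchanged; when a store has no align attribute, LLVM assigns it the ABI
alignment of the stored type, and the IR printer emits that alignment
explicitly, so rerunning the check generator picks it up. That is why this is
NFC: i3 (stored as a single byte) simply prints as "align 1", and i16 as
"align 2". A minimal illustration of the equivalence, using hypothetical IR
(the value name %v is a placeholder, not taken from the tests):

  ; Both stores parse identically: the first is assigned the i16 ABI
  ; alignment of 2, and the printer emits the second, explicit form,
  ; which the regenerated CHECK lines now match.
  store volatile i16 %v, i16 addrspace(1)* undef
  store volatile i16 %v, i16 addrspace(1)* undef, align 2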

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 689ccaa1c4610..01c2da8f399c5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -5,7 +5,7 @@
 define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_i3(
 ; SI-NEXT:    [[R:%.*]] = add i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_i3(
@@ -13,7 +13,7 @@ define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add i3 %a, %b
@@ -24,7 +24,7 @@ define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = add nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_i3(
@@ -32,7 +32,7 @@ define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i3 %a, %b
@@ -43,7 +43,7 @@ define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = add nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_i3(
@@ -51,7 +51,7 @@ define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add nuw i3 %a, %b
@@ -62,7 +62,7 @@ define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_i3(
@@ -70,7 +70,7 @@ define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw i3 %a, %b
@@ -81,7 +81,7 @@ define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_i3(
 ; SI-NEXT:    [[R:%.*]] = sub i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_i3(
@@ -89,7 +89,7 @@ define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub i3 %a, %b
@@ -100,7 +100,7 @@ define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = sub nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_i3(
@@ -108,7 +108,7 @@ define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw i3 %a, %b
@@ -119,7 +119,7 @@ define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = sub nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_i3(
@@ -127,7 +127,7 @@ define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw i3 %a, %b
@@ -138,7 +138,7 @@ define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_i3(
@@ -146,7 +146,7 @@ define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw i3 %a, %b
@@ -157,7 +157,7 @@ define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_i3(
 ; SI-NEXT:    [[R:%.*]] = mul i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_i3(
@@ -165,7 +165,7 @@ define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul i3 %a, %b
@@ -176,7 +176,7 @@ define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = mul nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_i3(
@@ -184,7 +184,7 @@ define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw i3 %a, %b
@@ -195,7 +195,7 @@ define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = mul nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_i3(
@@ -203,7 +203,7 @@ define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw i3 %a, %b
@@ -214,7 +214,7 @@ define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_i3(
@@ -222,7 +222,7 @@ define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw i3 %a, %b
@@ -233,7 +233,7 @@ define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_i3(
 ; SI-NEXT:    [[R:%.*]] = shl i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_i3(
@@ -241,7 +241,7 @@ define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl i3 %a, %b
@@ -252,7 +252,7 @@ define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = shl nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_i3(
@@ -260,7 +260,7 @@ define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw i3 %a, %b
@@ -271,7 +271,7 @@ define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = shl nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_i3(
@@ -279,7 +279,7 @@ define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw i3 %a, %b
@@ -290,7 +290,7 @@ define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_i3(
@@ -298,7 +298,7 @@ define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw i3 %a, %b
@@ -309,7 +309,7 @@ define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @lshr_i3(
 ; SI-NEXT:    [[R:%.*]] = lshr i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_i3(
@@ -317,7 +317,7 @@ define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = lshr i3 %a, %b
@@ -328,7 +328,7 @@ define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @lshr_exact_i3(
 ; SI-NEXT:    [[R:%.*]] = lshr exact i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_i3(
@@ -336,7 +336,7 @@ define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact i3 %a, %b
@@ -347,7 +347,7 @@ define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @ashr_i3(
 ; SI-NEXT:    [[R:%.*]] = ashr i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_i3(
@@ -355,7 +355,7 @@ define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = ashr i3 %a, %b
@@ -366,7 +366,7 @@ define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @ashr_exact_i3(
 ; SI-NEXT:    [[R:%.*]] = ashr exact i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_i3(
@@ -374,7 +374,7 @@ define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact i3 %a, %b
@@ -385,7 +385,7 @@ define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @and_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @and_i3(
 ; SI-NEXT:    [[R:%.*]] = and i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_i3(
@@ -393,7 +393,7 @@ define amdgpu_kernel void @and_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = and i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = and i3 %a, %b
@@ -404,7 +404,7 @@ define amdgpu_kernel void @and_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @or_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @or_i3(
 ; SI-NEXT:    [[R:%.*]] = or i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_i3(
@@ -412,7 +412,7 @@ define amdgpu_kernel void @or_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = or i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = or i3 %a, %b
@@ -423,7 +423,7 @@ define amdgpu_kernel void @or_i3(i3 %a, i3 %b) {
 define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @xor_i3(
 ; SI-NEXT:    [[R:%.*]] = xor i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_i3(
@@ -431,7 +431,7 @@ define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = xor i3 %a, %b
@@ -443,7 +443,7 @@ define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_eq_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_i3(
@@ -454,7 +454,7 @@ define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq i3 %a, %b
@@ -467,7 +467,7 @@ define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ne_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_i3(
@@ -478,7 +478,7 @@ define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne i3 %a, %b
@@ -491,7 +491,7 @@ define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ugt_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_i3(
@@ -502,7 +502,7 @@ define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt i3 %a, %b
@@ -515,7 +515,7 @@ define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_uge_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_i3(
@@ -526,7 +526,7 @@ define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge i3 %a, %b
@@ -539,7 +539,7 @@ define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ult_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_i3(
@@ -550,7 +550,7 @@ define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult i3 %a, %b
@@ -563,7 +563,7 @@ define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ule_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_i3(
@@ -574,7 +574,7 @@ define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule i3 %a, %b
@@ -587,7 +587,7 @@ define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_sgt_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_i3(
@@ -598,7 +598,7 @@ define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt i3 %a, %b
@@ -611,7 +611,7 @@ define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_sge_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_i3(
@@ -622,7 +622,7 @@ define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge i3 %a, %b
@@ -635,7 +635,7 @@ define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_slt_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_i3(
@@ -646,7 +646,7 @@ define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt i3 %a, %b
@@ -659,7 +659,7 @@ define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_sle_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_i3(
@@ -670,7 +670,7 @@ define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle i3 %a, %b
@@ -683,7 +683,7 @@ declare i3 @llvm.bitreverse.i3(i3)
 define amdgpu_kernel void @bitreverse_i3(i3 %a) {
 ; SI-LABEL: @bitreverse_i3(
 ; SI-NEXT:    [[BREV:%.*]] = call i3 @llvm.bitreverse.i3(i3 [[A:%.*]])
-; SI-NEXT:    store volatile i3 [[BREV]], i3 addrspace(1)* undef
+; SI-NEXT:    store volatile i3 [[BREV]], i3 addrspace(1)* undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_i3(
@@ -691,7 +691,7 @@ define amdgpu_kernel void @bitreverse_i3(i3 %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 29
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef
+; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
 ; VI-NEXT:    ret void
 ;
   %brev = call i3 @llvm.bitreverse.i3(i3 %a)
@@ -702,7 +702,7 @@ define amdgpu_kernel void @bitreverse_i3(i3 %a) {
 define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_i16(
 ; SI-NEXT:    [[R:%.*]] = add i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_i16(
@@ -710,7 +710,7 @@ define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add i16 %a, %b
@@ -721,11 +721,11 @@ define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @constant_add_i16() {
 ; SI-LABEL: @constant_add_i16(
 ; SI-NEXT:    [[R:%.*]] = add i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_add_i16(
-; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add i16 1, 2
@@ -736,11 +736,11 @@ define amdgpu_kernel void @constant_add_i16() {
 define amdgpu_kernel void @constant_add_nsw_i16() {
 ; SI-LABEL: @constant_add_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nsw i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_add_nsw_i16(
-; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i16 1, 2
@@ -751,11 +751,11 @@ define amdgpu_kernel void @constant_add_nsw_i16() {
 define amdgpu_kernel void @constant_add_nuw_i16() {
 ; SI-LABEL: @constant_add_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nsw i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_add_nuw_i16(
-; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i16 1, 2
@@ -766,7 +766,7 @@ define amdgpu_kernel void @constant_add_nuw_i16() {
 define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_i16(
@@ -774,7 +774,7 @@ define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i16 %a, %b
@@ -785,7 +785,7 @@ define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_i16(
@@ -793,7 +793,7 @@ define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nuw i16 %a, %b
@@ -804,7 +804,7 @@ define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_i16(
@@ -812,7 +812,7 @@ define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw i16 %a, %b
@@ -823,7 +823,7 @@ define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_i16(
 ; SI-NEXT:    [[R:%.*]] = sub i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_i16(
@@ -831,7 +831,7 @@ define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub i16 %a, %b
@@ -842,7 +842,7 @@ define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = sub nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_i16(
@@ -850,7 +850,7 @@ define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw i16 %a, %b
@@ -861,7 +861,7 @@ define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_i16(
@@ -869,7 +869,7 @@ define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw i16 %a, %b
@@ -880,7 +880,7 @@ define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_i16(
@@ -888,7 +888,7 @@ define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw i16 %a, %b
@@ -899,7 +899,7 @@ define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_i16(
 ; SI-NEXT:    [[R:%.*]] = mul i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_i16(
@@ -907,7 +907,7 @@ define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul i16 %a, %b
@@ -918,7 +918,7 @@ define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = mul nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_i16(
@@ -926,7 +926,7 @@ define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw i16 %a, %b
@@ -937,7 +937,7 @@ define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_i16(
@@ -945,7 +945,7 @@ define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw i16 %a, %b
@@ -956,7 +956,7 @@ define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_i16(
@@ -964,7 +964,7 @@ define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw i16 %a, %b
@@ -975,7 +975,7 @@ define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_i16(
 ; SI-NEXT:    [[R:%.*]] = shl i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_i16(
@@ -983,7 +983,7 @@ define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl i16 %a, %b
@@ -994,7 +994,7 @@ define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = shl nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_i16(
@@ -1002,7 +1002,7 @@ define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw i16 %a, %b
@@ -1013,7 +1013,7 @@ define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_i16(
@@ -1021,7 +1021,7 @@ define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw i16 %a, %b
@@ -1032,7 +1032,7 @@ define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_i16(
@@ -1040,7 +1040,7 @@ define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw i16 %a, %b
@@ -1051,7 +1051,7 @@ define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @lshr_i16(
 ; SI-NEXT:    [[R:%.*]] = lshr i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_i16(
@@ -1059,7 +1059,7 @@ define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = lshr i16 %a, %b
@@ -1070,7 +1070,7 @@ define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @lshr_exact_i16(
 ; SI-NEXT:    [[R:%.*]] = lshr exact i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_i16(
@@ -1078,7 +1078,7 @@ define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact i16 %a, %b
@@ -1089,7 +1089,7 @@ define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @ashr_i16(
 ; SI-NEXT:    [[R:%.*]] = ashr i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_i16(
@@ -1097,7 +1097,7 @@ define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = ashr i16 %a, %b
@@ -1108,7 +1108,7 @@ define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @ashr_exact_i16(
 ; SI-NEXT:    [[R:%.*]] = ashr exact i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_i16(
@@ -1116,7 +1116,7 @@ define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact i16 %a, %b
@@ -1127,11 +1127,11 @@ define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @constant_lshr_exact_i16(
 ; SI-NEXT:    [[R:%.*]] = lshr exact i16 4, 1
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_lshr_exact_i16(
-; VI-NEXT:    store volatile i16 2, i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 2, i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact i16 4, 1
@@ -1142,7 +1142,7 @@ define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @and_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @and_i16(
 ; SI-NEXT:    [[R:%.*]] = and i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_i16(
@@ -1150,7 +1150,7 @@ define amdgpu_kernel void @and_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = and i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = and i16 %a, %b
@@ -1161,7 +1161,7 @@ define amdgpu_kernel void @and_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @or_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @or_i16(
 ; SI-NEXT:    [[R:%.*]] = or i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_i16(
@@ -1169,7 +1169,7 @@ define amdgpu_kernel void @or_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = or i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = or i16 %a, %b
@@ -1180,7 +1180,7 @@ define amdgpu_kernel void @or_i16(i16 %a, i16 %b) {
 define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @xor_i16(
 ; SI-NEXT:    [[R:%.*]] = xor i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_i16(
@@ -1188,7 +1188,7 @@ define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = xor i16 %a, %b
@@ -1200,7 +1200,7 @@ define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_eq_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_i16(
@@ -1211,7 +1211,7 @@ define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq i16 %a, %b
@@ -1224,7 +1224,7 @@ define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ne_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_i16(
@@ -1235,7 +1235,7 @@ define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne i16 %a, %b
@@ -1248,7 +1248,7 @@ define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ugt_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_i16(
@@ -1259,7 +1259,7 @@ define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt i16 %a, %b
@@ -1272,7 +1272,7 @@ define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_uge_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_i16(
@@ -1283,7 +1283,7 @@ define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge i16 %a, %b
@@ -1296,7 +1296,7 @@ define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ult_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_i16(
@@ -1307,7 +1307,7 @@ define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult i16 %a, %b
@@ -1320,7 +1320,7 @@ define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ule_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_i16(
@@ -1331,7 +1331,7 @@ define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule i16 %a, %b
@@ -1344,7 +1344,7 @@ define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_sgt_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_i16(
@@ -1355,7 +1355,7 @@ define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt i16 %a, %b
@@ -1368,7 +1368,7 @@ define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_sge_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_i16(
@@ -1379,7 +1379,7 @@ define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge i16 %a, %b
@@ -1392,7 +1392,7 @@ define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_slt_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_i16(
@@ -1403,7 +1403,7 @@ define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt i16 %a, %b
@@ -1416,7 +1416,7 @@ define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_sle_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_i16(
@@ -1427,7 +1427,7 @@ define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle i16 %a, %b
@@ -1441,7 +1441,7 @@ declare i16 @llvm.bitreverse.i16(i16)
 define amdgpu_kernel void @bitreverse_i16(i16 %a) {
 ; SI-LABEL: @bitreverse_i16(
 ; SI-NEXT:    [[BREV:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[A:%.*]])
-; SI-NEXT:    store volatile i16 [[BREV]], i16 addrspace(1)* undef
+; SI-NEXT:    store volatile i16 [[BREV]], i16 addrspace(1)* undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_i16(
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @bitreverse_i16(i16 %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 16
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef
+; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
 ; VI-NEXT:    ret void
 ;
   %brev = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -1460,7 +1460,7 @@ define amdgpu_kernel void @bitreverse_i16(i16 %a) {
 define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_3xi15(
@@ -1468,7 +1468,7 @@ define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add <3 x i15> %a, %b
@@ -1479,7 +1479,7 @@ define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_3xi15(
@@ -1487,7 +1487,7 @@ define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nsw <3 x i15> %a, %b
@@ -1498,7 +1498,7 @@ define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_3xi15(
@@ -1506,7 +1506,7 @@ define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw <3 x i15> %a, %b
@@ -1517,7 +1517,7 @@ define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_3xi15(
@@ -1525,7 +1525,7 @@ define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw <3 x i15> %a, %b
@@ -1536,7 +1536,7 @@ define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_3xi15(
@@ -1544,7 +1544,7 @@ define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub <3 x i15> %a, %b
@@ -1555,7 +1555,7 @@ define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_3xi15(
@@ -1563,7 +1563,7 @@ define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw <3 x i15> %a, %b
@@ -1574,7 +1574,7 @@ define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_3xi15(
@@ -1582,7 +1582,7 @@ define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw <3 x i15> %a, %b
@@ -1593,7 +1593,7 @@ define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_3xi15(
@@ -1601,7 +1601,7 @@ define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw <3 x i15> %a, %b
@@ -1612,7 +1612,7 @@ define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_3xi15(
@@ -1620,7 +1620,7 @@ define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul <3 x i15> %a, %b
@@ -1631,7 +1631,7 @@ define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_3xi15(
@@ -1639,7 +1639,7 @@ define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw <3 x i15> %a, %b
@@ -1650,7 +1650,7 @@ define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_3xi15(
@@ -1658,7 +1658,7 @@ define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw <3 x i15> %a, %b
@@ -1669,7 +1669,7 @@ define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_3xi15(
@@ -1677,7 +1677,7 @@ define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw <3 x i15> %a, %b
@@ -1688,7 +1688,7 @@ define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_3xi15(
@@ -1696,7 +1696,7 @@ define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl <3 x i15> %a, %b
@@ -1707,7 +1707,7 @@ define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_3xi15(
@@ -1715,7 +1715,7 @@ define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw <3 x i15> %a, %b
@@ -1726,7 +1726,7 @@ define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_3xi15(
@@ -1734,7 +1734,7 @@ define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw <3 x i15> %a, %b
@@ -1745,7 +1745,7 @@ define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_3xi15(
@@ -1753,7 +1753,7 @@ define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw <3 x i15> %a, %b
@@ -1764,7 +1764,7 @@ define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @lshr_3xi15(
 ; SI-NEXT:    [[R:%.*]] = lshr <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_3xi15(
@@ -1772,7 +1772,7 @@ define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr <3 x i15> %a, %b
@@ -1783,7 +1783,7 @@ define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @lshr_exact_3xi15(
 ; SI-NEXT:    [[R:%.*]] = lshr exact <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_3xi15(
@@ -1791,7 +1791,7 @@ define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact <3 x i15> %a, %b
@@ -1802,7 +1802,7 @@ define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @ashr_3xi15(
 ; SI-NEXT:    [[R:%.*]] = ashr <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_3xi15(
@@ -1810,7 +1810,7 @@ define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr <3 x i15> %a, %b
@@ -1821,7 +1821,7 @@ define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @ashr_exact_3xi15(
 ; SI-NEXT:    [[R:%.*]] = ashr exact <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_3xi15(
@@ -1829,7 +1829,7 @@ define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact <3 x i15> %a, %b
@@ -1840,7 +1840,7 @@ define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @and_3xi15(
 ; SI-NEXT:    [[R:%.*]] = and <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_3xi15(
@@ -1848,7 +1848,7 @@ define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = and <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = and <3 x i15> %a, %b
@@ -1859,7 +1859,7 @@ define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @or_3xi15(
 ; SI-NEXT:    [[R:%.*]] = or <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_3xi15(
@@ -1867,7 +1867,7 @@ define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = or <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = or <3 x i15> %a, %b
@@ -1878,7 +1878,7 @@ define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
 define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @xor_3xi15(
 ; SI-NEXT:    [[R:%.*]] = xor <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_3xi15(
@@ -1886,7 +1886,7 @@ define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = xor <3 x i15> %a, %b
@@ -1898,7 +1898,7 @@ define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_eq_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_3xi15(
@@ -1909,7 +1909,7 @@ define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq <3 x i15> %a, %b
@@ -1922,7 +1922,7 @@ define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ne_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_3xi15(
@@ -1933,7 +1933,7 @@ define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne <3 x i15> %a, %b
@@ -1946,7 +1946,7 @@ define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ugt_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_3xi15(
@@ -1957,7 +1957,7 @@ define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt <3 x i15> %a, %b
@@ -1970,7 +1970,7 @@ define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_uge_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_3xi15(
@@ -1981,7 +1981,7 @@ define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge <3 x i15> %a, %b
@@ -1994,7 +1994,7 @@ define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ult_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_3xi15(
@@ -2005,7 +2005,7 @@ define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult <3 x i15> %a, %b
@@ -2018,7 +2018,7 @@ define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ule_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_3xi15(
@@ -2029,7 +2029,7 @@ define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule <3 x i15> %a, %b
@@ -2042,7 +2042,7 @@ define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_sgt_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_3xi15(
@@ -2053,7 +2053,7 @@ define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt <3 x i15> %a, %b
@@ -2066,7 +2066,7 @@ define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_sge_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_3xi15(
@@ -2077,7 +2077,7 @@ define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge <3 x i15> %a, %b
@@ -2090,7 +2090,7 @@ define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_slt_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_3xi15(
@@ -2101,7 +2101,7 @@ define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt <3 x i15> %a, %b
@@ -2114,7 +2114,7 @@ define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_sle_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_3xi15(
@@ -2125,7 +2125,7 @@ define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle <3 x i15> %a, %b
@@ -2138,7 +2138,7 @@ declare <3 x i15> @llvm.bitreverse.v3i15(<3 x i15>)
 define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
 ; SI-LABEL: @bitreverse_3xi15(
 ; SI-NEXT:    [[BREV:%.*]] = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> [[A:%.*]])
-; SI-NEXT:    store volatile <3 x i15> [[BREV]], <3 x i15> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i15> [[BREV]], <3 x i15> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_3xi15(
@@ -2146,7 +2146,7 @@ define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP2]], <i32 17, i32 17, i32 17>
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a)
@@ -2157,7 +2157,7 @@ define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
 define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_3xi16(
@@ -2165,7 +2165,7 @@ define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add <3 x i16> %a, %b
@@ -2176,7 +2176,7 @@ define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_3xi16(
@@ -2184,7 +2184,7 @@ define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nsw <3 x i16> %a, %b
@@ -2195,7 +2195,7 @@ define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_3xi16(
@@ -2203,7 +2203,7 @@ define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw <3 x i16> %a, %b
@@ -2214,7 +2214,7 @@ define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_3xi16(
@@ -2222,7 +2222,7 @@ define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw <3 x i16> %a, %b
@@ -2233,7 +2233,7 @@ define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_3xi16(
@@ -2241,7 +2241,7 @@ define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub <3 x i16> %a, %b
@@ -2252,7 +2252,7 @@ define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_3xi16(
@@ -2260,7 +2260,7 @@ define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw <3 x i16> %a, %b
@@ -2271,7 +2271,7 @@ define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_3xi16(
@@ -2279,7 +2279,7 @@ define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw <3 x i16> %a, %b
@@ -2290,7 +2290,7 @@ define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_3xi16(
@@ -2298,7 +2298,7 @@ define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw <3 x i16> %a, %b
@@ -2309,7 +2309,7 @@ define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_3xi16(
@@ -2317,7 +2317,7 @@ define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul <3 x i16> %a, %b
@@ -2328,7 +2328,7 @@ define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_3xi16(
@@ -2336,7 +2336,7 @@ define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw <3 x i16> %a, %b
@@ -2347,7 +2347,7 @@ define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_3xi16(
@@ -2355,7 +2355,7 @@ define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw <3 x i16> %a, %b
@@ -2366,7 +2366,7 @@ define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_3xi16(
@@ -2374,7 +2374,7 @@ define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw <3 x i16> %a, %b
@@ -2385,7 +2385,7 @@ define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_3xi16(
@@ -2393,7 +2393,7 @@ define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl <3 x i16> %a, %b
@@ -2404,7 +2404,7 @@ define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_3xi16(
@@ -2412,7 +2412,7 @@ define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw <3 x i16> %a, %b
@@ -2423,7 +2423,7 @@ define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_3xi16(
@@ -2431,7 +2431,7 @@ define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw <3 x i16> %a, %b
@@ -2442,7 +2442,7 @@ define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_3xi16(
@@ -2450,7 +2450,7 @@ define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw <3 x i16> %a, %b
@@ -2461,7 +2461,7 @@ define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @lshr_3xi16(
 ; SI-NEXT:    [[R:%.*]] = lshr <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_3xi16(
@@ -2469,7 +2469,7 @@ define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr <3 x i16> %a, %b
@@ -2480,7 +2480,7 @@ define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @lshr_exact_3xi16(
 ; SI-NEXT:    [[R:%.*]] = lshr exact <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_3xi16(
@@ -2488,7 +2488,7 @@ define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact <3 x i16> %a, %b
@@ -2499,7 +2499,7 @@ define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @ashr_3xi16(
 ; SI-NEXT:    [[R:%.*]] = ashr <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_3xi16(
@@ -2507,7 +2507,7 @@ define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr <3 x i16> %a, %b
@@ -2518,7 +2518,7 @@ define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @ashr_exact_3xi16(
 ; SI-NEXT:    [[R:%.*]] = ashr exact <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_3xi16(
@@ -2526,7 +2526,7 @@ define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact <3 x i16> %a, %b
@@ -2537,7 +2537,7 @@ define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @and_3xi16(
 ; SI-NEXT:    [[R:%.*]] = and <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_3xi16(
@@ -2545,7 +2545,7 @@ define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = and <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = and <3 x i16> %a, %b
@@ -2556,7 +2556,7 @@ define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @or_3xi16(
 ; SI-NEXT:    [[R:%.*]] = or <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_3xi16(
@@ -2564,7 +2564,7 @@ define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = or <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = or <3 x i16> %a, %b
@@ -2575,7 +2575,7 @@ define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
 define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @xor_3xi16(
 ; SI-NEXT:    [[R:%.*]] = xor <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_3xi16(
@@ -2583,7 +2583,7 @@ define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = xor <3 x i16> %a, %b
@@ -2595,7 +2595,7 @@ define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_eq_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_3xi16(
@@ -2606,7 +2606,7 @@ define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq <3 x i16> %a, %b
@@ -2619,7 +2619,7 @@ define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ne_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_3xi16(
@@ -2630,7 +2630,7 @@ define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne <3 x i16> %a, %b
@@ -2643,7 +2643,7 @@ define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ugt_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_3xi16(
@@ -2654,7 +2654,7 @@ define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt <3 x i16> %a, %b
@@ -2667,7 +2667,7 @@ define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_uge_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_3xi16(
@@ -2678,7 +2678,7 @@ define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge <3 x i16> %a, %b
@@ -2691,7 +2691,7 @@ define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ult_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_3xi16(
@@ -2702,7 +2702,7 @@ define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult <3 x i16> %a, %b
@@ -2715,7 +2715,7 @@ define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ule_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_3xi16(
@@ -2726,7 +2726,7 @@ define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule <3 x i16> %a, %b
@@ -2739,7 +2739,7 @@ define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_sgt_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_3xi16(
@@ -2750,7 +2750,7 @@ define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt <3 x i16> %a, %b
@@ -2763,7 +2763,7 @@ define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_sge_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_3xi16(
@@ -2774,7 +2774,7 @@ define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge <3 x i16> %a, %b
@@ -2787,7 +2787,7 @@ define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_slt_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_3xi16(
@@ -2798,7 +2798,7 @@ define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt <3 x i16> %a, %b
@@ -2811,7 +2811,7 @@ define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_sle_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_3xi16(
@@ -2822,7 +2822,7 @@ define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle <3 x i16> %a, %b
@@ -2836,7 +2836,7 @@ declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>)
 define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) {
 ; SI-LABEL: @bitreverse_3xi16(
 ; SI-NEXT:    [[BREV:%.*]] = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> [[A:%.*]])
-; SI-NEXT:    store volatile <3 x i16> [[BREV]], <3 x i16> addrspace(1)* undef
+; SI-NEXT:    store volatile <3 x i16> [[BREV]], <3 x i16> addrspace(1)* undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_3xi16(
@@ -2844,7 +2844,7 @@ define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP2]], <i32 16, i32 16, i32 16>
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
 ; VI-NEXT:    ret void
 ;
   %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 2721aba316b07..2f752c2ceff15 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -67,7 +67,6 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -158,7 +157,6 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -270,7 +268,6 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -386,7 +383,6 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -468,7 +464,6 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -539,7 +534,6 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -618,7 +612,6 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -703,7 +696,6 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -776,7 +768,6 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -844,7 +835,6 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -922,7 +912,6 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -1008,7 +997,6 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -1261,7 +1249,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -1555,7 +1542,6 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -1929,7 +1915,6 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -2323,7 +2308,6 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -2583,7 +2567,6 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -2808,7 +2791,6 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3061,7 +3043,6 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3341,7 +3322,6 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3475,7 +3455,6 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3549,7 +3528,6 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -3631,7 +3609,6 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -3719,7 +3696,6 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -3867,7 +3843,6 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4048,7 +4023,6 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4245,7 +4219,6 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4465,7 +4438,6 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -4659,7 +4631,6 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -4856,7 +4827,6 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5073,7 +5043,6 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5316,7 +5285,6 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5426,7 +5394,6 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5463,7 +5430,6 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5498,7 +5464,6 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5540,7 +5505,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5586,7 +5550,6 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5731,7 +5694,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -5814,7 +5776,6 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5853,7 +5814,6 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5889,7 +5849,6 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -5933,7 +5892,6 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6067,7 +6025,6 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -6143,7 +6100,6 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6183,7 +6139,6 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6247,7 +6202,6 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -6321,7 +6275,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6376,7 +6329,6 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6559,7 +6511,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -6659,7 +6610,6 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6702,7 +6652,6 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -6764,7 +6713,6 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
@@ -6839,7 +6787,6 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7013,7 +6960,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7213,7 +7159,6 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
@@ -7358,7 +7303,6 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -7396,7 +7340,6 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -7441,7 +7384,6 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7583,7 +7525,6 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -7731,7 +7672,6 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -7884,7 +7824,6 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
@@ -8028,7 +7967,6 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -8069,7 +8007,6 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -8119,7 +8056,6 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -8177,7 +8113,6 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -8332,7 +8267,6 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -8476,7 +8410,6 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -8643,7 +8576,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
@@ -8821,7 +8753,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -8986,7 +8917,6 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
@@ -9400,7 +9330,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
@@ -9798,7 +9727,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -9942,7 +9870,6 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -10109,7 +10036,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
@@ -10290,7 +10216,6 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -10602,7 +10527,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44

