[llvm] ab6b48b - DAG: Avoid stack lowering if bitcast has an illegal vector result type

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 15 09:37:18 PST 2023


Author: Matt Arsenault
Date: 2023-01-15T12:37:14-05:00
New Revision: ab6b48b7116d118131e7ef459e4d234b4366564f

URL: https://github.com/llvm/llvm-project/commit/ab6b48b7116d118131e7ef459e4d234b4366564f
DIFF: https://github.com/llvm/llvm-project/commit/ab6b48b7116d118131e7ef459e4d234b4366564f.diff

LOG: DAG: Avoid stack lowering if bitcast has an illegal vector result type

A bitcast of <10 x i32> to <5 x i64> was being lowered through the
stack. Instead, handle the case where the widened result size is not a
multiple of the input vector size but is a multiple of the input
element size: extract the individual input elements and pad the wider
vector with undef.
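
As a rough IR-level illustration (not part of this patch; the function
name below is made up), the widened lowering of such a bitcast amounts
to padding the input elements out to the wider element count with undef
and bitcasting the wider vector:

  define <8 x i64> @widen_bitcast_sketch(<10 x i32> %in) {
    ; pad the 10 input elements to 16 with undef lanes
    %padded = shufflevector <10 x i32> %in, <10 x i32> undef,
        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                    i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef>
    ; reinterpret the 512-bit padded vector as the widened result type
    %cast = bitcast <16 x i32> %padded to <8 x i64>
    ret <8 x i64> %cast
  }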

This avoids stack usage for bitcasts involving <5 x i64>. In some of
these cases, later optimizations eliminated the stack accesses, but the
unused temporary stack object survived through final emission.

Fixes: SWDEV-377548

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f7384ced3e23e..af5ea1ce5f459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4850,33 +4850,43 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
 
   unsigned WidenSize = WidenVT.getSizeInBits();
   unsigned InSize = InVT.getSizeInBits();
+  unsigned InScalarSize = InVT.getScalarSizeInBits();
   // x86mmx is not an acceptable vector element type, so don't try.
-  if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
+  if (WidenSize % InScalarSize == 0 && InVT != MVT::x86mmx) {
     // Determine new input vector type.  The new input vector type will use
     // the same element type (if its a vector) or use the input type as a
     // vector.  It is the same size as the type to widen to.
     EVT NewInVT;
-    unsigned NewNumElts = WidenSize / InSize;
+    unsigned NewNumParts = WidenSize / InSize;
     if (InVT.isVector()) {
       EVT InEltVT = InVT.getVectorElementType();
       NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
                                  WidenSize / InEltVT.getSizeInBits());
     } else {
-      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumParts);
     }
 
     if (TLI.isTypeLegal(NewInVT)) {
       SDValue NewVec;
       if (InVT.isVector()) {
         // Because the result and the input are different vector types, widening
-        // the result could create a legal type but widening the input might make
-        // it an illegal type that might lead to repeatedly splitting the input
-        // and then widening it. To avoid this, we widen the input only if
+        // the result could create a legal type but widening the input might
+        // make it an illegal type that might lead to repeatedly splitting the
+        // input and then widening it. To avoid this, we widen the input only if
         // it results in a legal type.
-        SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
-        Ops[0] = InOp;
+        if (WidenSize % InSize == 0) {
+          SmallVector<SDValue, 16> Ops(NewNumParts, DAG.getUNDEF(InVT));
+          Ops[0] = InOp;
 
-        NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+          NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+        } else {
+          SmallVector<SDValue, 16> Ops;
+          DAG.ExtractVectorElements(InOp, Ops);
+          Ops.append(WidenSize / InScalarSize - Ops.size(),
+                     DAG.getUNDEF(InVT.getVectorElementType()));
+
+          NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+        }
       } else {
         NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
       }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 757da88d8d108..6027c3c96e869 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -381,3 +381,478 @@ end:
   store <4 x double> %phi_cast, ptr addrspace(1) %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}bitcast_v20f16_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <20 x half> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <20 x half> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10f32_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x float> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i32_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i32> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10f32_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x float> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i32_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i32> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v40i8_to_v5f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <40 x i8> %phi_value to <5 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v40i8_to_v5i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <40 x i8> %phi_value to <5 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <5 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5f64_to_v10f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x double> %phi_value to <10 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5f64_to_v10i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x double> %phi_value to <10 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5i64_to_v10f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x i64> %phi_value to <10 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v5i64_to_v10i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <5 x i64> %phi_value to <10 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <10 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6f64_to_v12i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x double> %phi_value to <12 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6f64_to_v12f32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x double> %phi_value to <12 x float>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x float> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i32_to_v6i64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i32> %phi_value to <6 x i64>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <6 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x i64> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i32_to_v6f64:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i32> %phi_value to <6 x double>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <6 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x double> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v6i64_to_v12i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <6 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <6 x i64> %phi_value to <12 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v7i64_to_v14i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <7 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <7 x i64> %phi_value to <14 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <14 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v7f64_to_v14i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <7 x double> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <7 x double> %phi_value to <14 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <14 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v9i64_to_v18i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <9 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <9 x i64> %phi_value to <18 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <18 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <18 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v10i64_to_v20i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <10 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <10 x i64> %phi_value to <20 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <20 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <20 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v11i64_to_v20i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v11i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <11 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <11 x i64> %phi_value to <22 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <22 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <22 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v12i64_to_v22i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v12i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <12 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <12 x i64> %phi_value to <24 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <24 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <24 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v13i64_to_v24i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v13i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <13 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <13 x i64> %phi_value to <26 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <26 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <26 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v14i64_to_v26i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v14i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <14 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <14 x i64> %phi_value to <28 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <28 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <28 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v15i64_to_v26i32:
+; SI: ScratchSize: 0
+define amdgpu_kernel void @bitcast_v15i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %phi_value = phi <15 x i64> [zeroinitializer, %entry], [%value, %if]
+  %cast = bitcast <15 x i64> %phi_value to <30 x i32>
+  %cmp1 = icmp eq i32 %cond, 1
+  br i1 %cmp1, label %if, label %end
+
+end:
+  %phi_cast = phi <30 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <30 x i32> %phi_cast, ptr addrspace(1) %out
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index b006c6b2aff7a..3145c1c3e868b 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -171,20 +171,14 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s26, -1
-; SI-NEXT:    s_mov_b32 s27, 0xe8f000
-; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s18, s10
 ; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    s_mov_b32 s22, s10
 ; SI-NEXT:    s_mov_b32 s23, s11
@@ -203,30 +197,24 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
 ;
 ; VI-LABEL: test_copy_v4i8_x4:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 ; VI-NEXT:    s_mov_b32 s10, -1
-; VI-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_mov_b32 s15, s11
 ; VI-NEXT:    s_mov_b32 s18, s10
 ; VI-NEXT:    s_mov_b32 s19, s11
+; VI-NEXT:    s_mov_b32 s22, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s0
 ; VI-NEXT:    s_mov_b32 s9, s1
-; VI-NEXT:    s_mov_b32 s22, s10
 ; VI-NEXT:    s_mov_b32 s23, s11
 ; VI-NEXT:    s_mov_b32 s12, s2
 ; VI-NEXT:    s_mov_b32 s13, s3

diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
index 2b4651487eff6..2b8a712b28c05 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll
@@ -7,27 +7,21 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
 ; RRLIST-LABEL: sccClobber:
 ; RRLIST:       ; %bb.0: ; %entry
 ; RRLIST-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; RRLIST-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
-; RRLIST-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; RRLIST-NEXT:    s_mov_b32 s22, -1
-; RRLIST-NEXT:    s_mov_b32 s23, 0xe00000
-; RRLIST-NEXT:    s_add_u32 s20, s20, s3
+; RRLIST-NEXT:    v_mov_b32_e32 v2, 0
 ; RRLIST-NEXT:    s_waitcnt lgkmcnt(0)
 ; RRLIST-NEXT:    s_load_dword s16, s[8:9], 0x0
 ; RRLIST-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; RRLIST-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
 ; RRLIST-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x44
 ; RRLIST-NEXT:    s_load_dword s17, s[10:11], 0x0
-; RRLIST-NEXT:    s_addc_u32 s21, s21, 0
 ; RRLIST-NEXT:    s_waitcnt lgkmcnt(0)
-; RRLIST-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; RRLIST-NEXT:    s_min_i32 s4, s16, 0
+; RRLIST-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; RRLIST-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
 ; RRLIST-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; RRLIST-NEXT:    s_cselect_b32 s0, s16, s17
 ; RRLIST-NEXT:    s_cmp_eq_u64 s[12:13], s[2:3]
 ; RRLIST-NEXT:    s_cselect_b32 s0, s4, s0
-; RRLIST-NEXT:    v_mov_b32_e32 v2, 0
 ; RRLIST-NEXT:    v_mov_b32_e32 v0, s0
 ; RRLIST-NEXT:    global_store_dword v2, v0, s[14:15]
 ; RRLIST-NEXT:    s_endpgm
@@ -35,27 +29,21 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa
 ; FAST-LABEL: sccClobber:
 ; FAST:       ; %bb.0: ; %entry
 ; FAST-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; FAST-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
-; FAST-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; FAST-NEXT:    s_mov_b32 s22, -1
-; FAST-NEXT:    s_mov_b32 s23, 0xe00000
-; FAST-NEXT:    s_add_u32 s20, s20, s3
+; FAST-NEXT:    v_mov_b32_e32 v2, 0
 ; FAST-NEXT:    s_waitcnt lgkmcnt(0)
 ; FAST-NEXT:    s_load_dword s16, s[8:9], 0x0
 ; FAST-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; FAST-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
 ; FAST-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x44
 ; FAST-NEXT:    s_load_dword s17, s[10:11], 0x0
-; FAST-NEXT:    s_addc_u32 s21, s21, 0
 ; FAST-NEXT:    s_waitcnt lgkmcnt(0)
-; FAST-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; FAST-NEXT:    s_min_i32 s4, s16, 0
+; FAST-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; FAST-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
 ; FAST-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; FAST-NEXT:    s_cselect_b32 s0, s16, s17
 ; FAST-NEXT:    s_cmp_eq_u64 s[12:13], s[2:3]
 ; FAST-NEXT:    s_cselect_b32 s0, s4, s0
-; FAST-NEXT:    v_mov_b32_e32 v2, 0
 ; FAST-NEXT:    v_mov_b32_e32 v0, s0
 ; FAST-NEXT:    global_store_dword v2, v0, s[14:15]
 ; FAST-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 41a4998b3ba91..027f3c360b426 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -4032,14 +4032,8 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
 ; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
-; GCN1-NEXT:    s_mov_b32 s14, -1
-; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN1-NEXT:    s_add_u32 s12, s12, s3
-; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
@@ -4063,14 +4057,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GCN2-NEXT:    s_mov_b32 s90, -1
-; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
-; GCN2-NEXT:    s_add_u32 s88, s88, s3
-; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -4231,14 +4219,8 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
-; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN1-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
 ; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
-; GCN1-NEXT:    s_mov_b32 s14, -1
-; GCN1-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN1-NEXT:    s_add_u32 s12, s12, s3
-; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GCN1-NEXT:    s_add_u32 s2, s4, s2
@@ -4260,14 +4242,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GCN2-NEXT:    s_mov_b32 s90, -1
-; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
-; GCN2-NEXT:    s_add_u32 s88, s88, s3
-; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s2, s4, s2

diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 48aa210665e79..d295efc6d015f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -5010,17 +5010,12 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; CI-NEXT:    s_mov_b32 s18, -1
 ; CI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
-; CI-NEXT:    s_mov_b32 s19, 0xe8f000
-; CI-NEXT:    s_add_u32 s16, s16, s3
-; CI-NEXT:    s_addc_u32 s17, s17, 0
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshl_b64 s[10:11], s[10:11], 3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    v_mov_b32_e32 v4, s10
 ; CI-NEXT:    s_mov_b32 s0, s6
 ; CI-NEXT:    s_mov_b32 s1, s7
@@ -5035,20 +5030,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ; CI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
-; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_addc_u32 s89, s89, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -5074,18 +5062,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
-; GFX9-NEXT:    s_add_u32 s12, s12, s3
-; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GFX9-NEXT:    s_add_u32 s2, s4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s3
@@ -5296,17 +5278,12 @@ entry:
 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; CI-NEXT:    s_mov_b32 s18, -1
 ; CI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
-; CI-NEXT:    s_mov_b32 s19, 0xe8f000
-; CI-NEXT:    s_add_u32 s16, s16, s3
-; CI-NEXT:    s_addc_u32 s17, s17, 0
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshl_b64 s[10:11], s[10:11], 3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    v_mov_b32_e32 v4, s10
 ; CI-NEXT:    s_mov_b32 s0, s6
 ; CI-NEXT:    s_mov_b32 s1, s7
@@ -5321,20 +5298,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; CI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
-; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_mov_b32 s91, 0xe80000
-; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_addc_u32 s89, s89, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; VI-NEXT:    s_add_u32 s2, s4, s2
@@ -5358,18 +5328,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s14, -1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
-; GFX9-NEXT:    s_add_u32 s12, s12, s3
-; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GFX9-NEXT:    s_add_u32 s2, s4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s3

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 032b8b89fb4ee..03e1960ca7c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -610,16 +610,10 @@ entry:
 define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double5_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GCN-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GCN-NEXT:    s_mov_b32 s18, -1
-; GCN-NEXT:    s_mov_b32 s19, 0xe80000
-; GCN-NEXT:    s_add_u32 s16, s16, s3
 ; GCN-NEXT:    s_load_dword s12, s[0:1], 0xa4
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x84
 ; GCN-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
-; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 4
 ; GCN-NEXT:    s_cselect_b32 s9, 0x3ff00000, s9

diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index abee0b2d9c5b4..be01fa8ab1e63 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -5,19 +5,14 @@
 define amdgpu_kernel void @select_f16(
 ; SI-LABEL: select_f16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s26, -1
-; SI-NEXT:    s_mov_b32 s27, 0xe8f000
-; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s6
 ; SI-NEXT:    s_mov_b32 s17, s7
-; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s20, s8
 ; SI-NEXT:    s_mov_b32 s21, s9
@@ -39,7 +34,6 @@ define amdgpu_kernel void @select_f16(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -52,19 +46,14 @@ define amdgpu_kernel void @select_f16(
 ;
 ; VI-LABEL: select_f16:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s26, -1
-; VI-NEXT:    s_mov_b32 s27, 0xe80000
-; VI-NEXT:    s_add_u32 s24, s24, s3
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
+; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s16, s6
 ; VI-NEXT:    s_mov_b32 s17, s7
-; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s20, s8
 ; VI-NEXT:    s_mov_b32 s21, s9
@@ -86,7 +75,6 @@ define amdgpu_kernel void @select_f16(
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_addc_u32 s25, s25, 0
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -432,19 +420,14 @@ entry:
 define amdgpu_kernel void @select_v2f16(
 ; SI-LABEL: select_v2f16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s26, -1
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
-; SI-NEXT:    s_mov_b32 s27, 0xe8f000
-; SI-NEXT:    s_add_u32 s24, s24, s3
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s6
 ; SI-NEXT:    s_mov_b32 s17, s7
-; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s20, s8
 ; SI-NEXT:    s_mov_b32 s21, s9
@@ -462,7 +445,6 @@ define amdgpu_kernel void @select_v2f16(
 ; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
@@ -492,13 +474,8 @@ define amdgpu_kernel void @select_v2f16(
 ;
 ; VI-LABEL: select_v2f16:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
-; VI-NEXT:    s_mov_b32 s26, -1
-; VI-NEXT:    s_mov_b32 s27, 0xe80000
-; VI-NEXT:    s_add_u32 s24, s24, s3
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s14, s2
@@ -522,7 +499,6 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_addc_u32 s25, s25, 0
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)

diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index 433368bf616eb..9b92e03a01c11 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -67,19 +67,14 @@ entry:
 define amdgpu_kernel void @madak_f16_use_2(
 ; SI-LABEL: madak_f16_use_2:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_mov_b32 s22, -1
-; SI-NEXT:    s_mov_b32 s23, 0xe8f000
 ; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
-; SI-NEXT:    s_add_u32 s20, s20, s3
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s16, s8
 ; SI-NEXT:    s_mov_b32 s17, s9
-; SI-NEXT:    s_mov_b32 s18, s2
 ; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_mov_b32 s8, s10
 ; SI-NEXT:    s_mov_b32 s9, s11
@@ -96,7 +91,6 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; SI-NEXT:    v_mov_b32_e32 v3, 0x41200000
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_addc_u32 s21, s21, 0
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -112,19 +106,14 @@ define amdgpu_kernel void @madak_f16_use_2(
 ;
 ; VI-LABEL: madak_f16_use_2:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s22, -1
-; VI-NEXT:    s_mov_b32 s23, 0xe80000
 ; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
-; VI-NEXT:    s_add_u32 s20, s20, s3
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s16, s8
 ; VI-NEXT:    s_mov_b32 s17, s9
-; VI-NEXT:    s_mov_b32 s18, s2
 ; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s8, s10
 ; VI-NEXT:    s_mov_b32 s9, s11
@@ -141,7 +130,6 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x4900
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_addc_u32 s21, s21, 0
 ; VI-NEXT:    s_mov_b32 s8, s6
 ; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    v_madak_f16 v1, v0, v1, 0x4900

diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 584d6354840a9..e6bc773c272bd 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1660,19 +1660,13 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b96 off, v[4:6], s32
-; GFX11-NEXT:    global_load_b96 v[4:6], v[2:3], off
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b96 off, v[4:6], s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:16
+; GFX11-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v3, v[3:4], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v3
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <6 x half>, ptr addrspace(1) %arg0
   %val1 = load <6 x half>, ptr addrspace(1) %arg1


        

