[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

Vikram Hegde via cfe-commits cfe-commits at lists.llvm.org
Wed Jun 12 04:42:17 PDT 2024


https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/89217

>From aa4e757ad3b14ab132a9cc3a0e50b8e4a00e89aa Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 10 Apr 2024 11:53:16 +0000
Subject: [PATCH 01/30] [AMDGPU] add support for i64 readlane

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  2 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 43 +++++++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  8 +++++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ee9a5d7a34398..45e9c10f2306c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2184,7 +2184,7 @@ def int_amdgcn_readfirstlane :
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
   ClangBuiltin<"__builtin_amdgcn_readlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // The value to write and lane select arguments must be uniform across the
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c706d51cb665..aa8389f696044 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5065,6 +5065,49 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::V_READLANE_PSEUDO_B64: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+    MachineOperand &Src1 = MI.getOperand(2);
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+    const TargetRegisterClass *Src1RC = Src1.isReg()
+                                        ? MRI.getRegClass(Src1.getReg())
+                                        : &AMDGPU::SReg_32RegClass;
+    
+    const TargetRegisterClass *Src0SubRC =
+        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+    
+    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+
+    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+    
+    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
+                           .add(SrcReg0Sub0)
+                           .add(Src1);
+    MachineInstr *HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
+                           .add(SrcReg0Sub1)
+                           .add(Src1);
+
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
+
+    MI.eraseFromParent();
+    return BB;
+  }
   case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d6d49889656bb..0f02b4ce518ac 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -288,6 +288,14 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI <
 >;
 } // End usesCustomInserter = 1, Defs = [VCC]
 
+
+let usesCustomInserter = 1 in {
+  def V_READLANE_PSEUDO_B64 : VPseudoInstSI <
+  (outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1),
+  [(set i64:$sdst, (int_amdgcn_readlane i64:$src0, i32:$src1))]
+  >;
+}
+
 let usesCustomInserter = 1, Defs = [SCC] in {
 def S_ADD_U64_PSEUDO : SPseudoInstSI <
   (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),

>From b895dd5861bfa9781eb8990c86edd11d466c48f3 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 12 Apr 2024 10:27:42 +0000
Subject: [PATCH 02/30] add support for i64 readfirstlane and writelane
 intrinsics

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  10 +-
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   |  20 +--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  95 ++++++++++++++
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  12 +-
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |   2 +-
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |   6 +-
 .../atomic_optimizations_mul_one.ll           |  13 +-
 .../test/CodeGen/AMDGPU/global-atomic-scan.ll |  48 +++----
 .../AMDGPU/global_atomic_optimizer_fp_rtn.ll  | 120 +++++++++---------
 .../AMDGPU/global_atomics_iterative_scan.ll   |   2 +-
 .../global_atomics_iterative_scan_fp.ll       |   8 +-
 .../global_atomics_optimizer_fp_no_rtn.ll     |  24 ++--
 .../InstCombine/AMDGPU/amdgcn-intrinsics.ll   |  32 ++---
 13 files changed, 248 insertions(+), 144 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 45e9c10f2306c..34feee1c56be8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2177,7 +2177,7 @@ def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 
 def int_amdgcn_readfirstlane :
   ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // The lane argument must be uniform across the currently active threads of the
@@ -2192,10 +2192,10 @@ def int_amdgcn_readlane :
 // undefined.
 def int_amdgcn_writelane :
   ClangBuiltin<"__builtin_amdgcn_writelane">,
-  Intrinsic<[llvm_i32_ty], [
-    llvm_i32_ty,    // uniform value to write: returned by the selected lane
-    llvm_i32_ty,    // uniform lane select
-    llvm_i32_ty     // returned by all lanes other than the selected one
+  Intrinsic<[llvm_any_ty], [
+    LLVMMatchType<0>,    // uniform value to write: returned by the selected lane
+    llvm_i32_ty,        // uniform lane select
+    LLVMMatchType<0>     // returned by all lanes other than the selected one
   ],
   [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index dbb3de76b4dda..5b3fa148e5619 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
   V = B.CreateBitCast(V, IntNTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -493,7 +493,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
       V = B.CreateBitCast(V, IntNTy);
-      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(),
                                               {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
@@ -524,9 +524,9 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                       B.getInt32(0xf), B.getFalse()});
   } else {
     Function *ReadLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
     Function *WriteLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -599,7 +599,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   // Get the value required for atomic operation
   V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue =
-      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(), {V, LaneIdxInt});
   LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
@@ -607,7 +607,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   Value *OldValue = nullptr;
   if (NeedResult) {
     OldValue =
-        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, B.getInt32Ty(),
                           {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
                            B.CreateBitCast(OldValuePhi, IntNTy)});
     OldValue = B.CreateBitCast(OldValue, Ty);
@@ -789,7 +789,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
         assert(TyBitWidth == 32);
         NewV = B.CreateBitCast(NewV, IntNTy);
-        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(),
                                  {NewV, LastLaneIdx});
         NewV = B.CreateBitCast(NewV, Ty);
       }
@@ -926,9 +926,9 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       Value *const ExtractHi =
           B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
       CallInst *const ReadFirstLaneLo =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractLo);
       CallInst *const ReadFirstLaneHi =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractHi);
       Value *const PartialInsert = B.CreateInsertElement(
           PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
       Value *const Insert =
@@ -937,7 +937,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     } else if (TyBitWidth == 32) {
       Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
       BroadcastI =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty, CastedPhi);
       BroadcastI = B.CreateBitCast(BroadcastI, Ty);
 
     } else {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index aa8389f696044..4ae17ccfe8067 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5108,6 +5108,101 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+
+    const TargetRegisterClass *Src0SubRC =
+        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+    
+    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+
+    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+
+    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
+                           .add(SrcReg0Sub0);
+    MachineInstr *HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
+                           .add(SrcReg0Sub1);
+    
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
+
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::V_WRITELANE_PSEUDO_B64: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+    MachineOperand &Src1 = MI.getOperand(2);
+    MachineOperand &Src2 = MI.getOperand(3);
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+    const TargetRegisterClass *Src1RC = Src1.isReg()
+                                        ? MRI.getRegClass(Src1.getReg())
+                                        : &AMDGPU::SReg_32RegClass;
+    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());                                     
+    
+    const TargetRegisterClass *Src0SubRC =
+        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+    
+    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+
+    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+
+    
+    const TargetRegisterClass *Src2SubRC =
+        TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+    
+    MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
+
+    MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);        
+    
+    
+    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
+                           .add(SrcReg0Sub0)
+                           .add(Src1)
+                           .add(SrcReg2Sub0);
+    MachineInstr *HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
+                           .add(SrcReg0Sub1)
+                           .add(Src1)
+                           .add(SrcReg2Sub1);
+    
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
+
+    MI.eraseFromParent();
+    return BB;   
+  }
   case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0f02b4ce518ac..decd59be5eb29 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -294,6 +294,16 @@ let usesCustomInserter = 1 in {
   (outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1),
   [(set i64:$sdst, (int_amdgcn_readlane i64:$src0, i32:$src1))]
   >;
+
+  def V_READFIRSTLANE_PSEUDO_B64 : VPseudoInstSI <
+  (outs SReg_64:$sdst), (ins VReg_64:$src0),
+  [(set i64:$sdst, (int_amdgcn_readfirstlane i64:$src0))]
+  >;
+  
+  def V_WRITELANE_PSEUDO_B64 : VPseudoInstSI <
+  (outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2),
+  [(set i64:$sdst, (int_amdgcn_writelane i64:$src0, i32:$src1, i64:$src2))]
+  >;
 }
 
 let usesCustomInserter = 1, Defs = [SCC] in {
@@ -3413,7 +3423,7 @@ def : GCNPat<
 // FIXME: Should also do this for readlane, but tablegen crashes on
 // the ignored src1.
 def : GCNPat<
-  (int_amdgcn_readfirstlane (i32 imm:$src)),
+  (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
   (S_MOV_B32 SReg_32:$src)
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2341e0d9d32bb..0ee80f45c9160 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
         !if(P.HasOMod,
             [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
                                                   i1:$clamp, i32:$omod))))],
-            [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+            [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
         )
     );
 }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26c85e83b53ad..74d2f53d7b317 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -56,9 +56,9 @@ define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 {
   ret void
 }
 
-; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
 define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
-  %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+  %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
   store i32 %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -237,7 +237,7 @@ declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
+declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 220dc70165e87..bdfafa89cd047 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -74,7 +73,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    br label [[TMP11]]
 ; IR:       11:
 ; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
 ; IR-NEXT:    ret void
@@ -172,7 +171,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    br label [[TMP11]]
 ; IR:       11:
 ; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
 ; IR-NEXT:    ret void
@@ -273,7 +272,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = and i32 [[TMP5]], 1
 ; IR-NEXT:    [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0)
@@ -374,7 +373,7 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-NEXT:    br label [[TMP11]]
 ; IR:       11:
 ; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
 ; IR-NEXT:    [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -476,7 +475,7 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-NEXT:    br label [[TMP11]]
 ; IR:       11:
 ; IR-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
 ; IR-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
 ; IR-NEXT:    [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
 ; IR-NEXT:    [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -581,7 +580,7 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = and i32 [[TMP5]], 1
 ; IR-NEXT:    [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
index 6b47f81bccb71..6c61c837881c4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
@@ -130,7 +130,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -193,7 +193,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -251,7 +251,7 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -310,7 +310,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -364,7 +364,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -421,7 +421,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -473,7 +473,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -526,7 +526,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -586,7 +586,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -649,7 +649,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -707,7 +707,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -766,7 +766,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP12]]
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
 ; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
 ; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
@@ -820,7 +820,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -878,7 +878,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -931,7 +931,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -985,7 +985,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1040,7 +1040,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1098,7 +1098,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1151,7 +1151,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1205,7 +1205,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1318,7 +1318,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1371,7 +1371,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
@@ -1425,7 +1425,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; IR-NEXT:    br label [[TMP9]]
 ; IR:       9:
 ; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
-; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
 ; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
 ; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index b71728096093c..baaf50377338c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -29,7 +29,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns
 ; IR:       16:
 ; IR-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
+; IR-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]])
 ; IR-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
 ; IR-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
@@ -62,7 +62,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]]
 ; IR-ITERATIVE-NEXT:    br label [[TMP18]]
@@ -76,11 +76,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]])
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
+; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
 ; IR-ITERATIVE-NEXT:    [[TMP28]] = bitcast i32 [[TMP27]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]]
 ; IR-ITERATIVE-NEXT:    [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -120,7 +120,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63)
+; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
 ; IR-DPP-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -131,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
 ; IR-DPP:       33:
 ; IR-DPP-NEXT:    [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
 ; IR-DPP-NEXT:    [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]])
+; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
 ; IR-DPP-NEXT:    [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
 ; IR-DPP-NEXT:    [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
 ; IR-DPP-NEXT:    [[TMP39:%.*]] = fadd float [[TMP37]], [[TMP38]]
@@ -167,7 +167,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
 ; IR-ITERATIVE:       16:
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -199,7 +199,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
 ; IR-DPP:       16:
 ; IR-DPP-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-DPP-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -232,7 +232,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    br label [[TMP18]]
@@ -246,11 +246,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP28]] = bitcast i32 [[TMP27]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -290,7 +290,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -301,7 +301,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
 ; IR-DPP:       33:
 ; IR-DPP-NEXT:    [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
 ; IR-DPP-NEXT:    [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
 ; IR-DPP-NEXT:    [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -337,7 +337,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
 ; IR-ITERATIVE:       16:
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -369,7 +369,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
 ; IR-DPP:       16:
 ; IR-DPP-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-DPP-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -402,7 +402,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    br label [[TMP18]]
@@ -416,11 +416,11 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP28]] = bitcast i32 [[TMP27]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -460,7 +460,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -471,7 +471,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
 ; IR-DPP:       33:
 ; IR-DPP-NEXT:    [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
 ; IR-DPP-NEXT:    [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
 ; IR-DPP-NEXT:    [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP39:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -503,7 +503,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns
 ; IR:       12:
 ; IR-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
 ; IR-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
 ; IR-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
 ; IR-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
@@ -536,7 +536,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]])
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP28:%.*]])
 ; IR-ITERATIVE-NEXT:    br label [[TMP18]]
@@ -550,11 +550,11 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]])
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]])
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
+; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]])
 ; IR-ITERATIVE-NEXT:    [[TMP28]] = bitcast i32 [[TMP27]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP29]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP24]])
 ; IR-ITERATIVE-NEXT:    [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -594,7 +594,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false)
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63)
+; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63)
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]])
 ; IR-DPP-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -605,7 +605,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
 ; IR-DPP:       33:
 ; IR-DPP-NEXT:    [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
 ; IR-DPP-NEXT:    [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]])
+; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]])
 ; IR-DPP-NEXT:    [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
 ; IR-DPP-NEXT:    [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
 ; IR-DPP-NEXT:    [[TMP39:%.*]] = call float @llvm.minnum.f32(float [[TMP37]], float [[TMP38]])
@@ -637,7 +637,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
@@ -665,7 +665,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
 ; IR-DPP:       12:
 ; IR-DPP-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
 ; IR-DPP-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-DPP-NEXT:    [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]]
@@ -698,7 +698,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    br label [[TMP18]]
@@ -712,11 +712,11 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP28]] = bitcast i32 [[TMP27]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP29]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -756,7 +756,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -767,7 +767,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
 ; IR-DPP:       33:
 ; IR-DPP-NEXT:    [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
 ; IR-DPP-NEXT:    [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
 ; IR-DPP-NEXT:    [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP39:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP37]], float [[TMP38]], metadata !"fpexcept.strict") #[[ATTR8]]
@@ -803,7 +803,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
 ; IR-ITERATIVE:       16:
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
@@ -835,7 +835,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st
 ; IR-DPP:       16:
 ; IR-DPP-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
 ; IR-DPP-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -868,7 +868,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
 ; IR-ITERATIVE:       12:
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ]
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    br label [[TMP18]]
@@ -882,11 +882,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP28]] = bitcast i32 [[TMP27]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP30:%.*]] = shl i64 1, [[TMP20]]
@@ -926,7 +926,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -937,7 +937,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
 ; IR-DPP:       33:
 ; IR-DPP-NEXT:    [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ]
 ; IR-DPP-NEXT:    [[TMP35:%.*]] = bitcast float [[TMP34]] to i32
-; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float
 ; IR-DPP-NEXT:    [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
@@ -1084,8 +1084,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s
 ; IR-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]])
-; IR-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]])
+; IR-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]])
+; IR-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]])
 ; IR-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1136,8 +1136,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
 ; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1174,8 +1174,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
 ; IR-DPP-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1226,8 +1226,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
 ; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1264,8 +1264,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s
 ; IR-DPP-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1312,8 +1312,8 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s
 ; IR-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
 ; IR-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]])
-; IR-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]])
+; IR-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]])
+; IR-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]])
 ; IR-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
 ; IR-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
 ; IR-NEXT:    [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1360,8 +1360,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
 ; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1394,8 +1394,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
 ; IR-DPP-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-DPP-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
 ; IR-DPP-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
-; IR-DPP-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]]
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
 ; IR-DPP-NEXT:    [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
@@ -1446,8 +1446,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
 ; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-ITERATIVE-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
@@ -1484,8 +1484,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_
 ; IR-DPP-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
 ; IR-DPP-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
-; IR-DPP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
-; IR-DPP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
index f954560d0f5ca..4b4c99b3cd14c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll
@@ -83,7 +83,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(
 ; IR-NEXT:    [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[ENTRY]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
 ; IR-NEXT:    [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; IR-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[VALUE]], i32 [[TMP11]])
+; IR-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[VALUE]], i32 [[TMP11]])
 ; IR-NEXT:    [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
 ; IR-NEXT:    [[TMP14:%.*]] = shl i64 1, [[TMP10]]
 ; IR-NEXT:    [[TMP15:%.*]] = xor i64 [[TMP14]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index 86e3d9338e078..38823681d1bb5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -69,7 +69,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
 ; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-ITERATIVE-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = shl i64 1, [[TMP11]]
@@ -107,7 +107,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
 ; IR-DPP-NEXT:    [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
+; IR-DPP-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
@@ -191,7 +191,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
 ; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-ITERATIVE-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]])
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
 ; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = shl i64 1, [[TMP11]]
@@ -229,7 +229,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
 ; IR-DPP-NEXT:    [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
 ; IR-DPP-NEXT:    [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
-; IR-DPP-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
+; IR-DPP-NEXT:    [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63)
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
 ; IR-DPP-NEXT:    [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]])
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index b9234f47df192..83453354320fe 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -61,7 +61,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -100,7 +100,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63)
+; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -196,7 +196,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -235,7 +235,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -331,7 +331,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -370,7 +370,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -438,7 +438,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]])
+; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]])
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP19]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP18]])
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -477,7 +477,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false)
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]])
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63)
+; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63)
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]])
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -565,7 +565,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP19]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -604,7 +604,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
@@ -700,7 +700,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
 ; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
 ; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
 ; IR-ITERATIVE-NEXT:    [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
 ; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = shl i64 1, [[TMP14]]
@@ -739,7 +739,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
 ; IR-DPP-NEXT:    [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
-; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
+; IR-DPP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
 ; IR-DPP-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]]
 ; IR-DPP-NEXT:    [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 94c32e3cbe99f..483ea8ad57d1b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2714,7 +2714,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    store volatile i32 [[VAR]], ptr undef, align 4
 ; CHECK-NEXT:    store volatile i32 0, ptr undef, align 4
 ; CHECK-NEXT:    store volatile i32 123, ptr undef, align 4
@@ -2737,7 +2737,7 @@ define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2748,7 +2748,7 @@ define i32 @readfirstlane_idempotent(i32 %arg) {
 
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2759,10 +2759,10 @@ define i32 @readfirstlane_readlane(i32 %arg) {
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
 ; CHECK-NEXT:    ret i32 [[READ1]]
 ;
 bb0:
@@ -2777,10 +2777,10 @@ bb1:
 define i32 @readfirstlane_readlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
 ; CHECK-NEXT:    ret i32 [[READ1]]
 ;
 bb0:
@@ -2800,7 +2800,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32)
 
 define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
 ; CHECK-NEXT:    store volatile i32 [[VAR]], ptr undef, align 4
 ; CHECK-NEXT:    store volatile i32 0, ptr undef, align 4
 ; CHECK-NEXT:    store volatile i32 123, ptr undef, align 4
@@ -2823,7 +2823,7 @@ define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 
 define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT:    ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2833,8 +2833,8 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 
 define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
 ; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]])
 ; CHECK-NEXT:    ret i32 [[READ1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
@@ -2844,7 +2844,7 @@ define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1
 
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2855,10 +2855,10 @@ define i32 @readlane_readfirstlane(i32 %arg) {
 define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE]])
 ; CHECK-NEXT:    ret i32 [[READ1]]
 ;
 bb0:
@@ -2874,10 +2874,10 @@ bb1:
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
+; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 0)
 ; CHECK-NEXT:    ret i32 [[READ1]]
 ;
 bb0:

>From dfa321965462920f8e656695a425c7a7c24dcf42 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 15 Apr 2024 09:11:14 +0000
Subject: [PATCH 03/30] Fix issues with writelane expansion

---
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 3 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  | 3 ---
 llvm/lib/Target/AMDGPU/SIInstructions.td   | 4 +++-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 8b21c22b44971..d722b6fb56bcc 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -691,7 +691,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
 
         break;
       }
-      case AMDGPU::V_WRITELANE_B32: {
+      case AMDGPU::V_WRITELANE_B32:
+      case AMDGPU::V_WRITELANE_PSEUDO_B64: {
         // Some architectures allow more than one constant bus access without
         // SGPR restriction
         if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4ae17ccfe8067..e59e286b408a1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5160,9 +5160,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
-    const TargetRegisterClass *Src1RC = Src1.isReg()
-                                        ? MRI.getRegClass(Src1.getReg())
-                                        : &AMDGPU::SReg_32RegClass;
     const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());                                     
     
     const TargetRegisterClass *Src0SubRC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index decd59be5eb29..6a3ae238a6e84 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -303,7 +303,9 @@ let usesCustomInserter = 1 in {
   def V_WRITELANE_PSEUDO_B64 : VPseudoInstSI <
   (outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2),
   [(set i64:$sdst, (int_amdgcn_writelane i64:$src0, i32:$src1, i64:$src2))]
-  >;
+  > {
+    let UseNamedOperandTable = 1;
+  }
 }
 
 let usesCustomInserter = 1, Defs = [SCC] in {

>From fcc0a1a3ab4a133871d8279eead5d3579585ebcc Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 18 Apr 2024 05:42:37 +0000
Subject: [PATCH 04/30] code refactor and add patterns for f64

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 227 +++++++++-------------
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  35 +++-
 2 files changed, 117 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e59e286b408a1..5d39f9e9b183d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4822,6 +4822,94 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   return RetBB;
 }
 
+static MachineBasicBlock* lowerPseudoLaneOp(MachineInstr &MI,
+                                            MachineBasicBlock *BB,
+                                            const GCNSubtarget &ST,
+                                            unsigned Opc) {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+    const SIInstrInfo *TII = ST.getInstrInfo();
+
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+    const TargetRegisterClass *Src0SubRC =
+                              TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+
+    Register DestSub0 = MRI.createVirtualRegister(
+      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass: &AMDGPU::SGPR_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(
+      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass: &AMDGPU::SGPR_32RegClass);
+
+
+    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+
+    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+
+    MachineInstr *LoHalf, *HighHalf;
+    switch(Opc) {
+      case AMDGPU::V_READLANE_PSEUDO_B64: {
+        MachineOperand &Src1 = MI.getOperand(2);
+        LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
+                              .add(SrcReg0Sub0)
+                              .add(Src1);
+        HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
+                                .add(SrcReg0Sub1)
+                                .add(Src1);
+         break;
+        }
+      case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
+        LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
+                               .add(SrcReg0Sub0);
+        HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
+                                 .add(SrcReg0Sub1);
+        break;
+        }
+      case AMDGPU::V_WRITELANE_PSEUDO_B64: {
+         MachineOperand &Src1 = MI.getOperand(2);
+         MachineOperand &Src2 = MI.getOperand(3);
+
+         const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+         const TargetRegisterClass *Src2SubRC =
+                                    TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+
+        MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
+              MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
+
+        MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
+              MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);
+
+        LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
+                        .add(SrcReg0Sub0)
+                        .add(Src1)
+                        .add(SrcReg2Sub0);
+        HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
+                          .add(SrcReg0Sub1)
+                          .add(Src1)
+                          .add(SrcReg2Sub1);
+        break;
+        }
+        default:
+          llvm_unreachable("should not occur");
+      }
+
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
+
+    TII->legalizeOperands(*LoHalf);
+    TII->legalizeOperands(*HighHalf);
+
+    MI.eraseFromParent();
+    return BB;
+}
+
 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   MachineInstr &MI, MachineBasicBlock *BB) const {
 
@@ -5065,141 +5153,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::V_READLANE_PSEUDO_B64: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-
-    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
-    const TargetRegisterClass *Src1RC = Src1.isReg()
-                                        ? MRI.getRegClass(Src1.getReg())
-                                        : &AMDGPU::SReg_32RegClass;
-    
-    const TargetRegisterClass *Src0SubRC =
-        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
-    
-    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
-
-    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
-    
-    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
-                           .add(SrcReg0Sub0)
-                           .add(Src1);
-    MachineInstr *HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
-                           .add(SrcReg0Sub1)
-                           .add(Src1);
-
-    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-
-    MI.eraseFromParent();
-    return BB;
-  }
-  case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-
-    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
-
-    const TargetRegisterClass *Src0SubRC =
-        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
-    
-    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
-
-    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
-
-    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
-                           .add(SrcReg0Sub0);
-    MachineInstr *HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
-                           .add(SrcReg0Sub1);
-    
-    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-
-    MI.eraseFromParent();
-    return BB;
-  }
-  case AMDGPU::V_WRITELANE_PSEUDO_B64: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-    MachineOperand &Src2 = MI.getOperand(3);
-
-    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
-    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());                                     
-    
-    const TargetRegisterClass *Src0SubRC =
-        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
-    
-    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
-
-    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
-
-    
-    const TargetRegisterClass *Src2SubRC =
-        TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
-    
-    MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
-
-    MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);        
-    
-    
-    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
-                           .add(SrcReg0Sub0)
-                           .add(Src1)
-                           .add(SrcReg2Sub0);
-    MachineInstr *HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
-                           .add(SrcReg0Sub1)
-                           .add(Src1)
-                           .add(SrcReg2Sub1);
-    
-    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-
-    MI.eraseFromParent();
-    return BB;   
-  }
+  case AMDGPU::V_READLANE_PSEUDO_B64:
+  case AMDGPU::V_READFIRSTLANE_PSEUDO_B64:
+  case AMDGPU::V_WRITELANE_PSEUDO_B64:
+      return lowerPseudoLaneOp(MI, BB, *getSubtarget(), MI.getOpcode());
   case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6a3ae238a6e84..e8ece71fe07b7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -291,22 +291,37 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI <
 
 let usesCustomInserter = 1 in {
   def V_READLANE_PSEUDO_B64 : VPseudoInstSI <
-  (outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1),
-  [(set i64:$sdst, (int_amdgcn_readlane i64:$src0, i32:$src1))]
-  >;
+  (outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1)>;
 
   def V_READFIRSTLANE_PSEUDO_B64 : VPseudoInstSI <
-  (outs SReg_64:$sdst), (ins VReg_64:$src0),
-  [(set i64:$sdst, (int_amdgcn_readfirstlane i64:$src0))]
-  >;
+  (outs SReg_64:$sdst), (ins VReg_64:$src0)>;
   
   def V_WRITELANE_PSEUDO_B64 : VPseudoInstSI <
-  (outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2),
-  [(set i64:$sdst, (int_amdgcn_writelane i64:$src0, i32:$src1, i64:$src2))]
-  > {
+  (outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)> {
     let UseNamedOperandTable = 1;
   }
-}
+} // End usesCustomInserter = 1
+
+class ReadLanePseudoPat <ValueType vt> : GCNPat <
+  (vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
+  (V_READLANE_PSEUDO_B64 VReg_64:$src0, SSrc_b32:$src1)>;
+
+def : ReadLanePseudoPat<i64>;
+def : ReadLanePseudoPat<f64>;
+
+class WriteLanePseudoPat <ValueType vt> : GCNPat <
+  (vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
+  (V_WRITELANE_PSEUDO_B64 SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)>;
+
+def : WriteLanePseudoPat<i64>;
+def : WriteLanePseudoPat<f64>;
+
+class ReadFirstLanePseudoPat <ValueType vt> : GCNPat <
+  (vt (int_amdgcn_readfirstlane vt:$src0)),
+  (V_READFIRSTLANE_PSEUDO_B64 VReg_64:$src0)>;
+
+def : ReadFirstLanePseudoPat<i64>;
+def : ReadFirstLanePseudoPat<f64>;
 
 let usesCustomInserter = 1, Defs = [SCC] in {
 def S_ADD_U64_PSEUDO : SPseudoInstSI <

>From 4e71a0649b775f40962cea4050c3a50f3ee1fa2e Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 18 Apr 2024 06:06:08 +0000
Subject: [PATCH 05/30] clang format

---
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   |  28 ++--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 151 +++++++++---------
 2 files changed, 91 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 5b3fa148e5619..0ec77d66e596d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -493,8 +493,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
       V = B.CreateBitCast(V, IntNTy);
-      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(),
-                                              {V, B.getInt32(31)});
+      Value *const Lane31 = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readlane, B.getInt32Ty(), {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane = Intrinsic::getDeclaration(
+        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+    Function *WriteLane = Intrinsic::getDeclaration(
+        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -598,8 +598,8 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
 
   // Get the value required for atomic operation
   V = B.CreateBitCast(V, IntNTy);
-  Value *LaneValue =
-      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(), {V, LaneIdxInt});
+  Value *LaneValue = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
+                                       B.getInt32Ty(), {V, LaneIdxInt});
   LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
@@ -925,10 +925,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
       Value *const ExtractHi =
           B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-      CallInst *const ReadFirstLaneLo =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractLo);
-      CallInst *const ReadFirstLaneHi =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractHi);
+      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractLo);
+      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractHi);
       Value *const PartialInsert = B.CreateInsertElement(
           PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
       Value *const Insert =
@@ -936,8 +936,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       BroadcastI = B.CreateBitCast(Insert, Ty);
     } else if (TyBitWidth == 32) {
       Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      BroadcastI =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty, CastedPhi);
+      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty,
+                                     CastedPhi);
       BroadcastI = B.CreateBitCast(BroadcastI, Ty);
 
     } else {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5d39f9e9b183d..b070ecafd950f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4822,92 +4822,95 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   return RetBB;
 }
 
-static MachineBasicBlock* lowerPseudoLaneOp(MachineInstr &MI,
+static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
                                             MachineBasicBlock *BB,
                                             const GCNSubtarget &ST,
                                             unsigned Opc) {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const SIInstrInfo *TII = ST.getInstrInfo();
 
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
+  MachineOperand &Dest = MI.getOperand(0);
+  MachineOperand &Src0 = MI.getOperand(1);
 
-    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
-    const TargetRegisterClass *Src0SubRC =
-                              TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src0SubRC =
+      TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
 
-    Register DestSub0 = MRI.createVirtualRegister(
-      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass: &AMDGPU::SGPR_32RegClass);
-    Register DestSub1 = MRI.createVirtualRegister(
-      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass: &AMDGPU::SGPR_32RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(
+      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
+                                              : &AMDGPU::SGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(
+      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
+                                              : &AMDGPU::SGPR_32RegClass);
 
+  MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+      MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
 
-    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+      MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
 
-    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+  MachineInstr *LoHalf, *HighHalf;
+  switch (Opc) {
+  case AMDGPU::V_READLANE_PSEUDO_B64: {
+    MachineOperand &Src1 = MI.getOperand(2);
+    LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
+                 .add(SrcReg0Sub0)
+                 .add(Src1);
+    HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
+                   .add(SrcReg0Sub1)
+                   .add(Src1);
+    break;
+  }
+  case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
+    LoHalf =
+        BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
+            .add(SrcReg0Sub0);
+    HighHalf =
+        BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
+            .add(SrcReg0Sub1);
+    break;
+  }
+  case AMDGPU::V_WRITELANE_PSEUDO_B64: {
+    MachineOperand &Src1 = MI.getOperand(2);
+    MachineOperand &Src2 = MI.getOperand(3);
 
-    MachineInstr *LoHalf, *HighHalf;
-    switch(Opc) {
-      case AMDGPU::V_READLANE_PSEUDO_B64: {
-        MachineOperand &Src1 = MI.getOperand(2);
-        LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
-                              .add(SrcReg0Sub0)
-                              .add(Src1);
-        HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
-                                .add(SrcReg0Sub1)
-                                .add(Src1);
-         break;
-        }
-      case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
-        LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
-                               .add(SrcReg0Sub0);
-        HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
-                                 .add(SrcReg0Sub1);
-        break;
-        }
-      case AMDGPU::V_WRITELANE_PSEUDO_B64: {
-         MachineOperand &Src1 = MI.getOperand(2);
-         MachineOperand &Src2 = MI.getOperand(3);
-
-         const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
-         const TargetRegisterClass *Src2SubRC =
-                                    TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
-
-        MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
-              MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
-
-        MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
-              MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);
-
-        LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
-                        .add(SrcReg0Sub0)
-                        .add(Src1)
-                        .add(SrcReg2Sub0);
-        HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
-                          .add(SrcReg0Sub1)
-                          .add(Src1)
-                          .add(SrcReg2Sub1);
-        break;
-        }
-        default:
-          llvm_unreachable("should not occur");
-      }
+    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+    const TargetRegisterClass *Src2SubRC =
+        TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+
+    MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
+
+    MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);
+
+    LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
+                 .add(SrcReg0Sub0)
+                 .add(Src1)
+                 .add(SrcReg2Sub0);
+    HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
+                   .add(SrcReg0Sub1)
+                   .add(Src1)
+                   .add(SrcReg2Sub1);
+    break;
+  }
+  default:
+    llvm_unreachable("should not occur");
+  }
 
-    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
+  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
 
-    TII->legalizeOperands(*LoHalf);
-    TII->legalizeOperands(*HighHalf);
+  TII->legalizeOperands(*LoHalf);
+  TII->legalizeOperands(*HighHalf);
 
-    MI.eraseFromParent();
-    return BB;
+  MI.eraseFromParent();
+  return BB;
 }
 
 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
@@ -5156,7 +5159,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::V_READLANE_PSEUDO_B64:
   case AMDGPU::V_READFIRSTLANE_PSEUDO_B64:
   case AMDGPU::V_WRITELANE_PSEUDO_B64:
-      return lowerPseudoLaneOp(MI, BB, *getSubtarget(), MI.getOpcode());
+    return lowerPseudoLaneOp(MI, BB, *getSubtarget(), MI.getOpcode());
   case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

>From c7ff0e53511f485cad9c2370a54e68c77ef2a957 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 18 Apr 2024 11:26:08 +0000
Subject: [PATCH 06/30] fix corner case with regkill and add readlane tests

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  12 ++
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    | 187 ++++++++++++++++--
 2 files changed, 180 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b070ecafd950f..79a9f451589b1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4855,9 +4855,15 @@ static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
   switch (Opc) {
   case AMDGPU::V_READLANE_PSEUDO_B64: {
     MachineOperand &Src1 = MI.getOperand(2);
+    auto IsKill = (Src1.isReg() && Src1.isKill());
+    if (IsKill)
+      Src1.setIsKill(false);
     LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
                  .add(SrcReg0Sub0)
                  .add(Src1);
+
+    if (IsKill)
+      Src1.setIsKill(true);
     HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
                    .add(SrcReg0Sub1)
                    .add(Src1);
@@ -4875,6 +4881,7 @@ static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
   case AMDGPU::V_WRITELANE_PSEUDO_B64: {
     MachineOperand &Src1 = MI.getOperand(2);
     MachineOperand &Src2 = MI.getOperand(3);
+    auto IsKill = (Src1.isReg() && Src1.isKill());
 
     const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
     const TargetRegisterClass *Src2SubRC =
@@ -4886,10 +4893,15 @@ static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
     MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
         MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);
 
+    if (IsKill)
+      Src1.setIsKill(false);
     LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
                  .add(SrcReg0Sub0)
                  .add(Src1)
                  .add(SrcReg2Sub0);
+
+    if (IsKill)
+      Src1.setIsKill(true);
     HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
                    .add(SrcReg0Sub1)
                    .add(Src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 51465f6bd10ce..49bb8ca262e85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,46 +1,141 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
-declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0
+declare double @llvm.amdgcn.readlane.f64(double, i32) #0
 
-; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
+; CHECK-LABEL: {{^}}test_readlane_sreg_sreg_i32:
 ; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_sreg_sreg(i32 %src0, i32 %src1) #1 {
-  %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+  %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %src0, i32 %src1)
   call void asm sideeffect "; use $0", "s"(i32 %readlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vreg_sreg:
+; TODO: should optimize this as for i32
+; CHECK-LABEL: {{^}}test_readlane_sreg_sreg_i64:
+; CHECK: v_mov_b32_e32 [[VREG0:v[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], {{s[0-9]+}}
+; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG0]], {{s[0-9]+}}
+; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG1]], {{s[0-9]+}}
+define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+  %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(i64 %readlane)
+  ret void
+}
+
+; TODO: should optimize this as for i32
+; CHECK-LABEL: {{^}}test_readlane_sreg_sreg_f64:
+; CHECK: v_mov_b32_e32 [[VREG0:v[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], {{s[0-9]+}}
+; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG0]], {{s[0-9]+}}
+; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG1]], {{s[0-9]+}}
+define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 {
+  %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(double %readlane)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_vreg_sreg_i32:
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_readlane_vreg_sreg(i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 {
   %vgpr = call i32 asm sideeffect "; def $0", "=v"()
-  %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 %src1)
+  %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 %src1)
   call void asm sideeffect "; use $0", "s"(i32 %readlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
+; CHECK-LABEL: {{^}}test_readlane_vreg_sreg_i64:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+  %vgpr = call i64 asm sideeffect "; def $0", "=v"()
+  %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(i64 %readlane)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_vreg_sreg_f64:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 {
+  %vgpr = call double asm sideeffect "; def $0", "=v"()
+  %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(double %readlane)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_imm_sreg_i32:
 ; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
-  %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
+define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+  %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 32, i32 %src1)
   store i32 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vregs:
+; CHECK-LABEL: {{^}}test_readlane_imm_sreg_i64:
+; CHECK-NOT: v_readlane_b32
+define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+  %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 32, i32 %src1)
+  store i64 %readlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_imm_sreg_f64:
+; CHECK-NOT: v_readlane_b32
+define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+  %readlane = call double @llvm.amdgcn.readlane.f64(double 32.0, i32 %src1)
+  store double %readlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_vregs_i32:
 ; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
-define amdgpu_kernel void @test_readlane_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x i32>, ptr addrspace(1) %gep.in
   %value = extractelement <2 x i32> %args, i32 0
   %lane = extractelement <2 x i32> %args, i32 1
-  %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
+  %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %value, i32 %lane)
   store i32 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readlane_vregs_i64:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+  %args = load <2 x i64>, ptr addrspace(1) %gep.in
+  %value = extractelement <2 x i64> %args, i32 0
+  %lane = extractelement <2 x i64> %args, i32 1
+  %lane32 = trunc i64 %lane to i32
+  %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %value, i32 %lane32)
+  store i64 %readlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_vregs_f64:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
+  %args = load <2 x double>, ptr addrspace(1) %gep.in
+  %value = extractelement <2 x double> %args, i32 0
+  %lane = extractelement <2 x double> %args, i32 1
+  %lane_cast = bitcast double %lane to i64
+  %lane32 = trunc i64 %lane_cast to i32
+  %readlane = call double @llvm.amdgcn.readlane.f64(double %value, i32 %lane32)
+  store double %readlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 ; TODO: m0 should be folded.
 ; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
 ; CHECK: s_mov_b32 m0, -1
@@ -53,16 +148,36 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vgpr_imm:
+; CHECK-LABEL: {{^}}test_readlane_vgpr_imm_i32:
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define amdgpu_kernel void @test_readlane_vgpr_imm(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 {
   %vgpr = call i32 asm sideeffect "; def $0", "=v"()
-  %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 32) #0
+  %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 32) #0
   store i32 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr:
+; CHECK-LABEL: {{^}}test_readlane_vgpr_imm_i64:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
+define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 {
+  %vgpr = call i64 asm sideeffect "; def $0", "=v"()
+  %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 32) #0
+  store i64 %readlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_vgpr_imm_f64:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
+define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 {
+  %vgpr = call double asm sideeffect "; def $0", "=v"()
+  %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 32) #0
+  store double %readlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr_i32:
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
 ; CHECK: ;;#ASMEND
@@ -70,13 +185,47 @@ define amdgpu_kernel void @test_readlane_vgpr_imm(ptr addrspace(1) %out) #1 {
 ; CHECK-NOT: readlane
 ; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
-  %readfirstlane = call i32 @llvm.amdgcn.readlane(i32 %sgpr, i32 7)
+  %readfirstlane = call i32 @llvm.amdgcn.readlane.i32(i32 %sgpr, i32 7)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
+; TODO: should optimize this as for i32
+; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr_i64:
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
+; CHECK: ;;#ASMEND
+; CHECK: v_readlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}, 7
+; CHECK: v_readlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}, 7
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
+; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+  %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
+  %readfirstlane = call i64 @llvm.amdgcn.readlane.i64(i64 %sgpr, i32 7)
+  store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; TODO: should optimize this as for i32
+; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr_f64:
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
+; CHECK: ;;#ASMEND
+; CHECK: v_readlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}, 7
+; CHECK: v_readlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}, 7
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
+; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+  %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
+  %readfirstlane = call double @llvm.amdgcn.readlane.f64(double %sgpr, i32 7)
+  store double %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }

>From d6a8ce4c5f54b19cef43ffb470698689ab6dfee6 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 19 Apr 2024 07:13:44 +0000
Subject: [PATCH 07/30] update builtin handling for readlane and readfirstlane

---
 clang/lib/CodeGen/CGBuiltin.cpp             | 18 ++++++++++++++++++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl |  4 ++--
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td    |  3 ---
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a05874e63c73c..980d8c0887c4e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18410,6 +18410,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
     return Builder.CreateCall(F, Args);
   }
+  case AMDGPU::BI__builtin_amdgcn_readlane:
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane: {
+    llvm::SmallVector<llvm::Value *, 6> Args;
+    unsigned ICEArguments = 0;
+    ASTContext::GetBuiltinTypeError Error;
+    Intrinsic::ID IID = (BuiltinID == AMDGPU::BI__builtin_amdgcn_readlane)
+                            ? Intrinsic::amdgcn_readlane
+                            : Intrinsic::amdgcn_readfirstlane;
+
+    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
+    assert(Error == ASTContext::GE_None && "Should not codegen an error");
+    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
+      Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
+    }
+
+    Function *F = CGM.getIntrinsic(IID, Args[0]->getType());
+    return Builder.CreateCall(F, Args);
+  }
   case AMDGPU::BI__builtin_amdgcn_div_fixup:
   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bdca97c887867..f93d5ac29a2cc 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)
 }
 
 // CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
 }
 
 // CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
 void test_readlane(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_readlane(a, b);
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 34feee1c56be8..2afdfa1b1273d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2176,14 +2176,12 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 
 def int_amdgcn_readfirstlane :
-  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
-  ClangBuiltin<"__builtin_amdgcn_readlane">,
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
@@ -2191,7 +2189,6 @@ def int_amdgcn_readlane :
 // currently active threads of the current wave. Otherwise, the result is
 // undefined.
 def int_amdgcn_writelane :
-  ClangBuiltin<"__builtin_amdgcn_writelane">,
   Intrinsic<[llvm_any_ty], [
     LLVMMatchType<0>,    // uniform value to write: returned by the selected lane
     llvm_i32_ty,        // uniform lane select

>From 15cbd9052fa974bb0cce0a58bb0b14a2a99164d1 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 19 Apr 2024 19:04:23 +0000
Subject: [PATCH 08/30] add and update tests, fixes to writelane src0 imm
 handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   4 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       | 119 +++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 199 ++++++++++++++++--
 3 files changed, 288 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 79a9f451589b1..ae928a6813a84 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4834,7 +4834,8 @@ static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
   MachineOperand &Dest = MI.getOperand(0);
   MachineOperand &Src0 = MI.getOperand(1);
 
-  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src0RC =
+      Src0.isReg() ? MRI.getRegClass(Src0.getReg()) : &AMDGPU::SReg_64RegClass;
   const TargetRegisterClass *Src0SubRC =
       TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
 
@@ -4895,6 +4896,7 @@ static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
 
     if (IsKill)
       Src1.setIsKill(false);
+
     LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
                  .add(SrcReg0Sub0)
                  .add(Src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 0284f44f5f14d..c5e0e9ffd3a9b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,35 +1,96 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0
+declare double @llvm.amdgcn.readfirstlane.f64(double) #0
 
-; CHECK-LABEL: {{^}}test_readfirstlane:
+; CHECK-LABEL: {{^}}test_readfirstlane_i32:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm:
+; CHECK-LABEL: {{^}}test_readfirstlane_i64:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
+define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 {
+  %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %src)
+  store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_f64:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
+define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 {
+  %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %src)
+  store double %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_imm_i32:
 ; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
 ; CHECK-NOT: [[SGPR_VAL]]
 ; CHECK: ; use [[SGPR_VAL]]
-define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold:
+; CHECK-LABEL: {{^}}test_readfirstlane_imm_i64:
+; CHECK: s_mov_b64 [[SGPR_VAL:s\[[0-9]+:[0-9]+\]]], 32
+; CHECK: use [[SGPR_VAL]]
+define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 {
+  %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
+  call void asm sideeffect "; use $0", "s"(i64 %readfirstlane)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_imm_f64:
+; CHECK: s_mov_b32 s[[VAL0:[0-9]+]], 0
+; CHECK: s_mov_b32 s[[VAL1:[0-9]+]], 0x40400000
+; use s[[VAL0\:VAL1]]
+define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 {
+  %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
+  call void asm sideeffect "; use $0", "s"(double %readfirstlane)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold_i32:
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
 ; CHECK-NOT: [[VVAL]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold_i64:
+; CHECK: s_mov_b64 s[[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]], 32
+; CHECK: v_mov_b32_e32 v[[RES0:[0-9]+]], s[[VAL0]]
+; CHECK: v_mov_b32_e32 v[[RES1:[0-9]+]], s[[VAL1]]
+; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RES0]]:[[RES1]]]
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 {
+  %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
+  store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK: s_mov_b32 s[[VAL0:[0-9]+]], 0
+; CHECK: s_mov_b32 s[[VAL1:[0-9]+]], 0x40400000
+; CHECK: v_mov_b32_e32 v[[RES0:[0-9]+]], s[[VAL0]]
+; CHECK: v_mov_b32_e32 v[[RES1:[0-9]+]], s[[VAL1]]
+; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RES0]]:[[RES1]]]
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 {
+  %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
+  store double %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 ; CHECK-LABEL: {{^}}test_readfirstlane_m0:
 ; CHECK: s_mov_b32 m0, -1
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
@@ -41,7 +102,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr:
+; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
 ; CHECK: ;;#ASMEND
@@ -49,13 +110,47 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
 ; CHECK-NOT: readfirstlane
 ; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
+; TODO: should optimize this as for i32
+; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr_i64:
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
+; CHECK: ;;#ASMEND
+; CHECK: v_readfirstlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}
+; CHECK: v_readfirstlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
+; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+  %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
+  %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %sgpr)
+  store i64 %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; TODO: should optimize this as for i32
+; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr_f64:
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
+; CHECK: ;;#ASMEND
+; CHECK: v_readfirstlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}
+; CHECK: v_readfirstlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
+; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
+; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+  %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
+  %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %sgpr)
+  store double %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 ; Make sure this doesn't crash.
 ; CHECK-LABEL: {{^}}test_readfirstlane_fi:
 ; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 37951669dbe75..9e7f5eb001a21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -4,82 +4,239 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
+declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0
 
-; CHECK-LABEL: {{^}}test_writelane_sreg:
+; CHECK-LABEL: {{^}}test_writelane_sreg_i32:
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
 ; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
   %oldval = load i32, ptr addrspace(1) %out
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
+; CHECK-LABEL: {{^}}test_writelane_sreg_i64:
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
+; CIGFX9-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX10-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+  %oldval = load i64, ptr addrspace(1) %out
+  %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
+  store i64 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_sreg_f64:
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
+; CIGFX9-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX10-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+  %oldval = load double, ptr addrspace(1) %out
+  %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
+  store double %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg_i32:
 ; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
   %oldval = load i32, ptr addrspace(1) %out
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg_i64:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
+; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, 0, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+  %oldval = load i64, ptr addrspace(1) %out
+  %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval)
+  store i64 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; TODO: fold both SGPR's
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg_f64:
+; CHECK: s_mov_b32 [[SGPR:s[0-9]+]], 0x40400000
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, 0, m0
+; CIGFX9-NEXT: v_writelane_b32 v{{[0-9]+}}, [[SGPR]], m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, 0, s{{[0-9]+}}
+; GFX10-NEXT: v_writelane_b32 v{{[0-9]+}}, [[SGPR]], s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+  %oldval = load double, ptr addrspace(1) %out
+  %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval)
+  store double %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane_i32:
 ; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
 ; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
-define amdgpu_kernel void @test_writelane_vreg_lane(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x i32>, ptr addrspace(1) %gep.in
   %oldval = load i32, ptr addrspace(1) %out
   %lane = extractelement <2 x i32> %args, i32 1
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 12, i32 %lane, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane_i64:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
+; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, 0, [[LANE]]
+define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+  %args = load <2 x i64>, ptr addrspace(1) %gep.in
+  %oldval = load i64, ptr addrspace(1) %out
+  %lane = extractelement <2 x i64> %args, i32 1
+  %lane32 = trunc i64 %lane to i32
+  %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 12, i32 %lane32, i64 %oldval)
+  store i64 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; TODO: fold both SGPR's
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane_f64:
+; CHECK: s_mov_b32 [[SGPR:s[0-9]+]], 0x40280000
+; CHECK: v_readfirstlane_b32 [[LANE:.*]], v{{[0-9]+}}
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, [[LANE]]
+; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, [[SGPR]], [[LANE]]
+define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
+  %args = load <2 x double>, ptr addrspace(1) %gep.in
+  %oldval = load double, ptr addrspace(1) %out
+  %lane = extractelement <2 x double> %args, i32 1
+  %lane_cast = bitcast double %lane to i64
+  %lane32 = trunc i64 %lane_cast to i32
+  %writelane = call double @llvm.amdgcn.writelane.f64(double 12.0, i32 %lane32, double %oldval)
+  store double %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_m0_sreg_i32:
 ; CHECK: s_mov_b32 m0, -1
 ; CIGFX9: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
 ; GFX10: v_writelane_b32 v{{[0-9]+}}, m0, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
   %oldval = load i32, ptr addrspace(1) %out
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %m0, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm:
+; CHECK-LABEL: {{^}}test_writelane_imm_i32:
 ; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
-define amdgpu_kernel void @test_writelane_imm(ptr addrspace(1) %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 {
   %oldval = load i32, ptr addrspace(1) %out
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
+; CHECK-LABEL: {{^}}test_writelane_imm_i64:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 {
+  %oldval = load i64, ptr addrspace(1) %out
+  %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0
+  store i64 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_f64:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 {
+  %oldval = load double, ptr addrspace(1) %out
+  %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0
+  store double %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_sreg_oldval_i32:
 ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
 ; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
 ; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
+; CHECK-LABEL: {{^}}test_writelane_sreg_oldval_i64:
+; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
+; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
+; GFX10: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+  %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
+  store i64 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_sreg_oldval_f64:
+; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
+; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
+; GFX10: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+  %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
+  store double %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_oldval_i32:
 ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
 ; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
 ; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_oldval(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
-  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
+define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+  %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_writelane_imm_oldval_i64:
+; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], 42
+; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], 0
+; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
+; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
+; GFX10-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+  %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42)
+  store i64 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_oldval_f64:
+; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], 0
+; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], 0x40450000
+; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
+; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
+; GFX10-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+  %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0)
+  store double %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }

>From 776a4c667044edb75da9bfc21d88e54cac9b221e Mon Sep 17 00:00:00 2001
From: vikramRH <vikhegde at amd.com>
Date: Mon, 22 Apr 2024 09:58:11 +0000
Subject: [PATCH 09/30] address review comments

---
 clang/lib/CodeGen/CGBuiltin.cpp | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 980d8c0887c4e..2def06a9a3880 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18411,23 +18411,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(F, Args);
   }
   case AMDGPU::BI__builtin_amdgcn_readlane:
-  case AMDGPU::BI__builtin_amdgcn_readfirstlane: {
-    llvm::SmallVector<llvm::Value *, 6> Args;
-    unsigned ICEArguments = 0;
-    ASTContext::GetBuiltinTypeError Error;
-    Intrinsic::ID IID = (BuiltinID == AMDGPU::BI__builtin_amdgcn_readlane)
-                            ? Intrinsic::amdgcn_readlane
-                            : Intrinsic::amdgcn_readfirstlane;
-
-    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
-    assert(Error == ASTContext::GE_None && "Should not codegen an error");
-    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
-      Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
-    }
-
-    Function *F = CGM.getIntrinsic(IID, Args[0]->getType());
-    return Builder.CreateCall(F, Args);
-  }
+    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
   case AMDGPU::BI__builtin_amdgcn_div_fixup:
   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
   case AMDGPU::BI__builtin_amdgcn_div_fixuph:

>From 82da530567a2e5facfdac0ee7707527a15289bb8 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 2 May 2024 17:07:30 +0000
Subject: [PATCH 10/30] Implement lowering in legalizer for legal types

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |    3 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |    4 +
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |   16 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   92 +
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |    3 +
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |    3 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  177 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   37 +-
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |    2 +-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |    4 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  365 ++-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  593 +++-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 2585 ++++++++++++++++-
 13 files changed, 3493 insertions(+), 391 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index db69d50799e70..b2f4e9c3eda59 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5459,6 +5459,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LDS)
   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
+  NODE_NAME_CASE(READLANE)
+  NODE_NAME_CASE(READFIRSTLANE)
+  NODE_NAME_CASE(WRITELANE)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f10a357125e56..624f0ffa50844 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -541,6 +541,10 @@ enum NodeType : unsigned {
   FPTRUNC_ROUND_UPWARD,
   FPTRUNC_ROUND_DOWNWARD,
 
+  READLANE,
+  READFIRSTLANE,
+  WRITELANE,
+
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LOAD_D16_HI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 82f58ea38fd0a..a591fe76ff48e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,6 +342,10 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
 
 def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
 
+def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", SDTIntBinOp>;
+def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", SDTIntUnaryOp>;
+def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDTIntTernaryOp>;
+
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
   SDTCisInt<0>,       // i8 tgt
@@ -504,3 +508,15 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
 def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
   [(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
    (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1),
+  [(int_amdgcn_readlane node:$src0, node:$src1),
+   (AMDGPUreadlane_impl node:$src0, node:$src1)]>;
+
+def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
+  [(int_amdgcn_readfirstlane node:$src),
+   (AMDGPUreadfirstlane_impl node:$src)]>;
+
+def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
+   (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e55d1de01b4fd..8f0286164a7f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5386,6 +5386,94 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32)
+    return true;
+
+  if (Size < 32) {
+    auto Ext = B.buildAnyExt(LLT::scalar(32), Src0).getReg(0);
+    auto LaneOpDst =
+        B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32}).addUse(Ext);
+    if (IID == Intrinsic::amdgcn_readlane ||
+        IID == Intrinsic::amdgcn_writelane) {
+      auto Src1 = MI.getOperand(3).getReg();
+      LaneOpDst = LaneOpDst.addUse(Src1);
+      if (IID == Intrinsic::amdgcn_writelane) {
+        auto Src2 = MI.getOperand(4).getReg();
+        auto Ext2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+        LaneOpDst = LaneOpDst.addUse(Ext2);
+      }
+    }
+    B.buildTrunc(DstReg, LaneOpDst).getReg(0);
+  } else if ((Size % 32) == 0) {
+    SmallVector<Register, 2> Src0Parts, PartialRes;
+    unsigned NumParts = Size / 32;
+    auto WideReg = MRI.createGenericVirtualRegister(LLT::scalar(NumParts * 32));
+    for (unsigned i = 0; i < NumParts; ++i) {
+      Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
+    }
+
+    B.buildUnmerge(Src0Parts, Src0);
+
+    switch (IID) {
+    case Intrinsic::amdgcn_readlane: {
+      auto Src1 = MI.getOperand(3).getReg();
+      for (unsigned i = 0; i < NumParts; ++i)
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
+                 .addUse(Src0Parts[i])
+                 .addUse(Src1))
+                .getReg(0));
+      break;
+    }
+    case Intrinsic::amdgcn_readfirstlane: {
+
+      for (unsigned i = 0; i < NumParts; ++i)
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
+                 .addUse(Src0Parts[i]))
+                .getReg(0));
+
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      auto Src1 = MI.getOperand(3).getReg();
+      auto Src2 = MI.getOperand(4).getReg();
+      SmallVector<Register, 2> Src2Parts;
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src2Parts.push_back(MRI.createGenericVirtualRegister(S32));
+      }
+      B.buildUnmerge(Src2Parts, Src2);
+
+      for (unsigned i = 0; i < NumParts; ++i)
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
+                 .addUse(Src0Parts[i])
+                 .addUse(Src1)
+                 .addUse(Src2Parts[i]))
+                .getReg(0));
+    }
+    }
+    B.buildMergeLikeInstr(DstReg, PartialRes);
+  } else
+    return false;
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
@@ -7319,6 +7407,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     Observer.changedInstr(MI);
     return true;
   }
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_readfirstlane:
+    return legalizeLaneOp(Helper, MI, IntrID);
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8..40e056154527f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -208,6 +208,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                             Intrinsic::ID IID) const;
 
+  bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+                      Intrinsic::ID IID) const;
+
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
 
   bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index d722b6fb56bcc..8b21c22b44971 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -691,8 +691,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
 
         break;
       }
-      case AMDGPU::V_WRITELANE_B32:
-      case AMDGPU::V_WRITELANE_PSEUDO_B64: {
+      case AMDGPU::V_WRITELANE_B32: {
         // Some architectures allow more than one constant bus access without
         // SGPR restriction
         if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ae928a6813a84..83f77c916834b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4822,111 +4822,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   return RetBB;
 }
 
-static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
-                                            MachineBasicBlock *BB,
-                                            const GCNSubtarget &ST,
-                                            unsigned Opc) {
-  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  const DebugLoc &DL = MI.getDebugLoc();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
-  MachineOperand &Dest = MI.getOperand(0);
-  MachineOperand &Src0 = MI.getOperand(1);
-
-  const TargetRegisterClass *Src0RC =
-      Src0.isReg() ? MRI.getRegClass(Src0.getReg()) : &AMDGPU::SReg_64RegClass;
-  const TargetRegisterClass *Src0SubRC =
-      TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
-
-  Register DestSub0 = MRI.createVirtualRegister(
-      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
-                                              : &AMDGPU::SGPR_32RegClass);
-  Register DestSub1 = MRI.createVirtualRegister(
-      (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
-                                              : &AMDGPU::SGPR_32RegClass);
-
-  MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
-      MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
-
-  MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
-      MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
-
-  MachineInstr *LoHalf, *HighHalf;
-  switch (Opc) {
-  case AMDGPU::V_READLANE_PSEUDO_B64: {
-    MachineOperand &Src1 = MI.getOperand(2);
-    auto IsKill = (Src1.isReg() && Src1.isKill());
-    if (IsKill)
-      Src1.setIsKill(false);
-    LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
-                 .add(SrcReg0Sub0)
-                 .add(Src1);
-
-    if (IsKill)
-      Src1.setIsKill(true);
-    HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
-                   .add(SrcReg0Sub1)
-                   .add(Src1);
-    break;
-  }
-  case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
-    LoHalf =
-        BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
-            .add(SrcReg0Sub0);
-    HighHalf =
-        BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
-            .add(SrcReg0Sub1);
-    break;
-  }
-  case AMDGPU::V_WRITELANE_PSEUDO_B64: {
-    MachineOperand &Src1 = MI.getOperand(2);
-    MachineOperand &Src2 = MI.getOperand(3);
-    auto IsKill = (Src1.isReg() && Src1.isKill());
-
-    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
-    const TargetRegisterClass *Src2SubRC =
-        TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
-
-    MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
-
-    MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
-        MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);
-
-    if (IsKill)
-      Src1.setIsKill(false);
-
-    LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
-                 .add(SrcReg0Sub0)
-                 .add(Src1)
-                 .add(SrcReg2Sub0);
-
-    if (IsKill)
-      Src1.setIsKill(true);
-    HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
-                   .add(SrcReg0Sub1)
-                   .add(Src1)
-                   .add(SrcReg2Sub1);
-    break;
-  }
-  default:
-    llvm_unreachable("should not occur");
-  }
-
-  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-      .addReg(DestSub0)
-      .addImm(AMDGPU::sub0)
-      .addReg(DestSub1)
-      .addImm(AMDGPU::sub1);
-
-  TII->legalizeOperands(*LoHalf);
-  TII->legalizeOperands(*HighHalf);
-
-  MI.eraseFromParent();
-  return BB;
-}
-
 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   MachineInstr &MI, MachineBasicBlock *BB) const {
 
@@ -5170,10 +5065,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::V_READLANE_PSEUDO_B64:
-  case AMDGPU::V_READFIRSTLANE_PSEUDO_B64:
-  case AMDGPU::V_WRITELANE_PSEUDO_B64:
-    return lowerPseudoLaneOp(MI, BB, *getSubtarget(), MI.getOpcode());
   case AMDGPU::SI_INIT_M0: {
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
@@ -6091,6 +5982,70 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
 }
 
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+                           SelectionDAG &DAG) {
+  auto VT = N->getValueType(0);
+  unsigned ValSize = VT.getSizeInBits();
+  unsigned IntrinsicID = N->getConstantOperandVal(0);
+  SDValue Src0 = N->getOperand(1);
+  SDLoc SL(N);
+  MVT IntVT = MVT::getIntegerVT(ValSize);
+
+  auto createLaneOp = [&](SDValue &Src0, SDValue &Src1, SDValue &Src2,
+                          MVT VT) -> SDValue {
+    return (Src2.getNode()
+                ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
+            : Src1.getNode()
+                ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
+                : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+  };
+
+  SDValue Src1, Src2, Src0Valid, Src2Valid;
+  if (IntrinsicID == Intrinsic::amdgcn_readlane ||
+      IntrinsicID == Intrinsic::amdgcn_writelane) {
+    Src1 = N->getOperand(2);
+    if (IntrinsicID == Intrinsic::amdgcn_writelane)
+      Src2 = N->getOperand(3);
+  }
+
+  if (ValSize == 32) {
+    if (VT == MVT::i32)
+      // Already legal
+      return SDValue();
+    Src0Valid = DAG.getBitcast(IntVT, Src0);
+    if (Src2.getNode())
+      Src2Valid = DAG.getBitcast(IntVT, Src2);
+    auto LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
+    return DAG.getBitcast(VT, LaneOp);
+  }
+
+  if (ValSize < 32) {
+    auto InitBitCast = DAG.getBitcast(IntVT, Src0);
+    Src0Valid = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
+    if (Src2.getNode()) {
+      auto Src2Cast = DAG.getBitcast(IntVT, Src2);
+      Src2Valid = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
+    }
+    auto LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
+    auto Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+    return DAG.getBitcast(VT, Trunc);
+  }
+
+  if ((ValSize % 32) == 0) {
+    MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+    Src0Valid = DAG.getBitcast(VecVT, Src0);
+
+    if (Src2.getNode())
+      Src2Valid = DAG.getBitcast(VecVT, Src2);
+
+    auto LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, VecVT);
+    auto UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+    return DAG.getBitcast(VT, UnrolledLaneOp);
+  }
+
+  return SDValue();
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
@@ -8553,6 +8508,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_addrspacecast_nonnull:
     return lowerADDRSPACECAST(Op, DAG);
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_writelane:
+    return lowerLaneOp(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8ece71fe07b7..6a3b6bf83f096 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -288,41 +288,6 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI <
 >;
 } // End usesCustomInserter = 1, Defs = [VCC]
 
-
-let usesCustomInserter = 1 in {
-  def V_READLANE_PSEUDO_B64 : VPseudoInstSI <
-  (outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1)>;
-
-  def V_READFIRSTLANE_PSEUDO_B64 : VPseudoInstSI <
-  (outs SReg_64:$sdst), (ins VReg_64:$src0)>;
-  
-  def V_WRITELANE_PSEUDO_B64 : VPseudoInstSI <
-  (outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)> {
-    let UseNamedOperandTable = 1;
-  }
-} // End usesCustomInserter = 1
-
-class ReadLanePseudoPat <ValueType vt> : GCNPat <
-  (vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
-  (V_READLANE_PSEUDO_B64 VReg_64:$src0, SSrc_b32:$src1)>;
-
-def : ReadLanePseudoPat<i64>;
-def : ReadLanePseudoPat<f64>;
-
-class WriteLanePseudoPat <ValueType vt> : GCNPat <
-  (vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
-  (V_WRITELANE_PSEUDO_B64 SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)>;
-
-def : WriteLanePseudoPat<i64>;
-def : WriteLanePseudoPat<f64>;
-
-class ReadFirstLanePseudoPat <ValueType vt> : GCNPat <
-  (vt (int_amdgcn_readfirstlane vt:$src0)),
-  (V_READFIRSTLANE_PSEUDO_B64 VReg_64:$src0)>;
-
-def : ReadFirstLanePseudoPat<i64>;
-def : ReadFirstLanePseudoPat<f64>;
-
 let usesCustomInserter = 1, Defs = [SCC] in {
 def S_ADD_U64_PSEUDO : SPseudoInstSI <
   (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
@@ -3440,7 +3405,7 @@ def : GCNPat<
 // FIXME: Should also do this for readlane, but tablegen crashes on
 // the ignored src1.
 def : GCNPat<
-  (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
+  (i32 (AMDGPUreadfirstlane (i32 imm:$src))),
   (S_MOV_B32 SReg_32:$src)
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 0ee80f45c9160..e9059c617ee9f 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -243,7 +243,7 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
 // FIXME: Specify SchedRW for READFIRSTLANE_B32
 // TODO: There is VOP3 encoding also
 def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
-                                       getVOP1Pat<int_amdgcn_readfirstlane,
+                                       getVOP1Pat<AMDGPUreadfirstlane,
                                                   VOP_READFIRSTLANE>.ret, 1> {
   let isConvergent = 1;
 }
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index c001c5de81e0b..34dee083c33a3 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -781,10 +781,10 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
 // These are special and do not read the exec mask.
 let isConvergent = 1, Uses = []<Register> in {
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
-  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
+  [(set i32:$vdst, (AMDGPUreadlane i32:$src0, i32:$src1))]>;
 let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
 def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
-  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+  [(set i32:$vdst, (AMDGPUwritelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
 } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index c5e0e9ffd3a9b..08447f2a395ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,160 +1,387 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
 declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0
 declare double @llvm.amdgcn.readfirstlane.f64(double) #0
 
-; CHECK-LABEL: {{^}}test_readfirstlane_i32:
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
 define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_i64:
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
 define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %src)
   store i64 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_f64:
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
 define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %src)
   store double %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_i32:
-; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
-; CHECK-NOT: [[SGPR_VAL]]
-; CHECK: ; use [[SGPR_VAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_mov_b32 s0, 32
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_mov_b32 s0, 32
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_i64:
-; CHECK: s_mov_b64 [[SGPR_VAL:s\[[0-9]+:[0-9]+\]]], 32
-; CHECK: use [[SGPR_VAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_mov_b64 s[0:1], 32
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_mov_b64 s[0:1], 32
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
   call void asm sideeffect "; use $0", "s"(i64 %readfirstlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_f64:
-; CHECK: s_mov_b32 s[[VAL0:[0-9]+]], 0
-; CHECK: s_mov_b32 s[[VAL1:[0-9]+]], 0x40400000
-; use s[[VAL0\:VAL1]]
 define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_mov_b32 s0, 0
+; CHECK-SDAG-NEXT:    s_mov_b32 s1, 0x40400000
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT:    s_mov_b32 s1, 0x40400000
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
   call void asm sideeffect "; use $0", "s"(double %readfirstlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold_i32:
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
-; CHECK-NOT: [[VVAL]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold_i64:
-; CHECK: s_mov_b64 s[[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]], 32
-; CHECK: v_mov_b32_e32 v[[RES0:[0-9]+]], s[[VAL0]]
-; CHECK: v_mov_b32_e32 v[[RES1:[0-9]+]], s[[VAL1]]
-; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RES0]]:[[RES1]]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32)
   store i64 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK: s_mov_b32 s[[VAL0:[0-9]+]], 0
-; CHECK: s_mov_b32 s[[VAL1:[0-9]+]], 0x40400000
-; CHECK: v_mov_b32_e32 v[[RES0:[0-9]+]], s[[VAL0]]
-; CHECK: v_mov_b32_e32 v[[RES1:[0-9]+]], s[[VAL1]]
-; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RES0]]:[[RES1]]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0)
   store double %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_m0:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_m0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_m0:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr_i32:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readfirstlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; TODO: should optimize this as for i32
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr_i64:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
-; CHECK: ;;#ASMEND
-; CHECK: v_readfirstlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}
-; CHECK: v_readfirstlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
-; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
   %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %sgpr)
   store i64 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; TODO: should optimize this as for i32
-; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr_f64:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
-; CHECK: ;;#ASMEND
-; CHECK: v_readfirstlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}
-; CHECK: v_readfirstlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
-; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
   %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %sgpr)
   store double %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; Make sure this doesn't crash.
-; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readfirstlane_fi:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-SDAG-NEXT:    s_mov_b32 s4, 0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_fi:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-GISEL-NEXT:    s_mov_b32 s4, 0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint ptr addrspace(5) %alloca to i32
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 49bb8ca262e85..7d2454182e8ec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,98 +1,301 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0
 declare double @llvm.amdgcn.readlane.f64(double, i32) #0
 
-; CHECK-LABEL: {{^}}test_readlane_sreg_sreg_i32:
-; CHECK-NOT: v_readlane_b32
 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %src0, i32 %src1)
   call void asm sideeffect "; use $0", "s"(i32 %readlane)
   ret void
 }
 
-; TODO: should optimize this as for i32
-; CHECK-LABEL: {{^}}test_readlane_sreg_sreg_i64:
-; CHECK: v_mov_b32_e32 [[VREG0:v[0-9]+]], {{s[0-9]+}}
-; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], {{s[0-9]+}}
-; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG0]], {{s[0-9]+}}
-; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG1]], {{s[0-9]+}}
 define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1)
   call void asm sideeffect "; use $0", "s"(i64 %readlane)
   ret void
 }
 
-; TODO: should optimize this as for i32
-; CHECK-LABEL: {{^}}test_readlane_sreg_sreg_f64:
-; CHECK: v_mov_b32_e32 [[VREG0:v[0-9]+]], {{s[0-9]+}}
-; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], {{s[0-9]+}}
-; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG0]], {{s[0-9]+}}
-; CHECK: v_readlane_b32 {{s[0-9]+}}, [[VREG1]], {{s[0-9]+}}
 define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1)
   call void asm sideeffect "; use $0", "s"(double %readlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vreg_sreg_i32:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; def v0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; def v0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %vgpr = call i32 asm sideeffect "; def $0", "=v"()
   %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 %src1)
   call void asm sideeffect "; use $0", "s"(i32 %readlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vreg_sreg_i64:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x8
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; def v[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dword s1, s[4:5], 0x8
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; def v[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readlane_b32 s0, v0, s1
+; CHECK-GISEL-NEXT:    v_readlane_b32 s1, v1, s1
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %vgpr = call i64 asm sideeffect "; def $0", "=v"()
   %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 %src1)
   call void asm sideeffect "; use $0", "s"(i64 %readlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vreg_sreg_f64:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x8
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; def v[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dword s1, s[4:5], 0x8
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; def v[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readlane_b32 s0, v0, s1
+; CHECK-GISEL-NEXT:    v_readlane_b32 s1, v1, s1
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_endpgm
   %vgpr = call double asm sideeffect "; def $0", "=v"()
   %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 %src1)
   call void asm sideeffect "; use $0", "s"(double %readlane)
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_imm_sreg_i32:
-; CHECK-NOT: v_readlane_b32
 define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 32, i32 %src1)
   store i32 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_imm_sreg_i64:
-; CHECK-NOT: v_readlane_b32
 define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 32, i32 %src1)
   store i64 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_imm_sreg_f64:
-; CHECK-NOT: v_readlane_b32
 define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %readlane = call double @llvm.amdgcn.readlane.f64(double 32.0, i32 %src1)
   store double %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vregs_i32:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
 define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    flat_store_dword v[2:3], v0
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s2, v1
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x i32>, ptr addrspace(1) %gep.in
@@ -103,11 +306,49 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vregs_i64:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
 define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v4, s1
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s3, v2
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, s3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x i64>, ptr addrspace(1) %gep.in
@@ -119,11 +360,49 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vregs_f64:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
 define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vregs_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; CHECK-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-SDAG-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v4, s1
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s1, v1, s0
+; CHECK-SDAG-NEXT:    v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vregs_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; CHECK-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s3, v2
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, s3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x double>, ptr addrspace(1) %gep.in
@@ -136,90 +415,242 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-; TODO: m0 should be folded.
-; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
-; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
   store i32 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vgpr_imm_i32:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
 define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; def v0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; def v0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %vgpr = call i32 asm sideeffect "; def $0", "=v"()
   %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 32) #0
   store i32 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vgpr_imm_i64:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
 define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; def v[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
+; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; def v[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %vgpr = call i64 asm sideeffect "; def $0", "=v"()
   %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 32) #0
   store i64 %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_vgpr_imm_f64:
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
 define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; def v[0:1]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
+; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; def v[0:1]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %vgpr = call double asm sideeffect "; def $0", "=v"()
   %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 32) #0
   store double %readlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr_i32:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
-; CHECK: ;;#ASMEND
-; CHECK-NOT: [[SGPR]]
-; CHECK-NOT: readlane
-; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
-; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
 define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT:    s_endpgm
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
   %readfirstlane = call i32 @llvm.amdgcn.readlane.i32(i32 %sgpr, i32 7)
   store i32 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; TODO: should optimize this as for i32
-; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr_i64:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
-; CHECK: ;;#ASMEND
-; CHECK: v_readlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}, 7
-; CHECK: v_readlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}, 7
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
-; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"()
   %readfirstlane = call i64 @llvm.amdgcn.readlane.i64(i64 %sgpr, i32 7)
   store i64 %readfirstlane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; TODO: should optimize this as for i32
-; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr_f64:
-; CHECK: ;;#ASMSTART
-; CHECK-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}
-; CHECK: ;;#ASMEND
-; CHECK: v_readlane_b32 [[SGPR0:s[0-9]+]], {{v[0-9]+}}, 7
-; CHECK: v_readlane_b32 [[SGPR1:s[0-9]+]], {{v[0-9]+}}, 7
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR0]]
-; CHECK: v_mov_b32_e32 {{v[0-9]+}}, [[SGPR1]]
-; CHECK: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT:    s_endpgm
+;
+; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-GISEL-NEXT:    s_endpgm
   %sgpr = call double asm "s_mov_b64 $0, 0", "=s"()
   %readfirstlane = call double @llvm.amdgcn.readlane.f64(double %sgpr, i32 7)
   store double %readfirstlane, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 9e7f5eb001a21..f5f8fa3907f97 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,83 +1,993 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX700-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX700-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
 declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
 declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_i32:
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_i64:
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; CIGFX9-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x4
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x10
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s2
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x10
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s2
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i64, ptr addrspace(1) %out
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
   store i64 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_f64:
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; CIGFX9-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x4
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x10
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s2
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x10
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s2
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load double, ptr addrspace(1) %out
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
   store double %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_sreg_i32:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, 32, s2
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, 32, s2
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, 32, s2
+; GFX1010-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 32, s0
+; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, 32, s2
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, 32, s2
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, 32, s2
+; GFX1010-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, 32, s0
+; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_sreg_i64:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
-; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, 0, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, 0, s4
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 32, s4
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, 0, s4
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, 32, s4
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, 0, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, 32, s6
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[0:1], 0x8
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, 0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 32, s4
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, 32, s4
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, 0, s4
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, 32, s4
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, 0, s4
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, 32, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, 0, s6
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s4, s[0:1], 0x8
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, 32, s4
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 0, s4
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i64, ptr addrspace(1) %out
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval)
   store i64 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; TODO: fold both SGPR's
-; CHECK-LABEL: {{^}}test_writelane_imm_sreg_f64:
-; CHECK: s_mov_b32 [[SGPR:s[0-9]+]], 0x40400000
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, 0, m0
-; CIGFX9-NEXT: v_writelane_b32 v{{[0-9]+}}, [[SGPR]], m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, 0, s{{[0-9]+}}
-; GFX10-NEXT: v_writelane_b32 v{{[0-9]+}}, [[SGPR]], s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX700-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 0, s4
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX802-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, 0, s4
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT:    s_mov_b32 s2, 0x40400000
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s2, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, 0, s6
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[0:1], 0x8
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-SDAG-NEXT:    s_mov_b32 s0, 0x40400000
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 0, s4
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX700-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, 0, s4
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX802-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, 0, s4
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-GISEL-NEXT:    s_mov_b32 s2, 0x40400000
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, 0, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s2, s6
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s4, s[0:1], 0x8
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1100-GISEL-NEXT:    s_mov_b32 s0, 0x40400000
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, 0, s4
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s0, s4
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load double, ptr addrspace(1) %out
   %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval)
   store double %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_vreg_lane_i32:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
 define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX700-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-SDAG-NEXT:    flat_load_dword v0, v[0:1]
+; GFX700-SDAG-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX700-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX700-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    s_nop 2
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, 12, s2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dword v0, v[0:1]
+; GFX802-SDAG-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    s_nop 2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, 12, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
+; GFX1010-SDAG-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX1010-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4
+; GFX1100-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX1100-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-GISEL-NEXT:    flat_load_dword v0, v[0:1]
+; GFX700-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX700-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX700-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    s_nop 2
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, 12, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    flat_load_dword v0, v[0:1]
+; GFX802-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    s_nop 2
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, 12, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
+; GFX1010-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX1010-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4
+; GFX1100-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX1100-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x i32>, ptr addrspace(1) %gep.in
@@ -88,11 +998,184 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_vreg_lane_i64:
-; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
-; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, 0, [[LANE]]
 define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX700-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-SDAG-NEXT:    flat_load_dword v2, v[0:1]
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX700-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    s_nop 2
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 12, s2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dword v2, v[0:1]
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    s_nop 2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, 12, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v0, v0, s[2:3] offset:8
+; GFX1010-SDAG-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, 0, s3
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, 12, s3
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3] offset:8
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, 0, s3
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 12, s3
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX700-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX700-GISEL-NEXT:    s_nop 3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT:    s_nop 3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX1010-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:8
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
+; GFX1100-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x i64>, ptr addrspace(1) %gep.in
@@ -104,13 +1187,196 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
   ret void
 }
 
-; TODO: fold both SGPR's
-; CHECK-LABEL: {{^}}test_writelane_vreg_lane_f64:
-; CHECK: s_mov_b32 [[SGPR:s[0-9]+]], 0x40280000
-; CHECK: v_readfirstlane_b32 [[LANE:.*]], v{{[0-9]+}}
-; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, [[LANE]]
-; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, [[SGPR]], [[LANE]]
 define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GFX700-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX700-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-SDAG-NEXT:    flat_load_dword v2, v[0:1]
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX700-SDAG-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX700-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    s_nop 1
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 0, s2
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dword v2, v[0:1]
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, 0, s2
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v0, v0, s[2:3] offset:8
+; GFX1010-SDAG-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-SDAG-NEXT:    s_mov_b32 s2, 0x40280000
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s2, s3
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, 0, s3
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3] offset:8
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1100-SDAG-NEXT:    s_mov_b32 s2, 0x40280000
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s2, s3
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 0, s3
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX700-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX700-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX700-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX700-GISEL-NEXT:    s_nop 2
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX802-GISEL-NEXT:    s_nop 2
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX1010-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1010-GISEL-NEXT:    s_mov_b32 s3, 0x40280000
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v2, s3, s2
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:8
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1100-GISEL-NEXT:    s_mov_b32 s3, 0x40280000
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v2, s3, s2
+; GFX1100-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
   %args = load <2 x double>, ptr addrspace(1) %gep.in
@@ -123,12 +1389,154 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_m0_sreg_i32:
-; CHECK: s_mov_b32 m0, -1
-; CIGFX9: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
-; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
-; GFX10: v_writelane_b32 v{{[0-9]+}}, m0, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX700-SDAG-NEXT:    ;;#ASMSTART
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX700-SDAG-NEXT:    ;;#ASMEND
+; GFX700-SDAG-NEXT:    s_mov_b32 s4, m0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT:    ;;#ASMSTART
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX802-SDAG-NEXT:    ;;#ASMEND
+; GFX802-SDAG-NEXT:    s_mov_b32 s4, m0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    ;;#ASMSTART
+; GFX1010-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX1010-SDAG-NEXT:    ;;#ASMEND
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, m0, s2
+; GFX1010-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    ;;#ASMSTART
+; GFX1100-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX1100-SDAG-NEXT:    ;;#ASMEND
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, m0, s0
+; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX700-GISEL-NEXT:    ;;#ASMSTART
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX700-GISEL-NEXT:    ;;#ASMEND
+; GFX700-GISEL-NEXT:    s_mov_b32 s4, m0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT:    ;;#ASMSTART
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX802-GISEL-NEXT:    ;;#ASMEND
+; GFX802-GISEL-NEXT:    s_mov_b32 s4, m0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT:    ;;#ASMSTART
+; GFX1010-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX1010-GISEL-NEXT:    ;;#ASMEND
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, m0, s2
+; GFX1010-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT:    ;;#ASMSTART
+; GFX1100-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX1100-GISEL-NEXT:    ;;#ASMEND
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, m0, s0
+; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %m0, i32 %src1, i32 %oldval)
@@ -136,102 +1544,1097 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_i32:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
 define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, 32
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s2, 32
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, 32
+; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, 32
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s2, 32
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s0, 32
+; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_i64:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
-; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
 define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_i64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_i64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_i64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_i64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_i64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_i64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_i64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_i64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i64, ptr addrspace(1) %out
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0
   store i64 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_f64:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
-; CHECK-NEXT: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
 define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_f64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_f64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_f64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_f64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_f64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_f64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_f64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_f64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load double, ptr addrspace(1) %out
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0
   store double %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_oldval_i32:
-; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
-; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x8
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x8
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_oldval_i64:
-; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], s{{[0-9]+}}
-; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], s{{[0-9]+}}
-; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
-; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x6
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x18
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x2
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s7, s8
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s6, s8
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x2
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x18
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s0
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x6
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x18
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x2
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s6, s8
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s7, s8
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x2
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x18
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s0
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s3, s0
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
   store i64 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_sreg_oldval_f64:
-; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], s{{[0-9]+}}
-; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], s{{[0-9]+}}
-; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
-; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x6
+; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX700-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x18
+; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x2
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s7, s8
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s6, s8
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x2
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x18
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s0
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x6
+; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x18
+; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x2
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX1010-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s6, s8
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s7, s8
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x2
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x10
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x18
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s0
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s3, s0
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
   store double %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_oldval_i32:
-; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
-; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, 42
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1010-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, 42
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1010-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
+; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42)
   store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_oldval_i64:
-; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], 42
-; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], 0
-; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
-; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x4
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x10
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s0
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x10
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s0
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s0
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42)
   store i64 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}test_writelane_imm_oldval_f64:
-; CHECK: v_mov_b32_e32 [[OLDSUB0:v[0-9]+]], 0
-; CHECK: v_mov_b32_e32 [[OLDSUB1:v[0-9]+]], 0x40450000
-; CIGFX9: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, m0
-; CIGFX9-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, m0
-; GFX10: v_writelane_b32 [[OLDSUB0]], s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10-NEXT: v_writelane_b32 [[OLDSUB1]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
+; GFX700-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX700-SDAG:       ; %bb.0:
+; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x4
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-SDAG-NEXT:    s_endpgm
+;
+; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-SDAG-NEXT:    s_endpgm
+;
+; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-SDAG-NEXT:    s_endpgm
+;
+; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x10
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s0
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s0
+; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    s_nop 0
+; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-SDAG-NEXT:    s_endpgm
+;
+; GFX700-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX700-GISEL:       ; %bb.0:
+; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
+; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX700-GISEL-NEXT:    s_endpgm
+;
+; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX802-GISEL-NEXT:    s_endpgm
+;
+; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1010-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX1010-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v0, s2, s6
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v1, s3, s6
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-GISEL-NEXT:    s_endpgm
+;
+; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x10
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s0
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s0
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    s_nop 0
+; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0)
   store double %writelane, ptr addrspace(1) %out, align 4
   ret void

>From 14fcf445e6757b5716e5a56f2d8d372d88075d68 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 6 May 2024 05:40:45 +0000
Subject: [PATCH 11/30] refactor/improve GIsel lowering, added new tests

1. Review comments
2. improve GIsel lowering
3. add tests for half, bfloat, float2, ptr, vector of ptr and int
4. removed gfx700 checks from writelane tests since it caused issues with f16 legalization. is this required ?
---
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |    3 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  112 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   24 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  235 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  271 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 1555 ++++++++++-------
 6 files changed, 1503 insertions(+), 697 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index a591fe76ff48e..ebb6e4de36c01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -519,4 +519,5 @@ def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
 
 def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
   [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
-   (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
\ No newline at end of file
+   (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
+   
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8f0286164a7f1..faf70ca6fbdc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5396,44 +5396,78 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   Register DstReg = MI.getOperand(0).getReg();
   Register Src0 = MI.getOperand(2).getReg();
 
+  auto createLaneOp = [&](Register &Src0, Register &Src1,
+                          Register &Src2) -> Register {
+    auto LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+    if (Src2.isValid())
+      return (LaneOpDst.addUse(Src1).addUse(Src2)).getReg(0);
+    if (Src1.isValid())
+      return (LaneOpDst.addUse(Src1)).getReg(0);
+    return LaneOpDst.getReg(0);
+  };
+
+  Register Src1, Src2, Src0Valid, Src2Valid;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
   LLT Ty = MRI.getType(DstReg);
   unsigned Size = Ty.getSizeInBits();
 
-  if (Size == 32)
+  if (Size == 32) {
+    if (Ty.isScalar())
+      // Already legal
+      return true;
+
+    Register Src0Valid = B.buildBitcast(S32, Src0).getReg(0);
+    if (Src2.isValid())
+      Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
+    Register LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid);
+    B.buildBitcast(DstReg, LaneOp);
+    MI.eraseFromParent();
     return true;
+  }
 
   if (Size < 32) {
-    auto Ext = B.buildAnyExt(LLT::scalar(32), Src0).getReg(0);
-    auto LaneOpDst =
-        B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32}).addUse(Ext);
-    if (IID == Intrinsic::amdgcn_readlane ||
-        IID == Intrinsic::amdgcn_writelane) {
-      auto Src1 = MI.getOperand(3).getReg();
-      LaneOpDst = LaneOpDst.addUse(Src1);
-      if (IID == Intrinsic::amdgcn_writelane) {
-        auto Src2 = MI.getOperand(4).getReg();
-        auto Ext2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
-        LaneOpDst = LaneOpDst.addUse(Ext2);
-      }
+    Register Src0Cast = MRI.getType(Src0).isScalar()
+                            ? Src0
+                            : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
+    Src0Valid = B.buildAnyExt(S32, Src0Cast).getReg(0);
+
+    if (Src2.isValid()) {
+      Register Src2Cast =
+          MRI.getType(Src2).isScalar()
+              ? Src2
+              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+      Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
     }
-    B.buildTrunc(DstReg, LaneOpDst).getReg(0);
-  } else if ((Size % 32) == 0) {
-    SmallVector<Register, 2> Src0Parts, PartialRes;
-    unsigned NumParts = Size / 32;
-    auto WideReg = MRI.createGenericVirtualRegister(LLT::scalar(NumParts * 32));
-    for (unsigned i = 0; i < NumParts; ++i) {
-      Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
+    Register LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid);
+    if (Ty.isScalar())
+      B.buildTrunc(DstReg, LaneOp);
+    else {
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOp);
+      B.buildBitcast(DstReg, Trunc);
     }
 
-    B.buildUnmerge(Src0Parts, Src0);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if ((Size % 32) == 0) {
+    SmallVector<Register, 2> PartialRes;
+    unsigned NumParts = Size / 32;
+    auto Src0Parts = B.buildUnmerge(S32, Src0);
 
     switch (IID) {
     case Intrinsic::amdgcn_readlane: {
-      auto Src1 = MI.getOperand(3).getReg();
+      Register Src1 = MI.getOperand(3).getReg();
       for (unsigned i = 0; i < NumParts; ++i)
         PartialRes.push_back(
             (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
-                 .addUse(Src0Parts[i])
+                 .addUse(Src0Parts.getReg(i))
                  .addUse(Src1))
                 .getReg(0));
       break;
@@ -5443,35 +5477,37 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
       for (unsigned i = 0; i < NumParts; ++i)
         PartialRes.push_back(
             (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
-                 .addUse(Src0Parts[i]))
+                 .addUse(Src0Parts.getReg(i)))
                 .getReg(0));
 
       break;
     }
     case Intrinsic::amdgcn_writelane: {
-      auto Src1 = MI.getOperand(3).getReg();
-      auto Src2 = MI.getOperand(4).getReg();
-      SmallVector<Register, 2> Src2Parts;
-      for (unsigned i = 0; i < NumParts; ++i) {
-        Src2Parts.push_back(MRI.createGenericVirtualRegister(S32));
-      }
-      B.buildUnmerge(Src2Parts, Src2);
+      Register Src1 = MI.getOperand(3).getReg();
+      Register Src2 = MI.getOperand(4).getReg();
+      auto Src2Parts = B.buildUnmerge(S32, Src2);
 
       for (unsigned i = 0; i < NumParts; ++i)
         PartialRes.push_back(
             (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
-                 .addUse(Src0Parts[i])
+                 .addUse(Src0Parts.getReg(i))
                  .addUse(Src1)
-                 .addUse(Src2Parts[i]))
+                 .addUse(Src2Parts.getReg(i)))
                 .getReg(0));
     }
     }
-    B.buildMergeLikeInstr(DstReg, PartialRes);
-  } else
-    return false;
 
-  MI.eraseFromParent();
-  return true;
+    if (Ty.isPointerVector()) {
+      auto MergedVec = B.buildMergeLikeInstr(
+          LLT::vector(ElementCount::getFixed(NumParts), S32), PartialRes);
+      B.buildBitcast(DstReg, MergedVec);
+    } else
+      B.buildMergeLikeInstr(DstReg, PartialRes);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
 }
 
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 83f77c916834b..478211c32f845 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5984,7 +5984,7 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
 
 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                            SelectionDAG &DAG) {
-  auto VT = N->getValueType(0);
+  EVT VT = N->getValueType(0);
   unsigned ValSize = VT.getSizeInBits();
   unsigned IntrinsicID = N->getConstantOperandVal(0);
   SDValue Src0 = N->getOperand(1);
@@ -5993,11 +5993,9 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
 
   auto createLaneOp = [&](SDValue &Src0, SDValue &Src1, SDValue &Src2,
                           MVT VT) -> SDValue {
-    return (Src2.getNode()
-                ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
-            : Src1.getNode()
-                ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
-                : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+    return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
+            : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
+                   : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
   };
 
   SDValue Src1, Src2, Src0Valid, Src2Valid;
@@ -6015,19 +6013,19 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     Src0Valid = DAG.getBitcast(IntVT, Src0);
     if (Src2.getNode())
       Src2Valid = DAG.getBitcast(IntVT, Src2);
-    auto LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
+    SDValue LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
     return DAG.getBitcast(VT, LaneOp);
   }
 
   if (ValSize < 32) {
-    auto InitBitCast = DAG.getBitcast(IntVT, Src0);
+    SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
     Src0Valid = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
     if (Src2.getNode()) {
-      auto Src2Cast = DAG.getBitcast(IntVT, Src2);
+      SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
       Src2Valid = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
     }
-    auto LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
-    auto Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+    SDValue LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
+    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
     return DAG.getBitcast(VT, Trunc);
   }
 
@@ -6038,8 +6036,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     if (Src2.getNode())
       Src2Valid = DAG.getBitcast(VecVT, Src2);
 
-    auto LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, VecVT);
-    auto UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+    SDValue LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, VecVT);
+    SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
     return DAG.getBitcast(VT, UnrolledLaneOp);
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 08447f2a395ac..8600480b1148c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -389,5 +389,240 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
   ret void
 }
 
+define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_half:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_half:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call half @llvm.amdgcn.readfirstlane.f16(half %src)
+  call void asm sideeffect "; use $0", "s"(half %x)
+  ret void
+}
+
+define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_float:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_float:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+  call void asm sideeffect "; use $0", "s"(float %x)
+  ret void
+}
+
+define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_bfloat:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_bfloat:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call bfloat @llvm.amdgcn.readfirstlane.bf16(bfloat %src)
+  call void asm sideeffect "; use $0", "s"(bfloat %x)
+  ret void
+}
+
+define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0xffff
+; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use v0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
+  call void asm sideeffect "; use $0", "s"(i16 %x)
+  ret void
+}
+
+define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2f16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2f16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+  call void asm sideeffect "; use $0", "s"(<2 x half> %x)
+  ret void
+}
+
+define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2f32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2f32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:5]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %src)
+  call void asm sideeffect "; use $0", "s"(<2 x float> %x)
+  ret void
+}
+
+define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v7i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:10]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v7i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:10]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <7 x i32> @llvm.amdgcn.readfirstlane.v7i32(<7 x i32> %src)
+  call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+  ret void
+}
+
+define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_p0:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:5]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
+  call void asm sideeffect "; use $0", "s"(ptr %x)
+  ret void
+}
+
+define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:9]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v3p0:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:9]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 7d2454182e8ec..47486d75630f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -657,6 +657,277 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
   ret void
 }
 
+define void @test_readlane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_half:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_half:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call half @llvm.amdgcn.readlane.f16(half %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(half %x)
+  ret void
+}
+
+define void @test_readlane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_float:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_float:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call float @llvm.amdgcn.readlane.f32(float %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(float %x)
+  ret void
+}
+
+define void @test_readlane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_bfloat:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_bfloat:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(bfloat %x)
+  ret void
+}
+
+define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_i16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0xffff
+; CHECK-SDAG-NEXT:    s_nop 2
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use v0
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_i16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(i16 %x)
+  ret void
+}
+
+define void @test_readlane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2f16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2f16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<2 x half> %x)
+  ret void
+}
+
+define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v2f32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v2f32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s5
+; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s5
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:5]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<2 x float> %x)
+  ret void
+}
+
+define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v7i32:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v9
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s10, v8, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:10]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v7i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s10, v9
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s6, v4, s10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s7, v5, s10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s8, v6, s10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s9, v7, s10
+; CHECK-GISEL-NEXT:    v_readlane_b32 s10, v8, s10
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:10]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <7 x i32> @llvm.amdgcn.readlane.v7i32(<7 x i32> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
+  ret void
+}
+
+define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_p0:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s5
+; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s5
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:5]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(ptr %x)
+  ret void
+}
+
+define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:9]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v3p0:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s9, v8
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s6, v4, s9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s7, v5, s9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s8, v6, s9
+; CHECK-GISEL-NEXT:    v_readlane_b32 s9, v7, s9
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:9]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index f5f8fa3907f97..8c25cf3977858 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,10 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX700-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
 
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX700-GISEL %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
@@ -14,20 +12,6 @@ declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
 declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0
 
 define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_sreg_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s3
-; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -68,20 +52,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_sreg_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s3
-; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -128,23 +98,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 }
 
 define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_sreg_i64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x4
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -196,23 +149,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_sreg_i64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -270,23 +206,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 }
 
 define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_sreg_f64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x4
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_sreg_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -338,23 +257,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_sreg_f64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -412,20 +314,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 }
 
 define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_sreg_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, 32, s2
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -470,20 +358,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_sreg_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, 32, s2
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -534,22 +408,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 }
 
 define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_sreg_i64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x2
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, 0, s4
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 32, s4
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -600,22 +458,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_sreg_i64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x2
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, 32, s4
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, 0, s4
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -672,24 +514,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 }
 
 define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_sreg_f64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x2
-; GFX700-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s5, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 0, s4
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -745,24 +569,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_sreg_f64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x2
-; GFX700-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s4
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, 0, s4
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -823,29 +629,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 }
 
 define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
-; GFX700-SDAG-LABEL: test_writelane_vreg_lane_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
-; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-SDAG-NEXT:    flat_load_dword v0, v[0:1]
-; GFX700-SDAG-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX700-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX700-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    s_nop 2
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, 12, s2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -905,30 +688,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_vreg_lane_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
-; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-GISEL-NEXT:    flat_load_dword v0, v[0:1]
-; GFX700-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX700-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX700-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    s_nop 2
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, 12, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -999,31 +758,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 }
 
 define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
-; GFX700-SDAG-LABEL: test_writelane_vreg_lane_i64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
-; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-SDAG-NEXT:    flat_load_dword v2, v[0:1]
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX700-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    s_nop 2
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, 0, s2
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 12, s2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1089,31 +823,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_vreg_lane_i64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
-; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v4, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX700-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX700-GISEL-NEXT:    s_nop 3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1188,33 +897,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 }
 
 define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
-; GFX700-SDAG-LABEL: test_writelane_vreg_lane_f64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX700-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
-; GFX700-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-SDAG-NEXT:    flat_load_dword v2, v[0:1]
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX700-SDAG-NEXT:    v_readfirstlane_b32 m0, v2
-; GFX700-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    s_nop 1
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s4, m0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, 0, s2
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1284,33 +966,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_vreg_lane_f64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX700-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
-; GFX700-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX700-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v4, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX700-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s2
-; GFX700-GISEL-NEXT:    s_nop 2
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, 0, s2
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1390,25 +1045,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 }
 
 define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_m0_sreg_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX700-SDAG-NEXT:    ;;#ASMSTART
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, -1
-; GFX700-SDAG-NEXT:    ;;#ASMEND
-; GFX700-SDAG-NEXT:    s_mov_b32 s4, m0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1464,25 +1100,6 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_m0_sreg_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX700-GISEL-NEXT:    ;;#ASMSTART
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, -1
-; GFX700-GISEL-NEXT:    ;;#ASMEND
-; GFX700-GISEL-NEXT:    s_mov_b32 s4, m0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s2
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1545,20 +1162,6 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 }
 
 define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, 32
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1603,20 +1206,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, 32
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1667,21 +1256,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 }
 
 define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_i64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1727,21 +1301,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_i64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1793,21 +1352,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 }
 
 define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_f64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1853,21 +1397,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_f64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1919,19 +1448,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 }
 
 define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_sreg_oldval_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s6
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s3
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1971,19 +1487,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_sreg_oldval_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s3
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -2028,22 +1531,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 }
 
 define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_sreg_oldval_i64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x6
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2092,22 +1579,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_sreg_oldval_i64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x6
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2161,22 +1632,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 }
 
 define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_sreg_oldval_f64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x6
-; GFX700-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s6
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s3
-; GFX700-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2225,22 +1680,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_sreg_oldval_f64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x6
-; GFX700-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2294,18 +1733,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 }
 
 define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_oldval_i32:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, 42
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s3
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v2, s2, m0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2340,18 +1767,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_oldval_i32:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, 42
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s3
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v2, s2, m0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX700-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2391,21 +1806,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_oldval_i64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x4
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, 42
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2451,21 +1851,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_oldval_i64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, 42
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
@@ -2516,21 +1901,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 {
-; GFX700-SDAG-LABEL: test_writelane_imm_oldval_f64:
-; GFX700-SDAG:       ; %bb.0:
-; GFX700-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x4
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX700-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX700-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-SDAG-NEXT:    s_endpgm
-;
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2576,21 +1946,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
-; GFX700-GISEL-LABEL: test_writelane_imm_oldval_f64:
-; GFX700-GISEL:       ; %bb.0:
-; GFX700-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x4
-; GFX700-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
-; GFX700-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX700-GISEL-NEXT:    s_mov_b32 m0, s6
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX700-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
-; GFX700-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
-; GFX700-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX700-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX700-GISEL-NEXT:    s_endpgm
-;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
@@ -2640,6 +1995,916 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
   ret void
 }
 
+define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_half:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_half:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_half:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_half:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_half:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT:    global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_half:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load half, ptr addrspace(1) %out
+  %writelane = call half @llvm.amdgcn.writelane.f16(half %src, i32 %src1, half %oldval)
+  store half %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_float:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_float:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_float:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_float:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_float:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_float:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load float, ptr addrspace(1) %out
+  %writelane = call float @llvm.amdgcn.writelane.f32(float %src, i32 %src1, float %oldval)
+  store float %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_bfloat:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_bfloat:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_bfloat:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_bfloat:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_bfloat:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT:    global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_bfloat:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load bfloat, ptr addrspace(1) %out
+  %writelane = call bfloat @llvm.amdgcn.writelane.bf16(bfloat %src, i32 %src1, bfloat %oldval)
+  store bfloat %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_i16:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_short v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_i16:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_short v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_i16:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_u16 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_i16:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_short v[0:1], v4
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_i16:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT:    global_store_short v[0:1], v4, off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_i16:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_u16 v4, v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load i16, ptr addrspace(1) %out
+  %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 %src, i32 %src1, i16 %oldval)
+  store i16 %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v2f16:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v2f16:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v2f16:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v2f16:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v2f16:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v2f16:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <2 x half>, ptr addrspace(1) %out
+  %writelane = call <2 x half> @llvm.amdgcn.writelane.v2f16(<2 x half> %src, i32 %src1, <2 x half> %oldval)
+  store <2 x half> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_readlane_v2f32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_readlane_v2f32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_readlane_v2f32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_readlane_v2f32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v5, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v6, s6, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_readlane_v2f32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v5, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v6, s6, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_readlane_v2f32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v5, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v6, s2, s1
+; GFX1100-GISEL-NEXT:    global_store_b64 v[0:1], v[5:6], off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <2 x float>, ptr addrspace(1) %out
+  %writelane = call <2 x float> @llvm.amdgcn.writelane.v2f32(<2 x float> %src, i32 %src1, <2 x float> %oldval)
+  store <2 x float> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v7i32:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v17, vcc, 16, v0
+; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[14:16], v[17:18]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v9
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v13, s7, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s8, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s9, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v10, s10, m0
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v14, s6, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[17:18], v[14:16]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v7i32:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[14:16], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s11, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v6
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v16, s4, s5
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s8, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v12, s9, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v11, s10, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v10, s11, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v15, s6, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v14, s7, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v7i32:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    global_load_b96 v[14:16], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v6
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v16, s0, s1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s4, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v12, s5, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v11, s6, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v10, s7, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v15, s2, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v14, s3, s1
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[10:13], off
+; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[14:16], off offset:16
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v7i32:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 16, v0
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[10:13], v[0:1]
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[14:17], v[18:19]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v10, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v11, s6, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v12, s7, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v13, s8, m0
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v14, s9, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v15, s10, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v16, s11, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[10:13]
+; GFX802-GISEL-NEXT:    flat_store_dwordx3 v[18:19], v[14:16]
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v7i32:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v10, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v11, s6, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v12, s7, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v13, s8, s5
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v14, s9, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v15, s10, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v16, s11, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX1010-GISEL-NEXT:    global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v7i32:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    global_load_b128 v[10:13], v[0:1], off
+; GFX1100-GISEL-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v10, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v11, s2, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v12, s3, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v13, s4, s1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v14, s5, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v15, s6, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v16, s7, s1
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[10:13], off
+; GFX1100-GISEL-NEXT:    global_store_b96 v[0:1], v[14:16], off offset:16
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <7 x i32>, ptr addrspace(1) %out
+  %writelane = call <7 x i32> @llvm.amdgcn.writelane.v7i32(<7 x i32> %src, i32 %src1, <7 x i32> %oldval)
+  store <7 x i32> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p0:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p0:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p0:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_p0:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v5, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v6, s6, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_p0:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v5, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v6, s6, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_p0:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v5, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v6, s2, s1
+; GFX1100-GISEL-NEXT:    global_store_b64 v[0:1], v[5:6], off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load ptr, ptr addrspace(1) %out
+  %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
+  store ptr %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p0:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v13, vcc, 16, v0
+; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[9:12], v[0:1]
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[15:16], v[13:14]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v8
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v2
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s6, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s7, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v10, s8, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v9, s9, m0
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[13:14], v[15:16]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p0:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[13:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v10, s9, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v9, s10, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p0:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    global_load_b64 v[13:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT:    global_load_b128 v[9:12], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v7
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v10, s5, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v9, s6, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[9:12], off
+; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v3p0:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v17, vcc, 16, v0
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[9:12], v[0:1]
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[13:16], v[17:18]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v9, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v10, s6, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v11, s7, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v12, s8, m0
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v13, s9, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v14, s10, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
+; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[17:18], v[13:14]
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v3p0:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    s_clause 0x1
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[13:16], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v9, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v10, s6, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v11, s7, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v12, s8, s5
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v13, s9, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v14, s10, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
+; GFX1010-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v3p0:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    global_load_b128 v[9:12], v[0:1], off
+; GFX1100-GISEL-NEXT:    global_load_b128 v[13:16], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v9, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v10, s2, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v11, s3, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v12, s4, s1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v13, s5, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v14, s6, s1
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[9:12], off
+; GFX1100-GISEL-NEXT:    global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <3 x ptr>, ptr addrspace(1) %out
+  %writelane = call <3 x ptr> @llvm.amdgcn.writelane.v3p0(<3 x ptr> %src, i32 %src1, <3 x ptr> %oldval)
+  store <3 x ptr> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }

>From d0610c47c0e2cb4bca5f90c289ffaa5c4178547f Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 9 May 2024 02:50:18 -0400
Subject: [PATCH 12/30] Review comments, refactor GISel Impl

---
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |  2 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 68 ++++++++++++-------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  2 +-
 3 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ebb6e4de36c01..02f860ffce932 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -520,4 +520,4 @@ def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
 def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
   [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
    (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
-   
\ No newline at end of file
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index faf70ca6fbdc9..16d3219a16dfb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5396,17 +5396,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   Register DstReg = MI.getOperand(0).getReg();
   Register Src0 = MI.getOperand(2).getReg();
 
-  auto createLaneOp = [&](Register &Src0, Register &Src1,
-                          Register &Src2) -> Register {
-    auto LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0);
-    if (Src2.isValid())
-      return (LaneOpDst.addUse(Src1).addUse(Src2)).getReg(0);
-    if (Src1.isValid())
-      return (LaneOpDst.addUse(Src1)).getReg(0);
-    return LaneOpDst.getReg(0);
-  };
-
-  Register Src1, Src2, Src0Valid, Src2Valid;
+  Register Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
     Src1 = MI.getOperand(3).getReg();
     if (IID == Intrinsic::amdgcn_writelane) {
@@ -5423,10 +5413,24 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
       return true;
 
     Register Src0Valid = B.buildBitcast(S32, Src0).getReg(0);
-    if (Src2.isValid())
-      Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
-    Register LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid);
-    B.buildBitcast(DstReg, LaneOp);
+    MachineInstrBuilder LaneOpDst;
+    switch (IID) {
+      case Intrinsic::amdgcn_readfirstlane: {
+        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
+        break;
+      }
+      case Intrinsic::amdgcn_readlane: {
+        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
+        break;
+      }
+      case Intrinsic::amdgcn_writelane: {
+        Register Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
+        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1).addUse(Src2Valid);
+      }
+    }
+
+    Register LaneOpDstReg = LaneOpDst.getReg(0);
+    B.buildBitcast(DstReg, LaneOpDstReg);
     MI.eraseFromParent();
     return true;
   }
@@ -5435,20 +5439,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     Register Src0Cast = MRI.getType(Src0).isScalar()
                             ? Src0
                             : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
-    Src0Valid = B.buildAnyExt(S32, Src0Cast).getReg(0);
-
-    if (Src2.isValid()) {
-      Register Src2Cast =
-          MRI.getType(Src2).isScalar()
-              ? Src2
-              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
-      Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+    Register Src0Valid = B.buildAnyExt(S32, Src0Cast).getReg(0);
+
+    MachineInstrBuilder LaneOpDst;
+    switch (IID) {
+      case Intrinsic::amdgcn_readfirstlane: {
+        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
+        break;
+      }
+      case Intrinsic::amdgcn_readlane: {
+        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
+        break;
+      }
+      case Intrinsic::amdgcn_writelane: {
+        Register Src2Cast = MRI.getType(Src2).isScalar()
+                            ? Src2
+                            : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+        Register Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1).addUse(Src2Valid);
+      }
     }
-    Register LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid);
+
+    Register LaneOpDstReg = LaneOpDst.getReg(0);
     if (Ty.isScalar())
-      B.buildTrunc(DstReg, LaneOp);
+      B.buildTrunc(DstReg, LaneOpDstReg);
     else {
-      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOp);
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDstReg);
       B.buildBitcast(DstReg, Trunc);
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 478211c32f845..c68bf252b0193 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5991,7 +5991,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   SDLoc SL(N);
   MVT IntVT = MVT::getIntegerVT(ValSize);
 
-  auto createLaneOp = [&](SDValue &Src0, SDValue &Src1, SDValue &Src2,
+  auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2,
                           MVT VT) -> SDValue {
     return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
             : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})

>From 9233833b6c03c3251af23546a8211ff2394690cc Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 9 May 2024 02:54:54 -0400
Subject: [PATCH 13/30] clang-format

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 61 +++++++++++--------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 16d3219a16dfb..b0a2bbeb61031 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5415,18 +5415,21 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     Register Src0Valid = B.buildBitcast(S32, Src0).getReg(0);
     MachineInstrBuilder LaneOpDst;
     switch (IID) {
-      case Intrinsic::amdgcn_readfirstlane: {
-        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
-        break;
-      }
-      case Intrinsic::amdgcn_readlane: {
-        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
-        break;
-      }
-      case Intrinsic::amdgcn_writelane: {
-        Register Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
-        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1).addUse(Src2Valid);
-      }
+    case Intrinsic::amdgcn_readfirstlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
+      break;
+    }
+    case Intrinsic::amdgcn_readlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      Register Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
+      LaneOpDst = B.buildIntrinsic(IID, {S32})
+                      .addUse(Src0Valid)
+                      .addUse(Src1)
+                      .addUse(Src2Valid);
+    }
     }
 
     Register LaneOpDstReg = LaneOpDst.getReg(0);
@@ -5443,21 +5446,25 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
     MachineInstrBuilder LaneOpDst;
     switch (IID) {
-      case Intrinsic::amdgcn_readfirstlane: {
-        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
-        break;
-      }
-      case Intrinsic::amdgcn_readlane: {
-        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
-        break;
-      }
-      case Intrinsic::amdgcn_writelane: {
-        Register Src2Cast = MRI.getType(Src2).isScalar()
-                            ? Src2
-                            : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
-        Register Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
-        LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1).addUse(Src2Valid);
-      }
+    case Intrinsic::amdgcn_readfirstlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
+      break;
+    }
+    case Intrinsic::amdgcn_readlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      Register Src2Cast =
+          MRI.getType(Src2).isScalar()
+              ? Src2
+              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+      Register Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+      LaneOpDst = B.buildIntrinsic(IID, {S32})
+                      .addUse(Src0Valid)
+                      .addUse(Src1)
+                      .addUse(Src2Valid);
+    }
     }
 
     Register LaneOpDstReg = LaneOpDst.getReg(0);

>From 993a6307eaa31522bc8580c664e2a184fe6a3ddf Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 13 May 2024 13:28:18 +0000
Subject: [PATCH 14/30] Review comments, improve pointer handling with GISel

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 175 ++++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  20 +-
 .../atomic_optimization_split_dt_update.ll    |   2 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  79 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  91 ++++
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 489 ++++++++++++++----
 6 files changed, 700 insertions(+), 156 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e5e68a17c4069..551a9c926b88d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5397,6 +5397,21 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   Register DstReg = MI.getOperand(0).getReg();
   Register Src0 = MI.getOperand(2).getReg();
 
+  auto createLaneOp = [&](Register Src0, Register Src1,
+                          Register Src2) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+      return LaneOp.getReg(0);
+    case Intrinsic::amdgcn_readlane:
+      return LaneOp.addUse(Src1).getReg(0);
+    case Intrinsic::amdgcn_writelane:
+      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+    default:
+      llvm_unreachable("unhandled lane op");
+    }
+  };
+
   Register Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
     Src1 = MI.getOperand(3).getReg();
@@ -5413,28 +5428,22 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
       // Already legal
       return true;
 
-    Register Src0Valid = B.buildBitcast(S32, Src0).getReg(0);
-    MachineInstrBuilder LaneOpDst;
-    switch (IID) {
-    case Intrinsic::amdgcn_readfirstlane: {
-      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
-      break;
-    }
-    case Intrinsic::amdgcn_readlane: {
-      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
-      break;
-    }
-    case Intrinsic::amdgcn_writelane: {
-      Register Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
-      LaneOpDst = B.buildIntrinsic(IID, {S32})
-                      .addUse(Src0Valid)
-                      .addUse(Src1)
-                      .addUse(Src2Valid);
-    }
+    auto IsPtr = Ty.isPointer();
+    Src0 = IsPtr ? B.buildPtrToInt(S32, Src0).getReg(0)
+                 : B.buildBitcast(S32, Src0).getReg(0);
+
+    if (Src2.isValid()) {
+      Src2 = IsPtr ? B.buildPtrToInt(S32, Src2).getReg(0)
+                   : B.buildBitcast(S32, Src2).getReg(0);
     }
 
-    Register LaneOpDstReg = LaneOpDst.getReg(0);
-    B.buildBitcast(DstReg, LaneOpDstReg);
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
+
+    if (IsPtr)
+      B.buildIntToPtr(DstReg, LaneOpDst);
+    else
+      B.buildBitcast(DstReg, LaneOpDst);
+
     MI.eraseFromParent();
     return true;
   }
@@ -5443,36 +5452,20 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     Register Src0Cast = MRI.getType(Src0).isScalar()
                             ? Src0
                             : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
-    Register Src0Valid = B.buildAnyExt(S32, Src0Cast).getReg(0);
-
-    MachineInstrBuilder LaneOpDst;
-    switch (IID) {
-    case Intrinsic::amdgcn_readfirstlane: {
-      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
-      break;
-    }
-    case Intrinsic::amdgcn_readlane: {
-      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
-      break;
-    }
-    case Intrinsic::amdgcn_writelane: {
+    Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
+    if (Src2.isValid()) {
       Register Src2Cast =
           MRI.getType(Src2).isScalar()
               ? Src2
               : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
-      Register Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
-      LaneOpDst = B.buildIntrinsic(IID, {S32})
-                      .addUse(Src0Valid)
-                      .addUse(Src1)
-                      .addUse(Src2Valid);
-    }
+      Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
     }
 
-    Register LaneOpDstReg = LaneOpDst.getReg(0);
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
     if (Ty.isScalar())
-      B.buildTrunc(DstReg, LaneOpDstReg);
+      B.buildTrunc(DstReg, LaneOpDst);
     else {
-      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDstReg);
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
       B.buildBitcast(DstReg, Trunc);
     }
 
@@ -5483,50 +5476,116 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   if ((Size % 32) == 0) {
     SmallVector<Register, 2> PartialRes;
     unsigned NumParts = Size / 32;
-    auto Src0Parts = B.buildUnmerge(S32, Src0);
+    auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+    MachineInstrBuilder Src0Parts;
+
+    if (Ty.isPointer()) {
+      auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0);
+      Src0Parts = B.buildUnmerge(S32, PtrToInt);
+    } else if (Ty.isPointerVector()) {
+      LLT IntVecTy = Ty.changeElementType(
+          LLT::scalar(Ty.getElementType().getSizeInBits()));
+      auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0);
+      Src0Parts = B.buildUnmerge(S32, PtrToInt);
+    } else
+      Src0Parts =
+          IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
 
     switch (IID) {
     case Intrinsic::amdgcn_readlane: {
       Register Src1 = MI.getOperand(3).getReg();
-      for (unsigned i = 0; i < NumParts; ++i)
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
         PartialRes.push_back(
             (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
-                 .addUse(Src0Parts.getReg(i))
+                 .addUse(Src0)
                  .addUse(Src1))
                 .getReg(0));
+      }
       break;
     }
     case Intrinsic::amdgcn_readfirstlane: {
-
-      for (unsigned i = 0; i < NumParts; ++i)
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
         PartialRes.push_back(
             (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
-                 .addUse(Src0Parts.getReg(i)))
-                .getReg(0));
+                 .addUse(Src0)
+                 .getReg(0)));
+      }
 
       break;
     }
     case Intrinsic::amdgcn_writelane: {
       Register Src1 = MI.getOperand(3).getReg();
       Register Src2 = MI.getOperand(4).getReg();
-      auto Src2Parts = B.buildUnmerge(S32, Src2);
-
-      for (unsigned i = 0; i < NumParts; ++i)
+      MachineInstrBuilder Src2Parts;
+
+      if (Ty.isPointer()) {
+        auto PtrToInt = B.buildPtrToInt(S64, Src2);
+        Src2Parts = B.buildUnmerge(S32, PtrToInt);
+      } else if (Ty.isPointerVector()) {
+        LLT IntVecTy = Ty.changeElementType(
+            LLT::scalar(Ty.getElementType().getSizeInBits()));
+        auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
+        Src2Parts = B.buildUnmerge(S32, PtrToInt);
+      } else
+        Src2Parts =
+            IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
+
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
+        Src2 = IsS16Vec ? B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0)
+                        : Src2Parts.getReg(i);
         PartialRes.push_back(
             (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
-                 .addUse(Src0Parts.getReg(i))
+                 .addUse(Src0)
                  .addUse(Src1)
-                 .addUse(Src2Parts.getReg(i)))
+                 .addUse(Src2))
                 .getReg(0));
+      }
+
+      break;
     }
     }
 
     if (Ty.isPointerVector()) {
-      auto MergedVec = B.buildMergeLikeInstr(
-          LLT::vector(ElementCount::getFixed(NumParts), S32), PartialRes);
-      B.buildBitcast(DstReg, MergedVec);
-    } else
+      unsigned PtrSize = Ty.getElementType().getSizeInBits();
+      SmallVector<Register, 2> PtrElements;
+      if (PtrSize == 32) {
+        // Handle 32 bit pointers
+        for (unsigned i = 0; i < NumParts; i++)
+          PtrElements.push_back(
+              B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0));
+      } else {
+        // Handle legalization of <? x [pointer type bigger than 32 bits]>
+        SmallVector<Register, 2> PtrParts;
+        unsigned NumS32Parts = PtrSize / 32;
+        unsigned PartIdx = 0;
+        for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) {
+          // Merge S32 components of a pointer element first.
+          for (; PartIdx < (j * NumS32Parts); PartIdx++)
+            PtrParts.push_back(PartialRes[PartIdx]);
+
+          auto MergedPtr =
+              B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts);
+          PtrElements.push_back(
+              B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0));
+          PtrParts.clear();
+        }
+      }
+
+      B.buildMergeLikeInstr(DstReg, PtrElements);
+    } else {
+      if (IsS16Vec) {
+        for (unsigned i = 0; i < NumParts; i++)
+          PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
+      }
       B.buildMergeLikeInstr(DstReg, PartialRes);
+    }
+
     MI.eraseFromParent();
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4107628c831ec..6105c5466d222 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,7 +6102,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                    : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
   };
 
-  SDValue Src1, Src2, Src0Valid, Src2Valid;
+  SDValue Src1, Src2;
   if (IntrinsicID == Intrinsic::amdgcn_readlane ||
       IntrinsicID == Intrinsic::amdgcn_writelane) {
     Src1 = N->getOperand(2);
@@ -6114,33 +6114,33 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     if (VT == MVT::i32)
       // Already legal
       return SDValue();
-    Src0Valid = DAG.getBitcast(IntVT, Src0);
+    Src0 = DAG.getBitcast(IntVT, Src0);
     if (Src2.getNode())
-      Src2Valid = DAG.getBitcast(IntVT, Src2);
-    SDValue LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
+      Src2 = DAG.getBitcast(IntVT, Src2);
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
     return DAG.getBitcast(VT, LaneOp);
   }
 
   if (ValSize < 32) {
     SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
-    Src0Valid = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
+    Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
     if (Src2.getNode()) {
       SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
-      Src2Valid = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
+      Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
     }
-    SDValue LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, MVT::i32);
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
     SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
     return DAG.getBitcast(VT, Trunc);
   }
 
   if ((ValSize % 32) == 0) {
     MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
-    Src0Valid = DAG.getBitcast(VecVT, Src0);
+    Src0 = DAG.getBitcast(VecVT, Src0);
 
     if (Src2.getNode())
-      Src2Valid = DAG.getBitcast(VecVT, Src2);
+      Src2 = DAG.getBitcast(VecVT, Src2);
 
-    SDValue LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid, VecVT);
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
     SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
     return DAG.getBitcast(VT, UnrolledLaneOp);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
index c07cd4e493b9a..019f76aa44a87 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
@@ -48,7 +48,7 @@ define amdgpu_kernel void @ham(ptr addrspace(4) %arg) {
 ; CHECK-NEXT:    [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[BB7]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[PHI]], i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[PHI]], i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = shl i64 1, [[TMP10]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor i64 [[TMP14]], -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 8600480b1148c..eabf05e3d1033 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -624,5 +624,84 @@ define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
   ret void
 }
 
+define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_p3:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+  ret void
+}
+
+define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v3p3:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:6]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+  ret void
+}
+
+define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:7]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v8i16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:7]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <8 x i16> @llvm.amdgcn.readfirstlane.v8i16(<8 x i16> %src)
+  call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 47486d75630f3..dd4deb76b90b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -928,6 +928,97 @@ define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1
   ret void
 }
 
+define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_p3:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s4
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+  ret void
+}
+
+define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v3p3:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v5
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s6
+; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s6
+; CHECK-GISEL-NEXT:    v_readlane_b32 s6, v4, s6
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:6]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+  ret void
+}
+
+define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v8i16:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:7]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_v8i16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s7, v6
+; CHECK-GISEL-NEXT:    s_nop 3
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s7
+; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s7
+; CHECK-GISEL-NEXT:    v_readlane_b32 s6, v4, s7
+; CHECK-GISEL-NEXT:    v_readlane_b32 s7, v5, s7
+; CHECK-GISEL-NEXT:    ;;#ASMSTART
+; CHECK-GISEL-NEXT:    ; use s[4:7]
+; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 8c25cf3977858..e1083042a6f09 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2741,31 +2741,35 @@ define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
   ret void
 }
 
-define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) {
 ; GFX802-SDAG-LABEL: test_writelane_v3p0:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_add_u32_e32 v13, vcc, 16, v0
-; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[9:12], v[0:1]
-; GFX802-SDAG-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
-; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[15:16], v[13:14]
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v8
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v5
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v10
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s10, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s11, v2
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v6
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s6, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s7, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v10, s8, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v9, s9, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v14, s8, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v13, s9, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s10, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s11, m0
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s4, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s5, m0
-; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
-; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[13:14], v[15:16]
+; GFX802-SDAG-NEXT:    v_writelane_b32 v18, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v17, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s6, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s7, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2773,62 +2777,69 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
-; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[13:14], v[0:1], off offset:16
-; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v8
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v4
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s12, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v8
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v6
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(1)
 ; GFX1010-SDAG-NEXT:    v_writelane_b32 v14, s4, s5
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v18, s9, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v17, s10, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v16, s11, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v15, s12, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s6, s5
 ; GFX1010-SDAG-NEXT:    v_writelane_b32 v12, s7, s5
 ; GFX1010-SDAG-NEXT:    v_writelane_b32 v11, s8, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v10, s9, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v9, s10, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s6, s5
-; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
-; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off
+; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off offset:16
 ; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: test_writelane_v3p0:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    global_load_b64 v[13:14], v[0:1], off offset:16
-; GFX1100-SDAG-NEXT:    global_load_b128 v[9:12], v[0:1], off
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v8
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v7
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX1100-SDAG-NEXT:    global_load_b128 v[11:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT:    global_load_b128 v[15:18], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v14, s0, s1
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v18, s5, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v17, s6, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v16, s7, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v15, s8, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s2, s1
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v12, s3, s1
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v11, s4, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v10, s5, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v9, s6, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s2, s1
 ; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[9:12], off
-; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[15:18], off
+; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[11:14], off offset:16
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX802-GISEL-LABEL: test_writelane_v3p0:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT:    v_add_u32_e32 v17, vcc, 16, v0
-; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[9:12], v[0:1]
-; GFX802-GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
-; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[13:16], v[17:18]
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX802-GISEL-NEXT:    v_add_u32_e32 v19, vcc, 16, v0
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[11:14], v[0:1]
+; GFX802-GISEL-NEXT:    v_addc_u32_e32 v20, vcc, 0, v1, vcc
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v10
 ; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
 ; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
@@ -2836,16 +2847,20 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
 ; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
 ; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s12, v9
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v9, s4, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v10, s6, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v11, s7, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v12, s8, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v11, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v12, s6, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v13, s7, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v14, s8, m0
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v13, s9, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v14, s10, m0
-; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
-; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[17:18], v[13:14]
+; GFX802-GISEL-NEXT:    v_writelane_b32 v15, s9, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v16, s10, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v17, s11, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v18, s12, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2853,55 +2868,355 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
-; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off
-; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[13:16], v[0:1], off offset:16
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:16
 ; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v10
 ; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
 ; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
 ; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
 ; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
 ; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s12, v9
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v9, s4, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v10, s6, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v11, s7, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v12, s8, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v11, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v12, s6, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v13, s7, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v14, s8, s5
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v13, s9, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v14, s10, s5
-; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
-; GFX1010-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v15, s9, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v16, s10, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v17, s11, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v18, s12, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off
+; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off offset:16
 ; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: test_writelane_v3p0:
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_clause 0x1
-; GFX1100-GISEL-NEXT:    global_load_b128 v[9:12], v[0:1], off
-; GFX1100-GISEL-NEXT:    global_load_b128 v[13:16], v[0:1], off offset:16
+; GFX1100-GISEL-NEXT:    global_load_b128 v[11:14], v[0:1], off
+; GFX1100-GISEL-NEXT:    global_load_b128 v[15:18], v[0:1], off offset:16
 ; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v10
 ; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
 ; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
 ; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
 ; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s8, v9
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v9, s0, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v10, s2, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v11, s3, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v12, s4, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v11, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v12, s2, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v13, s3, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v14, s4, s1
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v13, s5, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v14, s6, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v15, s5, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v16, s6, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v17, s7, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v18, s8, s1
 ; GFX1100-GISEL-NEXT:    s_clause 0x1
-; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[9:12], off
-; GFX1100-GISEL-NEXT:    global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[11:14], off
+; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[15:18], off offset:16
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <4 x ptr>, ptr addrspace(1) %out
+  %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval)
+  store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p3:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p3:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p3:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_p3:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_p3:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_p3:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load ptr addrspace(3), ptr addrspace(1) %out
+  %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval)
+  store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p3:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p3:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p3:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v3p3:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v6, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v7, s6, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v8, s7, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v3p3:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v6, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v7, s6, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v8, s7, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v3p3:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b96 v[6:8], v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v6, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v7, s2, s1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v8, s3, s1
+; GFX1100-GISEL-NEXT:    global_store_b96 v[0:1], v[6:8], off
+; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out
+  %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval)
+  store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v8i16:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v6
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v10, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v9, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s6, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s7, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v8i16:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v10, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v9, s6, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s7, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s8, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v8i16:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b128 v[7:10], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v10, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v9, s2, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s3, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s4, s1
+; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[7:10], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_v8i16:
+; GFX802-GISEL:       ; %bb.0:
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    v_writelane_b32 v7, s4, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v8, s6, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v9, s7, m0
+; GFX802-GISEL-NEXT:    v_writelane_b32 v10, s8, m0
+; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
+; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_v8i16:
+; GFX1010-GISEL:       ; %bb.0:
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v7, s4, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v8, s6, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v9, s7, s5
+; GFX1010-GISEL-NEXT:    v_writelane_b32 v10, s8, s5
+; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off
+; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_v8i16:
+; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    global_load_b128 v[7:10], v[0:1], off
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v6
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v7, s0, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v8, s2, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v9, s3, s1
+; GFX1100-GISEL-NEXT:    v_writelane_b32 v10, s4, s1
+; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[7:10], off
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %oldval = load <3 x ptr>, ptr addrspace(1) %out
-  %writelane = call <3 x ptr> @llvm.amdgcn.writelane.v3p0(<3 x ptr> %src, i32 %src1, <3 x ptr> %oldval)
-  store <3 x ptr> %writelane, ptr addrspace(1) %out, align 4
+  %oldval = load <8 x i16>, ptr addrspace(1) %out
+  %writelane = call <8 x i16> @llvm.amdgcn.writelane.v8i16(<8 x i16> %src, i32 %src1, <8 x i16> %oldval)
+  store <8 x i16> %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 

>From 556dda2954b7d5d1109d4cb4b68aa957af0a0cd5 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 13 May 2024 09:40:57 -0400
Subject: [PATCH 15/30] align comments

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 74190a4e51abb..457566944069e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2190,9 +2190,9 @@ def int_amdgcn_readlane :
 // undefined.
 def int_amdgcn_writelane :
   Intrinsic<[llvm_any_ty], [
-    LLVMMatchType<0>,    // uniform value to write: returned by the selected lane
+    LLVMMatchType<0>,   // uniform value to write: returned by the selected lane
     llvm_i32_ty,        // uniform lane select
-    LLVMMatchType<0>     // returned by all lanes other than the selected one
+    LLVMMatchType<0>    // returned by all lanes other than the selected one
   ],
   [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;

>From b59873e0971949e41c8f5193da23fa196ee8b338 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 15 May 2024 20:13:16 +0000
Subject: [PATCH 16/30] Review comments

---
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     | 18 ++++++++++++++---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 20 +++++--------------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 10 ++--------
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |  9 +++++++--
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    | 16 +++++++++++----
 5 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 0bc151c65f14d..f9f75e752fa47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,9 +342,21 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
 
 def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
 
-def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", SDTIntBinOp>;
-def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", SDTIntUnaryOp>;
-def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDTIntTernaryOp>;
+def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [
+  SDTCisSameAs<0, 1>
+]>;
+
+def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
+  SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<1, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>,
+]>;
+
+def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
+def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
+def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
 
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 551a9c926b88d..bae27246e20c5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5424,26 +5424,16 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   unsigned Size = Ty.getSizeInBits();
 
   if (Size == 32) {
-    if (Ty.isScalar())
+    if (!Ty.isPointer()) {
       // Already legal
       return true;
-
-    auto IsPtr = Ty.isPointer();
-    Src0 = IsPtr ? B.buildPtrToInt(S32, Src0).getReg(0)
-                 : B.buildBitcast(S32, Src0).getReg(0);
-
-    if (Src2.isValid()) {
-      Src2 = IsPtr ? B.buildPtrToInt(S32, Src2).getReg(0)
-                   : B.buildBitcast(S32, Src2).getReg(0);
     }
+    Src0 = B.buildPtrToInt(S32, Src0).getReg(0);
+    if (Src2.isValid())
+      Src2 = B.buildPtrToInt(S32, Src2).getReg(0);
 
     Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
-
-    if (IsPtr)
-      B.buildIntToPtr(DstReg, LaneOpDst);
-    else
-      B.buildBitcast(DstReg, LaneOpDst);
-
+    B.buildIntToPtr(DstReg, LaneOpDst);
     MI.eraseFromParent();
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6105c5466d222..58fb7cf3c4ac8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6111,14 +6111,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   }
 
   if (ValSize == 32) {
-    if (VT == MVT::i32)
-      // Already legal
-      return SDValue();
-    Src0 = DAG.getBitcast(IntVT, Src0);
-    if (Src2.getNode())
-      Src2 = DAG.getBitcast(IntVT, Src2);
-    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
-    return DAG.getBitcast(VT, LaneOp);
+    // Already legal
+    return SDValue();
   }
 
   if (ValSize < 32) {
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index a963355f5cd92..9c5d6a7bf6d0b 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -243,11 +243,16 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
 // FIXME: Specify SchedRW for READFIRSTLANE_B32
 // TODO: There is VOP3 encoding also
 def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
-                                       getVOP1Pat<AMDGPUreadfirstlane,
-                                                  VOP_READFIRSTLANE>.ret, 1> {
+                                       [], 1> {
   let isConvergent = 1;
 }
 
+foreach vt = Reg32Types.types in {
+  def : GCNPat<(vt (AMDGPUreadfirstlane (vt VRegOrLdsSrc_32:$src0))),
+        (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
+  >;
+}
+
 let isReMaterializable = 1 in {
 let SchedRW = [WriteDoubleCvt] in {
 // OMod clears exceptions when set in this instruction
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9b23a304dc1f8..b1df57320cfdd 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -780,14 +780,22 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
 
 // These are special and do not read the exec mask.
 let isConvergent = 1, Uses = []<Register> in {
-def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
-  [(set i32:$vdst, (AMDGPUreadlane i32:$src0, i32:$src1))]>;
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,[]>;
 let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
-  [(set i32:$vdst, (AMDGPUwritelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>;
 } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
+foreach vt = Reg32Types.types in {
+  def : GCNPat<(vt (AMDGPUreadlane vt:$src0, i32:$src1)),
+        (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
+  >;
+
+  def : GCNPat<(vt (AMDGPUwritelane vt:$src0, i32:$src1, vt:$src2)),
+        (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
+  >;
+}
+
 let isReMaterializable = 1 in {
 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
 defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;

>From edd3179d3b241493a55dcc116ec7b5ad919955c7 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 16 May 2024 13:05:24 +0000
Subject: [PATCH 17/30] fix type profile

---
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index f9f75e752fa47..68d1985b51731 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -351,7 +351,7 @@ def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
 ]>;
 
 def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
-  SDTCisSameAs<1, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>,
+  SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>,
 ]>;
 
 def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;

>From a75eb6b4d4cb8e4e9d3ee9b256d506476da6b5a6 Mon Sep 17 00:00:00 2001
From: Vikram Hegde <115221833+vikramRH at users.noreply.github.com>
Date: Fri, 17 May 2024 11:24:31 +0530
Subject: [PATCH 18/30] remove spurious comma

---
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 68d1985b51731..e4f329b200c86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -351,7 +351,7 @@ def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
 ]>;
 
 def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
-  SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>,
+  SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
 ]>;
 
 def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;

>From 52d7020546d25a037a63a1df77fbf88b5a39ccbb Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Sat, 18 May 2024 08:52:36 +0000
Subject: [PATCH 19/30] review comments, move pointer tests to new files

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  12 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   4 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  50 ---
 .../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll   |  92 ++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  58 ----
 .../AMDGPU/llvm.amdgcn.readlane.ptr.ll        | 105 +++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 181 -----------
 .../AMDGPU/llvm.amdgcn.writelane.ptr.ll       | 292 ++++++++++++++++++
 8 files changed, 492 insertions(+), 302 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bae27246e20c5..6ffc8a20f76fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5424,17 +5424,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   unsigned Size = Ty.getSizeInBits();
 
   if (Size == 32) {
-    if (!Ty.isPointer()) {
-      // Already legal
-      return true;
-    }
-    Src0 = B.buildPtrToInt(S32, Src0).getReg(0);
-    if (Src2.isValid())
-      Src2 = B.buildPtrToInt(S32, Src2).getReg(0);
-
-    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
-    B.buildIntToPtr(DstReg, LaneOpDst);
-    MI.eraseFromParent();
+    // Already legal
     return true;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 58fb7cf3c4ac8..6c23bdf09974b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6095,8 +6095,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   SDLoc SL(N);
   MVT IntVT = MVT::getIntegerVT(ValSize);
 
-  auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2,
-                          MVT VT) -> SDValue {
+  auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
+                                  MVT VT) -> SDValue {
     return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
             : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
                    : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index eabf05e3d1033..732489f22c36f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -624,56 +624,6 @@ define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
   ret void
 }
 
-define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
-; CHECK-SDAG-LABEL: test_readfirstlane_p3:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s4
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_p3:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s4
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src)
-  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
-  ret void
-}
-
-define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) {
-; CHECK-SDAG-LABEL: test_readfirstlane_v3p3:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s[4:6]
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_v3p3:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s[4:6]
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src)
-  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
-  ret void
-}
-
 define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
 ; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
 ; CHECK-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
new file mode 100644
index 0000000000000..588f239606f52
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+  ret void
+}
+
+define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+  ret void
+}
+
+define void @test_readfirstlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p5:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(5) @llvm.amdgcn.readfirstlane.p5(ptr addrspace(5) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x)
+  ret void
+}
+
+define void @test_readfirstlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p5:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readfirstlane.v3p5(<3 x ptr addrspace(5)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x)
+  ret void
+}
+
+define void @test_readfirstlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p6:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(6) @llvm.amdgcn.readfirstlane.p6(ptr addrspace(6) %src)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x)
+  ret void
+}
+
+define void @test_readfirstlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p6:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readfirstlane.v3p6(<3 x ptr addrspace(6)> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index dd4deb76b90b3..71cd3db81addd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -928,64 +928,6 @@ define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1
   ret void
 }
 
-define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
-; CHECK-SDAG-LABEL: test_readlane_p3:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT:    s_nop 3
-; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s4
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readlane_p3:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v3
-; CHECK-GISEL-NEXT:    s_nop 3
-; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s4
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s4
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1)
-  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
-  ret void
-}
-
-define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
-; CHECK-SDAG-LABEL: test_readlane_v3p3:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
-; CHECK-SDAG-NEXT:    s_nop 3
-; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s[4:6]
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readlane_v3p3:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v5
-; CHECK-GISEL-NEXT:    s_nop 3
-; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s6
-; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s6
-; CHECK-GISEL-NEXT:    v_readlane_b32 s6, v4, s6
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s[4:6]
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1)
-  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
-  ret void
-}
-
 define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
 ; CHECK-SDAG-LABEL: test_readlane_v8i16:
 ; CHECK-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
new file mode 100644
index 0000000000000..1b4ee84c75250
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
+
+define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x)
+  ret void
+}
+
+define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p3:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x)
+  ret void
+}
+
+define void @test_readlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p5:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(5) @llvm.amdgcn.readlane.p5(ptr addrspace(5) %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x)
+  ret void
+}
+
+define void @test_readlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p5:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readlane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x)
+  ret void
+}
+
+define void @test_readlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p6:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s4
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr addrspace(6) @llvm.amdgcn.readlane.p6(ptr addrspace(6) %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x)
+  ret void
+}
+
+define void @test_readlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p6:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:6]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readlane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x)
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index e1083042a6f09..d0a865f565eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2928,187 +2928,6 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src
   ret void
 }
 
-define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
-; GFX802-SDAG-LABEL: test_writelane_p3:
-; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    s_nop 1
-; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
-; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-SDAG-LABEL: test_writelane_p3:
-; GFX1010-SDAG:       ; %bb.0:
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
-; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
-; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: test_writelane_p3:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
-; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX802-GISEL-LABEL: test_writelane_p3:
-; GFX802-GISEL:       ; %bb.0:
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT:    flat_load_dword v4, v[0:1]
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
-; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v4
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-GISEL-LABEL: test_writelane_p3:
-; GFX1010-GISEL:       ; %bb.0:
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-GISEL-NEXT:    global_load_dword v4, v[0:1], off
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v4, s4, s5
-; GFX1010-GISEL-NEXT:    global_store_dword v[0:1], v4, off
-; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: test_writelane_p3:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_load_b32 v4, v[0:1], off
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v4, s0, s1
-; GFX1100-GISEL-NEXT:    global_store_b32 v[0:1], v4, off
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %oldval = load ptr addrspace(3), ptr addrspace(1) %out
-  %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval)
-  store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
-; GFX802-SDAG-LABEL: test_writelane_v3p3:
-; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
-; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-SDAG-LABEL: test_writelane_v3p3:
-; GFX1010-SDAG:       ; %bb.0:
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
-; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
-; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: test_writelane_v3p3:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
-; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX802-GISEL-LABEL: test_writelane_v3p3:
-; GFX802-GISEL:       ; %bb.0:
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v6, s4, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v7, s6, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v8, s7, m0
-; GFX802-GISEL-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-GISEL-LABEL: test_writelane_v3p3:
-; GFX1010-GISEL:       ; %bb.0:
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-GISEL-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v6, s4, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v7, s6, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v8, s7, s5
-; GFX1010-GISEL-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
-; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: test_writelane_v3p3:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_load_b96 v[6:8], v[0:1], off
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v6, s0, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v7, s2, s1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v8, s3, s1
-; GFX1100-GISEL-NEXT:    global_store_b96 v[0:1], v[6:8], off
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out
-  %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval)
-  store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4
-  ret void
-}
-
 define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
 ; GFX802-SDAG-LABEL: test_writelane_v8i16:
 ; GFX802-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
new file mode 100644
index 0000000000000..afc394627d356
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
+
+define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p3:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p3:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p3:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load ptr addrspace(3), ptr addrspace(1) %out
+  %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval)
+  store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p3:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p3:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p3:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out
+  %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval)
+  store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p5:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p5:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p5:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load ptr addrspace(5), ptr addrspace(1) %out
+  %writelane = call ptr addrspace(5) @llvm.amdgcn.writelane.p5(ptr addrspace(5) %src, i32 %src1, ptr addrspace(5) %oldval)
+  store ptr addrspace(5) %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p5:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p5:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p5:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <3 x ptr addrspace(5)>, ptr addrspace(1) %out
+  %writelane = call <3 x ptr addrspace(5)> @llvm.amdgcn.writelane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1, <3 x ptr addrspace(5)> %oldval)
+  store <3 x ptr addrspace(5)> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p6:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 1
+; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p6:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p6:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load ptr addrspace(6), ptr addrspace(1) %out
+  %writelane = call ptr addrspace(6) @llvm.amdgcn.writelane.p6(ptr addrspace(6) %src, i32 %src1, ptr addrspace(6) %oldval)
+  store ptr addrspace(6) %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p6:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p6:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p6:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
+; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <3 x ptr addrspace(6)>, ptr addrspace(1) %out
+  %writelane = call <3 x ptr addrspace(6)> @llvm.amdgcn.writelane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1, <3 x ptr addrspace(6)> %oldval)
+  store <3 x ptr addrspace(6)> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}

>From 66ca57c48f7f0901e22f782412b964fb55f338ce Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 23 May 2024 08:55:52 +0000
Subject: [PATCH 20/30] remove bitcasts, avoid special handling of pointers in
 gisel

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 127 ++------
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  58 ----
 .../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll   |  34 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  66 -----
 .../AMDGPU/llvm.amdgcn.readlane.ptr.ll        |  38 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 277 ------------------
 .../AMDGPU/llvm.amdgcn.writelane.ptr.ll       | 133 +++++++++
 7 files changed, 235 insertions(+), 498 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6ffc8a20f76fa..6d79e67fc0021 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5387,6 +5387,7 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+// TODO: Fix pointer type handling
 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                          MachineInstr &MI,
                                          Intrinsic::ID IID) const {
@@ -5429,25 +5430,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   }
 
   if (Size < 32) {
-    Register Src0Cast = MRI.getType(Src0).isScalar()
-                            ? Src0
-                            : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
-    Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
-    if (Src2.isValid()) {
-      Register Src2Cast =
-          MRI.getType(Src2).isScalar()
-              ? Src2
-              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
-      Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
-    }
+    Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+    if (Src2.isValid())
+      Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
 
     Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
-    if (Ty.isScalar())
-      B.buildTrunc(DstReg, LaneOpDst);
-    else {
-      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
-      B.buildBitcast(DstReg, Trunc);
-    }
+    B.buildTrunc(DstReg, LaneOpDst);
 
     MI.eraseFromParent();
     return true;
@@ -5456,43 +5444,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   if ((Size % 32) == 0) {
     SmallVector<Register, 2> PartialRes;
     unsigned NumParts = Size / 32;
-    auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+    bool IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
     MachineInstrBuilder Src0Parts;
 
-    if (Ty.isPointer()) {
-      auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0);
-      Src0Parts = B.buildUnmerge(S32, PtrToInt);
-    } else if (Ty.isPointerVector()) {
-      LLT IntVecTy = Ty.changeElementType(
-          LLT::scalar(Ty.getElementType().getSizeInBits()));
-      auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0);
-      Src0Parts = B.buildUnmerge(S32, PtrToInt);
-    } else
-      Src0Parts =
-          IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
+    Src0Parts =
+        IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
 
     switch (IID) {
     case Intrinsic::amdgcn_readlane: {
       Register Src1 = MI.getOperand(3).getReg();
       for (unsigned i = 0; i < NumParts; ++i) {
-        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
-                        : Src0Parts.getReg(i);
-        PartialRes.push_back(
-            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
-                 .addUse(Src0)
-                 .addUse(Src1))
-                .getReg(0));
+        Src0 = Src0Parts.getReg(i);
+        PartialRes.push_back((B.buildIntrinsic(Intrinsic::amdgcn_readlane,
+                                               {IsS16Vec ? V2S16 : S32})
+                                  .addUse(Src0)
+                                  .addUse(Src1))
+                                 .getReg(0));
       }
       break;
     }
     case Intrinsic::amdgcn_readfirstlane: {
       for (unsigned i = 0; i < NumParts; ++i) {
-        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
-                        : Src0Parts.getReg(i);
-        PartialRes.push_back(
-            (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
-                 .addUse(Src0)
-                 .getReg(0)));
+        Src0 = Src0Parts.getReg(i);
+        PartialRes.push_back((B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                               {IsS16Vec ? V2S16 : S32})
+                                  .addUse(Src0)
+                                  .getReg(0)));
       }
 
       break;
@@ -5502,69 +5479,25 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
       Register Src2 = MI.getOperand(4).getReg();
       MachineInstrBuilder Src2Parts;
 
-      if (Ty.isPointer()) {
-        auto PtrToInt = B.buildPtrToInt(S64, Src2);
-        Src2Parts = B.buildUnmerge(S32, PtrToInt);
-      } else if (Ty.isPointerVector()) {
-        LLT IntVecTy = Ty.changeElementType(
-            LLT::scalar(Ty.getElementType().getSizeInBits()));
-        auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
-        Src2Parts = B.buildUnmerge(S32, PtrToInt);
-      } else
-        Src2Parts =
-            IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
+      Src2Parts =
+          IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
 
       for (unsigned i = 0; i < NumParts; ++i) {
-        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
-                        : Src0Parts.getReg(i);
-        Src2 = IsS16Vec ? B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0)
-                        : Src2Parts.getReg(i);
-        PartialRes.push_back(
-            (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
-                 .addUse(Src0)
-                 .addUse(Src1)
-                 .addUse(Src2))
-                .getReg(0));
+        Src0 = Src0Parts.getReg(i);
+        Src2 = Src2Parts.getReg(i);
+        PartialRes.push_back((B.buildIntrinsic(Intrinsic::amdgcn_writelane,
+                                               {IsS16Vec ? V2S16 : S32})
+                                  .addUse(Src0)
+                                  .addUse(Src1)
+                                  .addUse(Src2))
+                                 .getReg(0));
       }
 
       break;
     }
     }
 
-    if (Ty.isPointerVector()) {
-      unsigned PtrSize = Ty.getElementType().getSizeInBits();
-      SmallVector<Register, 2> PtrElements;
-      if (PtrSize == 32) {
-        // Handle 32 bit pointers
-        for (unsigned i = 0; i < NumParts; i++)
-          PtrElements.push_back(
-              B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0));
-      } else {
-        // Handle legalization of <? x [pointer type bigger than 32 bits]>
-        SmallVector<Register, 2> PtrParts;
-        unsigned NumS32Parts = PtrSize / 32;
-        unsigned PartIdx = 0;
-        for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) {
-          // Merge S32 components of a pointer element first.
-          for (; PartIdx < (j * NumS32Parts); PartIdx++)
-            PtrParts.push_back(PartialRes[PartIdx]);
-
-          auto MergedPtr =
-              B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts);
-          PtrElements.push_back(
-              B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0));
-          PtrParts.clear();
-        }
-      }
-
-      B.buildMergeLikeInstr(DstReg, PtrElements);
-    } else {
-      if (IsS16Vec) {
-        for (unsigned i = 0; i < NumParts; i++)
-          PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
-      }
-      B.buildMergeLikeInstr(DstReg, PartialRes);
-    }
+    B.buildMergeLikeInstr(DstReg, PartialRes);
 
     MI.eraseFromParent();
     return true;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 732489f22c36f..ed0da0d2a61a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -566,64 +566,6 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
   ret void
 }
 
-define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
-; CHECK-SDAG-LABEL: test_readfirstlane_p0:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s[4:5]
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_p0:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s[4:5]
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
-  call void asm sideeffect "; use $0", "s"(ptr %x)
-  ret void
-}
-
-define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
-; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s[4:9]
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readfirstlane_v3p0:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s[4:9]
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
-  call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
-  ret void
-}
-
 define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
 ; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
 ; CHECK-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
index 588f239606f52..3882a5f0f9f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -1,6 +1,40 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
 
+define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
+  call void asm sideeffect "; use $0", "s"(ptr %x)
+  ret void
+}
+
+define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:9]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+  ret void
+}
+
 define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) {
 ; CHECK-SDAG-LABEL: test_readfirstlane_p3:
 ; CHECK-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 71cd3db81addd..325a39abb588a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -862,72 +862,6 @@ define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src
   ret void
 }
 
-define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
-; CHECK-SDAG-LABEL: test_readlane_p0:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
-; CHECK-SDAG-NEXT:    s_nop 3
-; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s[4:5]
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readlane_p0:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
-; CHECK-GISEL-NEXT:    s_nop 3
-; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s5
-; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s5
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s[4:5]
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
-  call void asm sideeffect "; use $0", "s"(ptr %x)
-  ret void
-}
-
-define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
-; CHECK-SDAG-LABEL: test_readlane_v3p0:
-; CHECK-SDAG:       ; %bb.0:
-; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
-; CHECK-SDAG-NEXT:    s_nop 3
-; CHECK-SDAG-NEXT:    v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use s[4:9]
-; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; CHECK-GISEL-LABEL: test_readlane_v3p0:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s9, v8
-; CHECK-GISEL-NEXT:    s_nop 3
-; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v2, s9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s5, v3, s9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s6, v4, s9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s7, v5, s9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s8, v6, s9
-; CHECK-GISEL-NEXT:    v_readlane_b32 s9, v7, s9
-; CHECK-GISEL-NEXT:    ;;#ASMSTART
-; CHECK-GISEL-NEXT:    ; use s[4:9]
-; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
-  call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
-  ret void
-}
-
 define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
 ; CHECK-SDAG-LABEL: test_readlane_v8i16:
 ; CHECK-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
index 1b4ee84c75250..49f8ef391c230 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -1,6 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
 
+define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:5]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(ptr %x)
+  ret void
+}
+
+define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readlane_v3p0:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
+; CHECK-SDAG-NEXT:    s_nop 3
+; CHECK-SDAG-NEXT:    v_readlane_b32 s9, v7, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s8, v6, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s7, v5, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s6, v4, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s5, v3, s4
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
+; CHECK-SDAG-NEXT:    ;;#ASMSTART
+; CHECK-SDAG-NEXT:    ; use s[4:9]
+; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
+  call void asm sideeffect "; use $0", "s"(<3 x ptr> %x)
+  ret void
+}
+
 define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
 ; CHECK-SDAG-LABEL: test_readlane_p3:
 ; CHECK-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index d0a865f565eeb..31f1085dd76ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2651,283 +2651,6 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
   ret void
 }
 
-define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
-; GFX802-SDAG-LABEL: test_writelane_p0:
-; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v4
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    s_nop 0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s4, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v5, s5, m0
-; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-SDAG-LABEL: test_writelane_p0:
-; GFX1010-SDAG:       ; %bb.0:
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s4, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v5, s6, s5
-; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
-; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: test_writelane_p0:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    global_load_b64 v[5:6], v[0:1], off
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s0, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v5, s2, s1
-; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[5:6], off
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX802-GISEL-LABEL: test_writelane_p0:
-; GFX802-GISEL:       ; %bb.0:
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v5, s4, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v6, s6, m0
-; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-GISEL-LABEL: test_writelane_p0:
-; GFX1010-GISEL:       ; %bb.0:
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-GISEL-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v4
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v5, s4, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v6, s6, s5
-; GFX1010-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
-; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: test_writelane_p0:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_load_b64 v[5:6], v[0:1], off
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v4
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v5, s0, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v6, s2, s1
-; GFX1100-GISEL-NEXT:    global_store_b64 v[0:1], v[5:6], off
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %oldval = load ptr, ptr addrspace(1) %out
-  %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
-  store ptr %writelane, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) {
-; GFX802-SDAG-LABEL: test_writelane_v3p0:
-; GFX802-SDAG:       ; %bb.0:
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_add_u32_e32 v19, vcc, 16, v0
-; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[11:14], v[0:1]
-; GFX802-SDAG-NEXT:    v_addc_u32_e32 v20, vcc, 0, v1, vcc
-; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[15:18], v[19:20]
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v10
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v5
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v4
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s10, v3
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s11, v2
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v8
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v6
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX802-SDAG-NEXT:    v_writelane_b32 v14, s8, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v13, s9, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s10, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s11, m0
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    v_writelane_b32 v18, s4, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v17, s5, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s6, m0
-; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s7, m0
-; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
-; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
-; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-SDAG-LABEL: test_writelane_v3p0:
-; GFX1010-SDAG:       ; %bb.0:
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-SDAG-NEXT:    s_clause 0x1
-; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:16
-; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v4
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s12, v2
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v8
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
-; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v6
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v14, s4, s5
-; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v18, s9, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v17, s10, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v16, s11, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v15, s12, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s6, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v12, s7, s5
-; GFX1010-SDAG-NEXT:    v_writelane_b32 v11, s8, s5
-; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off
-; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off offset:16
-; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: test_writelane_v3p0:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    global_load_b128 v[11:14], v[0:1], off offset:16
-; GFX1100-SDAG-NEXT:    global_load_b128 v[15:18], v[0:1], off
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v9
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v7
-; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v14, s0, s1
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v18, s5, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v17, s6, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v16, s7, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v15, s8, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s2, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v12, s3, s1
-; GFX1100-SDAG-NEXT:    v_writelane_b32 v11, s4, s1
-; GFX1100-SDAG-NEXT:    s_clause 0x1
-; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[15:18], off
-; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[11:14], off offset:16
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX802-GISEL-LABEL: test_writelane_v3p0:
-; GFX802-GISEL:       ; %bb.0:
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX802-GISEL-NEXT:    v_add_u32_e32 v19, vcc, 16, v0
-; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[11:14], v[0:1]
-; GFX802-GISEL-NEXT:    v_addc_u32_e32 v20, vcc, 0, v1, vcc
-; GFX802-GISEL-NEXT:    flat_load_dwordx4 v[15:18], v[19:20]
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s12, v9
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v11, s4, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v12, s6, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v13, s7, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v14, s8, m0
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    v_writelane_b32 v15, s9, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v16, s10, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v17, s11, m0
-; GFX802-GISEL-NEXT:    v_writelane_b32 v18, s12, m0
-; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
-; GFX802-GISEL-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
-; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1010-GISEL-LABEL: test_writelane_v3p0:
-; GFX1010-GISEL:       ; %bb.0:
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-GISEL-NEXT:    s_clause 0x1
-; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off
-; GFX1010-GISEL-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:16
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s8, v5
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s9, v6
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX1010-GISEL-NEXT:    v_readfirstlane_b32 s12, v9
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v11, s4, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v12, s6, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v13, s7, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v14, s8, s5
-; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v15, s9, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v16, s10, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v17, s11, s5
-; GFX1010-GISEL-NEXT:    v_writelane_b32 v18, s12, s5
-; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off
-; GFX1010-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off offset:16
-; GFX1010-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: test_writelane_v3p0:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    s_clause 0x1
-; GFX1100-GISEL-NEXT:    global_load_b128 v[11:14], v[0:1], off
-; GFX1100-GISEL-NEXT:    global_load_b128 v[15:18], v[0:1], off offset:16
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX1100-GISEL-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v11, s0, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v12, s2, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v13, s3, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v14, s4, s1
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v15, s5, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v16, s6, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v17, s7, s1
-; GFX1100-GISEL-NEXT:    v_writelane_b32 v18, s8, s1
-; GFX1100-GISEL-NEXT:    s_clause 0x1
-; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[11:14], off
-; GFX1100-GISEL-NEXT:    global_store_b128 v[0:1], v[15:18], off offset:16
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %oldval = load <4 x ptr>, ptr addrspace(1) %out
-  %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval)
-  store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4
-  ret void
-}
-
 define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) {
 ; GFX802-SDAG-LABEL: test_writelane_v8i16:
 ; GFX802-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
index afc394627d356..edc1afe410a63 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -3,6 +3,139 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
 
+define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_p0:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_nop 0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v5, s5, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_p0:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s4, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v5, s6, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_p0:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s0, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v5, s2, s1
+; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[5:6], off
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load ptr, ptr addrspace(1) %out
+  %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
+  store ptr %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_v3p0:
+; GFX802-SDAG:       ; %bb.0:
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT:    v_add_u32_e32 v13, vcc, 16, v0
+; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[9:12], v[0:1]
+; GFX802-SDAG-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[15:16], v[13:14]
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v8
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v5
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v3
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v2
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s6, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s7, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v10, s8, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v9, s9, m0
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s4, m0
+; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s5, m0
+; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
+; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[13:14], v[15:16]
+; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_v3p0:
+; GFX1010-SDAG:       ; %bb.0:
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT:    s_clause 0x1
+; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[13:14], v[0:1], off offset:16
+; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v14, s4, s5
+; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v12, s7, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v11, s8, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v10, s9, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v9, s10, s5
+; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s6, s5
+; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
+; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[13:14], off offset:16
+; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_v3p0:
+; GFX1100-SDAG:       ; %bb.0:
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    global_load_b64 v[13:14], v[0:1], off offset:16
+; GFX1100-SDAG-NEXT:    global_load_b128 v[9:12], v[0:1], off
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v7
+; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v14, s0, s1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v12, s3, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v11, s4, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v10, s5, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v9, s6, s1
+; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s2, s1
+; GFX1100-SDAG-NEXT:    s_clause 0x1
+; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[9:12], off
+; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[13:14], off offset:16
+; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
+  %oldval = load <3 x ptr>, ptr addrspace(1) %out
+  %writelane = call <3 x ptr> @llvm.amdgcn.writelane.v3p0(<3 x ptr> %src, i32 %src1, <3 x ptr> %oldval)
+  store <3 x ptr> %writelane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
 ; GFX802-SDAG-LABEL: test_writelane_p3:
 ; GFX802-SDAG:       ; %bb.0:

>From c3e512c693b9a32cf188e14d01e58b0ebad36c4c Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 27 May 2024 05:08:32 +0000
Subject: [PATCH 21/30] Review comments, updated AMDGPUUsage.rst

---
 llvm/docs/AMDGPUUsage.rst                     | 17 ++++++++
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 39 +++++++++----------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 11 +++---
 3 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 51969be85648f..dc905488714ee 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1170,6 +1170,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
   :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specifies state.
 
+  llvm.amdgcn.readfirstlane                        Provides direct access to v_readfirstlane_b32. Returns the value in
+                                                   the lowest active lane of the input operand. Currently 
+                                                   implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types 
+                                                   whose sizes are multiples of 32-bit.
+
+  llvm.amdgcn.readlane                             Provides direct access to v_readlane_b32. Returns the value in the 
+                                                   specified lane of the first input operand. The second operand 
+                                                   specifies the lane to read from. Currently implemented
+                                                   for i16, i32, float, half, bf16, v2i16, v2f16 and types whose sizes
+                                                   are multiples of 32-bit.
+
+  llvm.amdgcn.writelane                            Provides direct access to v_writelane_b32. Writes value 
+                                                   in the first input operand to the specified lane of divergent 
+                                                   output. The second operand Specifies the lane to write. Currently 
+                                                   implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types 
+                                                   whose sizes are multiples of 32-bit.
+
   llvm.amdgcn.wave.reduce.umin                     Performs an arithmetic unsigned min reduction on the unsigned values
                                                    provided by each lane in the wavefront.
                                                    Intrinsic takes a hint for reduction strategy using second operand
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6d79e67fc0021..c736701514d36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5444,32 +5444,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   if ((Size % 32) == 0) {
     SmallVector<Register, 2> PartialRes;
     unsigned NumParts = Size / 32;
-    bool IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+    LLT PartialResTy =
+        Ty.isVector() && Ty.getElementType() == S16 ? V2S16 : S32;
     MachineInstrBuilder Src0Parts;
 
-    Src0Parts =
-        IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
+    Src0Parts = B.buildUnmerge(PartialResTy, Src0);
 
     switch (IID) {
     case Intrinsic::amdgcn_readlane: {
       Register Src1 = MI.getOperand(3).getReg();
       for (unsigned i = 0; i < NumParts; ++i) {
         Src0 = Src0Parts.getReg(i);
-        PartialRes.push_back((B.buildIntrinsic(Intrinsic::amdgcn_readlane,
-                                               {IsS16Vec ? V2S16 : S32})
-                                  .addUse(Src0)
-                                  .addUse(Src1))
-                                 .getReg(0));
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {PartialResTy})
+                 .addUse(Src0)
+                 .addUse(Src1))
+                .getReg(0));
       }
       break;
     }
     case Intrinsic::amdgcn_readfirstlane: {
       for (unsigned i = 0; i < NumParts; ++i) {
         Src0 = Src0Parts.getReg(i);
-        PartialRes.push_back((B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
-                                               {IsS16Vec ? V2S16 : S32})
-                                  .addUse(Src0)
-                                  .getReg(0)));
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {PartialResTy})
+                 .addUse(Src0)
+                 .getReg(0)));
       }
 
       break;
@@ -5479,18 +5479,17 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
       Register Src2 = MI.getOperand(4).getReg();
       MachineInstrBuilder Src2Parts;
 
-      Src2Parts =
-          IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
+      Src2Parts = B.buildUnmerge(PartialResTy, Src2);
 
       for (unsigned i = 0; i < NumParts; ++i) {
         Src0 = Src0Parts.getReg(i);
         Src2 = Src2Parts.getReg(i);
-        PartialRes.push_back((B.buildIntrinsic(Intrinsic::amdgcn_writelane,
-                                               {IsS16Vec ? V2S16 : S32})
-                                  .addUse(Src0)
-                                  .addUse(Src1)
-                                  .addUse(Src2))
-                                 .getReg(0));
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {PartialResTy})
+                 .addUse(Src0)
+                 .addUse(Src1)
+                 .addUse(Src2))
+                .getReg(0));
       }
 
       break;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6c23bdf09974b..cb95329839fcb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6116,15 +6116,16 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   }
 
   if (ValSize < 32) {
-    SDValue InitBitCast = DAG.getBitcast(IntVT, Src0);
-    Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32);
+    bool IsFloat = VT.isFloatingPoint();
+    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+                                SL, MVT::i32);
     if (Src2.getNode()) {
-      SDValue Src2Cast = DAG.getBitcast(IntVT, Src2);
-      Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32);
+      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+                                  SL, MVT::i32);
     }
     SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
     SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
-    return DAG.getBitcast(VT, Trunc);
+    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
   }
 
   if ((ValSize % 32) == 0) {

>From 72af37cb8ea68778a45005d61f828bb4999ccfe8 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 30 May 2024 13:33:59 +0000
Subject: [PATCH 22/30] preserve legel 32-bit pieces, update usage doc

---
 llvm/docs/AMDGPUUsage.rst                     | 31 +++++-----
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 56 ++++++++++++-------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 53 ++++++++++++++----
 3 files changed, 92 insertions(+), 48 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index dc905488714ee..70dd168e90727 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1170,22 +1170,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
   :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specifies state.
 
-  llvm.amdgcn.readfirstlane                        Provides direct access to v_readfirstlane_b32. Returns the value in
-                                                   the lowest active lane of the input operand. Currently 
-                                                   implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types 
-                                                   whose sizes are multiples of 32-bit.
-
-  llvm.amdgcn.readlane                             Provides direct access to v_readlane_b32. Returns the value in the 
-                                                   specified lane of the first input operand. The second operand 
-                                                   specifies the lane to read from. Currently implemented
-                                                   for i16, i32, float, half, bf16, v2i16, v2f16 and types whose sizes
-                                                   are multiples of 32-bit.
-
-  llvm.amdgcn.writelane                            Provides direct access to v_writelane_b32. Writes value 
-                                                   in the first input operand to the specified lane of divergent 
-                                                   output. The second operand Specifies the lane to write. Currently 
-                                                   implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types 
-                                                   whose sizes are multiples of 32-bit.
+  llvm.amdgcn.readfirstlane                        These intrinsics provide direct access to v_readfirstlane_b32, 
+  llvm.amdgcn.readlane                             v_readlane_b32 and v_writelane_b32 respectively.
+  llvm.amdgcn.writelane                             - `llvm.amdgcn.readfirstlane` Returns the value in the lowest active 
+                                                       lane of the input operand
+                                                    - `llvm.amdgcn.readlane` Returns the value in the specified lane of 
+                                                       the first input operand. The second operand specifies the lane to 
+                                                       read from.
+                                                    - `llvm.amdgcn.writelane` Writes value in the first input operand to 
+                                                       the specified lane of divergent output. The second operand specifies
+                                                       the lane to write.
+                                                   These are currently implemented for i16, i32, float, half, bf16, <2 x i16>, 
+                                                   <2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the 32-bit 
+                                                   vectors.
 
   llvm.amdgcn.wave.reduce.umin                     Performs an arithmetic unsigned min reduction on the unsigned values
                                                    provided by each lane in the wavefront.
@@ -1225,7 +1222,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    the output.
 
   llvm.amdgcn.sdot2                                Provides direct access to v_dot2_i32_i16 across targets which
-                                                   support such instructions. This performs signed dot product
+                                                   upport such instructions. This performs signed dot product
                                                    with two v2i16 operands, summed with the third i32 operand. The
                                                    i1 fourth operand is used to clamp the output.
                                                    When applicable (e.g. no clamping), this is lowered into
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c736701514d36..13abde8922264 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5441,28 +5441,45 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     return true;
   }
 
-  if ((Size % 32) == 0) {
-    SmallVector<Register, 2> PartialRes;
-    unsigned NumParts = Size / 32;
-    LLT PartialResTy =
-        Ty.isVector() && Ty.getElementType() == S16 ? V2S16 : S32;
-    MachineInstrBuilder Src0Parts;
+  if (Size % 32 != 0)
+    return false;
 
-    Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+  SmallVector<Register, 2> PartialRes;
+  unsigned NumParts = Size / 32;
+  LLT PartialResTy = S32;
 
-    switch (IID) {
-    case Intrinsic::amdgcn_readlane: {
-      Register Src1 = MI.getOperand(3).getReg();
-      for (unsigned i = 0; i < NumParts; ++i) {
-        Src0 = Src0Parts.getReg(i);
-        PartialRes.push_back(
-            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {PartialResTy})
-                 .addUse(Src0)
-                 .addUse(Src1))
-                .getReg(0));
-      }
+  if (Ty.isVector()) {
+    LLT EltTy = Ty.getElementType();
+    switch (EltTy.getSizeInBits()) {
+    case 16:
+      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
+      break;
+    case 32:
+      PartialResTy = EltTy;
+      break;
+    default:
+      // Handle all other cases via S32 pieces;
       break;
     }
+  }
+
+  MachineInstrBuilder Src0Parts;
+
+  Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+
+  switch (IID) {
+  case Intrinsic::amdgcn_readlane: {
+    Register Src1 = MI.getOperand(3).getReg();
+    for (unsigned i = 0; i < NumParts; ++i) {
+      Src0 = Src0Parts.getReg(i);
+      PartialRes.push_back(
+          (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {PartialResTy})
+               .addUse(Src0)
+               .addUse(Src1))
+              .getReg(0));
+    }
+    break;
+    }
     case Intrinsic::amdgcn_readfirstlane: {
       for (unsigned i = 0; i < NumParts; ++i) {
         Src0 = Src0Parts.getReg(i);
@@ -5500,9 +5517,6 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
     MI.eraseFromParent();
     return true;
-  }
-
-  return false;
 }
 
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cb95329839fcb..b1c9d3130228c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6128,19 +6128,52 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
   }
 
-  if ((ValSize % 32) == 0) {
-    MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
-    Src0 = DAG.getBitcast(VecVT, Src0);
-
-    if (Src2.getNode())
-      Src2 = DAG.getBitcast(VecVT, Src2);
+  if (ValSize % 32 != 0)
+    return SDValue();
 
-    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
-    SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
-    return DAG.getBitcast(VT, UnrolledLaneOp);
+  if (VT.isVector()) {
+    switch (MVT::SimpleValueType EltTy =
+                VT.getVectorElementType().getSimpleVT().SimpleTy) {
+    case MVT::i32:
+    case MVT::f32: {
+      SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+      return DAG.UnrollVectorOp(LaneOp.getNode());
+    }
+    case MVT::i16:
+    case MVT::f16:
+    case MVT::bf16: {
+      MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+      SmallVector<SDValue, 4> Pieces;
+      for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+        SDValue Src0SubVec =
+            DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
+                        DAG.getConstant(EltIdx, SL, MVT::i32));
+
+        SDValue Src2SubVec;
+        if (Src2)
+          Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
+                                   DAG.getConstant(EltIdx, SL, MVT::i32));
+
+        Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
+        EltIdx += 2;
+      }
+      return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
+    }
+    default:
+      // Handle all other cases by bitcasting to i32 vectors
+      break;
+    }
   }
 
-  return SDValue();
+  MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+  Src0 = DAG.getBitcast(VecVT, Src0);
+
+  if (Src2)
+    Src2 = DAG.getBitcast(VecVT, Src2);
+
+  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+  SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+  return DAG.getBitcast(VT, UnrolledLaneOp);
 }
 
 void SITargetLowering::ReplaceNodeResults(SDNode *N,

>From 2e4c5bc624196e50e79b7eb7760094f0a8f3e8d2 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 30 May 2024 17:09:21 +0000
Subject: [PATCH 23/30] Refactor GIsel lowering

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 80 +++++--------------
 1 file changed, 20 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 13abde8922264..94fcabda69408 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5395,12 +5395,9 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   MachineIRBuilder &B = Helper.MIRBuilder;
   MachineRegisterInfo &MRI = *B.getMRI();
 
-  Register DstReg = MI.getOperand(0).getReg();
-  Register Src0 = MI.getOperand(2).getReg();
-
-  auto createLaneOp = [&](Register Src0, Register Src1,
-                          Register Src2) -> Register {
-    auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+  auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
+                                 LLT VT) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
     switch (IID) {
     case Intrinsic::amdgcn_readfirstlane:
       return LaneOp.getReg(0);
@@ -5413,6 +5410,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     }
   };
 
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
   Register Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
     Src1 = MI.getOperand(3).getReg();
@@ -5434,7 +5433,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     if (Src2.isValid())
       Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
 
-    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
     B.buildTrunc(DstReg, LaneOpDst);
 
     MI.eraseFromParent();
@@ -5444,10 +5443,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   if (Size % 32 != 0)
     return false;
 
-  SmallVector<Register, 2> PartialRes;
-  unsigned NumParts = Size / 32;
   LLT PartialResTy = S32;
-
   if (Ty.isVector()) {
     LLT EltTy = Ty.getElementType();
     switch (EltTy.getSizeInBits()) {
@@ -5463,60 +5459,24 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     }
   }
 
-  MachineInstrBuilder Src0Parts;
-
+  SmallVector<Register, 2> PartialRes;
+  unsigned NumParts = Size / 32;
+  MachineInstrBuilder Src0Parts, Src2Parts;
   Src0Parts = B.buildUnmerge(PartialResTy, Src0);
 
-  switch (IID) {
-  case Intrinsic::amdgcn_readlane: {
-    Register Src1 = MI.getOperand(3).getReg();
-    for (unsigned i = 0; i < NumParts; ++i) {
-      Src0 = Src0Parts.getReg(i);
-      PartialRes.push_back(
-          (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {PartialResTy})
-               .addUse(Src0)
-               .addUse(Src1))
-              .getReg(0));
-    }
-    break;
-    }
-    case Intrinsic::amdgcn_readfirstlane: {
-      for (unsigned i = 0; i < NumParts; ++i) {
-        Src0 = Src0Parts.getReg(i);
-        PartialRes.push_back(
-            (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {PartialResTy})
-                 .addUse(Src0)
-                 .getReg(0)));
-      }
-
-      break;
-    }
-    case Intrinsic::amdgcn_writelane: {
-      Register Src1 = MI.getOperand(3).getReg();
-      Register Src2 = MI.getOperand(4).getReg();
-      MachineInstrBuilder Src2Parts;
-
-      Src2Parts = B.buildUnmerge(PartialResTy, Src2);
-
-      for (unsigned i = 0; i < NumParts; ++i) {
-        Src0 = Src0Parts.getReg(i);
-        Src2 = Src2Parts.getReg(i);
-        PartialRes.push_back(
-            (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {PartialResTy})
-                 .addUse(Src0)
-                 .addUse(Src1)
-                 .addUse(Src2))
-                .getReg(0));
-      }
-
-      break;
-    }
-    }
+  if (Src2.isValid())
+    Src2Parts = B.buildUnmerge(PartialResTy, Src2);
 
-    B.buildMergeLikeInstr(DstReg, PartialRes);
+  for (unsigned i = 0; i < NumParts; ++i) {
+    Src0 = Src0Parts.getReg(i);
+    if (Src2.isValid())
+      Src2 = Src2Parts.getReg(i);
+    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
+  }
 
-    MI.eraseFromParent();
-    return true;
+  B.buildMergeLikeInstr(DstReg, PartialRes);
+  MI.eraseFromParent();
+  return true;
 }
 
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,

>From 67e19e5f2efb8066c37ad26c1cbe9b335aaf931b Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 30 May 2024 18:32:47 +0000
Subject: [PATCH 24/30] fix documentation mess

---
 llvm/docs/AMDGPUUsage.rst | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 70dd168e90727..a2a557e82fd46 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1170,19 +1170,22 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
   :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specifies state.
 
-  llvm.amdgcn.readfirstlane                        These intrinsics provide direct access to v_readfirstlane_b32, 
-  llvm.amdgcn.readlane                             v_readlane_b32 and v_writelane_b32 respectively.
-  llvm.amdgcn.writelane                             - `llvm.amdgcn.readfirstlane` Returns the value in the lowest active 
-                                                       lane of the input operand
-                                                    - `llvm.amdgcn.readlane` Returns the value in the specified lane of 
-                                                       the first input operand. The second operand specifies the lane to 
-                                                       read from.
-                                                    - `llvm.amdgcn.writelane` Writes value in the first input operand to 
-                                                       the specified lane of divergent output. The second operand specifies
-                                                       the lane to write.
-                                                   These are currently implemented for i16, i32, float, half, bf16, <2 x i16>, 
-                                                   <2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the 32-bit 
-                                                   vectors.
+  llvm.amdgcn.readfirstlane                        Provides direct access to v_readfirstlane_b32. Returns the value in
+                                                   the lowest active lane of the input operand. Currently implemented
+                                                   for i16, i32, float, half, bf16, <2 x i16>, <2 x half>, <2 x bfloat>,
+                                                   i64, double, pointers, multiples of the 32-bit vectors.
+
+  llvm.amdgcn.readlane                             Provides direct access to v_readlane_b32. Returns the value in the
+                                                   specified lane of the first input operand. The second operand specifies
+                                                   the lane to read from. Currently implemented for i16, i32, float, half,
+                                                   bf16, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+                                                   multiples of the 32-bit vectors.
+
+  llvm.amdgcn.writelane                            Provides direct access to v_writelane_b32. Writes value in the first input
+                                                   operand to the specified lane of divergent output. The second operand
+                                                   specifies the lane to write. Currently implemented for i16, i32, float,
+                                                   half, bf16, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+                                                   multiples of the 32-bit vectors.
 
   llvm.amdgcn.wave.reduce.umin                     Performs an arithmetic unsigned min reduction on the unsigned values
                                                    provided by each lane in the wavefront.

>From cba2b1df94cffa13d8d5bf8f8407bc78fcff963b Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Thu, 30 May 2024 19:20:26 +0000
Subject: [PATCH 25/30] review comments

---
 llvm/docs/AMDGPUUsage.rst                      | 8 ++++----
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 +--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index a2a557e82fd46..a8adbea459a95 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1172,19 +1172,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
   llvm.amdgcn.readfirstlane                        Provides direct access to v_readfirstlane_b32. Returns the value in
                                                    the lowest active lane of the input operand. Currently implemented
-                                                   for i16, i32, float, half, bf16, <2 x i16>, <2 x half>, <2 x bfloat>,
+                                                   for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
                                                    i64, double, pointers, multiples of the 32-bit vectors.
 
   llvm.amdgcn.readlane                             Provides direct access to v_readlane_b32. Returns the value in the
                                                    specified lane of the first input operand. The second operand specifies
                                                    the lane to read from. Currently implemented for i16, i32, float, half,
-                                                   bf16, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+                                                   bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
                                                    multiples of the 32-bit vectors.
 
   llvm.amdgcn.writelane                            Provides direct access to v_writelane_b32. Writes value in the first input
                                                    operand to the specified lane of divergent output. The second operand
                                                    specifies the lane to write. Currently implemented for i16, i32, float,
-                                                   half, bf16, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+                                                   half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
                                                    multiples of the 32-bit vectors.
 
   llvm.amdgcn.wave.reduce.umin                     Performs an arithmetic unsigned min reduction on the unsigned values
@@ -1225,7 +1225,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    the output.
 
   llvm.amdgcn.sdot2                                Provides direct access to v_dot2_i32_i16 across targets which
-                                                   upport such instructions. This performs signed dot product
+                                                   support such instructions. This performs signed dot product
                                                    with two v2i16 operands, summed with the third i32 operand. The
                                                    i1 fourth operand is used to clamp the output.
                                                    When applicable (e.g. no clamping), this is lowered into
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 94fcabda69408..53ad81a610b3d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5461,8 +5461,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
   SmallVector<Register, 2> PartialRes;
   unsigned NumParts = Size / 32;
-  MachineInstrBuilder Src0Parts, Src2Parts;
-  Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0), Src2Parts;
 
   if (Src2.isValid())
     Src2Parts = B.buildUnmerge(PartialResTy, Src2);

>From 26223c8a0019b794b3333ea5c1e609db67b448f9 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 31 May 2024 01:03:11 -0400
Subject: [PATCH 26/30] handle comment

---
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 53ad81a610b3d..974238f7829da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5461,7 +5461,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
   SmallVector<Register, 2> PartialRes;
   unsigned NumParts = Size / 32;
-  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0), Src2Parts;
+  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+  MachineInstrBuilder Src2Parts;
 
   if (Src2.isValid())
     Src2Parts = B.buildUnmerge(PartialResTy, Src2);

>From 429fb0fad3345cec63271b6ae3a3689bdeb30c4c Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 31 May 2024 14:30:06 +0000
Subject: [PATCH 27/30] Review comments

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp         | 3 +++
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td             | 6 +++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp             | 2 ++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll | 5 ++---
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll      | 7 +++----
 5 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9cd7496ba9f96..cc92b0b253887 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -902,6 +902,9 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
     return false;
   case AMDGPUISD::SETCC: // ballot-style instruction
     return true;
+  case AMDGPUISD::READFIRSTLANE:
+  case AMDGPUISD::READLANE:
+    return true;
   }
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e4f329b200c86..c34b43bfca9e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -354,9 +354,9 @@ def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
   SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
 ]>;
 
-def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
-def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
-def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
+def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp, [SDNPOptInGlue]>;
+def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp, [SDNPOptInGlue]>;
+def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp, [SDNPOptInGlue]>;
 
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b1c9d3130228c..2981cc684232a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16128,6 +16128,8 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
   case AMDGPUISD::BUFFER_ATOMIC_FMAX:
     // Target-specific read-modify-write atomics are sources of divergence.
     return true;
+  case AMDGPUISD::WRITELANE:
+    return true;
   default:
     if (auto *A = dyn_cast<AtomicSDNode>(N)) {
       // Generic read-modify-write atomics are sources of divergence.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index ed0da0d2a61a2..cc6c630ae6466 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -463,10 +463,9 @@ define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0xffff
-; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT:    s_and_b32 s4, s4, 0xffff
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use v0
+; CHECK-SDAG-NEXT:    ; use s4
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 325a39abb588a..66e1f9396de5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -743,12 +743,11 @@ define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0xffff
-; CHECK-SDAG-NEXT:    s_nop 2
+; CHECK-SDAG-NEXT:    s_nop 3
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-SDAG-NEXT:    s_and_b32 s4, s4, 0xffff
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
-; CHECK-SDAG-NEXT:    ; use v0
+; CHECK-SDAG-NEXT:    ; use s4
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;

>From ec7b5c116c9f6e690fbaf55be1eac65dc8cb7847 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 3 Jun 2024 08:15:00 +0000
Subject: [PATCH 28/30] test for convergence related crash

---
 .../test/Verifier/convergence-laneops-xfail.ll | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 llvm/test/Verifier/convergence-laneops-xfail.ll

diff --git a/llvm/test/Verifier/convergence-laneops-xfail.ll b/llvm/test/Verifier/convergence-laneops-xfail.ll
new file mode 100644
index 0000000000000..712832f4139f3
--- /dev/null
+++ b/llvm/test/Verifier/convergence-laneops-xfail.ll
@@ -0,0 +1,18 @@
+; RUN: not --crash llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s 2>&1 | FileCheck %s
+
+; CHECK: *** Bad machine code: Cannot mix controlled and uncontrolled convergence in the same function. ***
+; CHECK: function:    basic_branch_i64
+define i64 @basic_branch_i64(i64 %src, i1 %cond) #0 {
+entry:
+  %t = call token @llvm.experimental.convergence.anchor()
+  %x = add i64 %src, 1
+  br i1 %cond, label %then, label %else
+
+then:
+  %r = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %x) [ "convergencectrl"(token %t) ]
+  br label %else
+
+else:
+  %p = phi i64 [%r, %then], [%x, %entry]
+  ret i64 %p
+}
\ No newline at end of file

>From 3d9cf2ea12c27c09eac1ad7aa0ea8e586304fb8f Mon Sep 17 00:00:00 2001
From: Vikram Hegde <115221833+vikramRH at users.noreply.github.com>
Date: Mon, 3 Jun 2024 13:47:48 +0530
Subject: [PATCH 29/30] Update convergence-laneops-xfail.ll

---
 llvm/test/Verifier/convergence-laneops-xfail.ll | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Verifier/convergence-laneops-xfail.ll b/llvm/test/Verifier/convergence-laneops-xfail.ll
index 712832f4139f3..370592194c0fd 100644
--- a/llvm/test/Verifier/convergence-laneops-xfail.ll
+++ b/llvm/test/Verifier/convergence-laneops-xfail.ll
@@ -15,4 +15,5 @@ then:
 else:
   %p = phi i64 [%r, %then], [%x, %entry]
   ret i64 %p
-}
\ No newline at end of file
+}
+

>From 482f3800e1138171e2bf59ed390f2bdd4729fdf9 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 12 Jun 2024 09:27:22 +0000
Subject: [PATCH 30/30] update convergence related failure tests

---
 .../convergence-readfirstlane64-xfail.mir     | 46 +++++++++++++++
 .../AMDGPU/convergence-readlane64-xfail.mir   | 49 ++++++++++++++++
 .../AMDGPU/convergence-writelane64-xfail.mir  | 58 +++++++++++++++++++
 .../Verifier/convergence-laneops-xfail.ll     | 19 ------
 4 files changed, 153 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readfirstlane64-xfail.mir
 create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readlane64-xfail.mir
 create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-writelane64-xfail.mir
 delete mode 100644 llvm/test/Verifier/convergence-laneops-xfail.ll

diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readfirstlane64-xfail.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readfirstlane64-xfail.mir
new file mode 100644
index 0000000000000..e52a072c05198
--- /dev/null
+++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readfirstlane64-xfail.mir
@@ -0,0 +1,46 @@
+# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
+---
+name:            basic_branch_i64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %7:vgpr_32 = COPY $vgpr2
+    %6:vgpr_32 = COPY $vgpr1
+    %5:vgpr_32 = COPY $vgpr0
+    %8:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+    %9:vgpr_32 = V_AND_B32_e64 1, %7, implicit $exec
+    %10:sreg_64 = V_CMP_EQ_U32_e64 killed %9, 1, implicit $exec
+    %0:sreg_64 = CONVERGENCECTRL_ANCHOR
+    %11:sreg_64 = S_MOV_B64 1
+    %1:vreg_64 = V_ADD_U64_PSEUDO killed %8, killed %11, implicit-def dead $vcc, implicit $exec
+    %2:sreg_64 = SI_IF killed %10, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %12:vgpr_32 = COPY %1.sub1
+    ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function.
+    ; CHECK: V_READFIRSTLANE_B32 killed %{{[0-9]+}}:vgpr_32, implicit $exec
+    %13:sreg_32 = V_READFIRSTLANE_B32 killed %12, implicit $exec
+    %14:vgpr_32 = COPY %1.sub0
+    ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function.
+    ; CHECK: V_READFIRSTLANE_B32 killed %{{[0-9]+}}:vgpr_32, implicit $exec
+    %15:sreg_32 = V_READFIRSTLANE_B32 killed %14, implicit $exec
+    %16:sreg_64 = REG_SEQUENCE killed %15, %subreg.sub0, killed %13, %subreg.sub1
+    %3:sreg_64 = COPY %16
+
+  bb.2:
+    %4:vreg_64 = PHI %1, %bb.0, %3, %bb.1
+    SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %17:vgpr_32 = COPY %4.sub0
+    %18:vgpr_32 = COPY %4.sub1
+    $vgpr0 = COPY %17
+    $vgpr1 = COPY %18
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+...
+
diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readlane64-xfail.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readlane64-xfail.mir
new file mode 100644
index 0000000000000..232782cca2713
--- /dev/null
+++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-readlane64-xfail.mir
@@ -0,0 +1,49 @@
+# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
+---
+name:            basic_branch_i64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    %8:vgpr_32 = COPY $vgpr3
+    %7:vgpr_32 = COPY $vgpr2
+    %6:vgpr_32 = COPY $vgpr1
+    %5:vgpr_32 = COPY $vgpr0
+    %9:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+    %10:vgpr_32 = V_AND_B32_e64 1, %7, implicit $exec
+    %11:sreg_64 = V_CMP_EQ_U32_e64 killed %10, 1, implicit $exec
+    %0:sreg_64 = CONVERGENCECTRL_ANCHOR
+    %12:sreg_64 = S_MOV_B64 1
+    %1:vreg_64 = V_ADD_U64_PSEUDO killed %9, killed %12, implicit-def dead $vcc, implicit $exec
+    %2:sreg_64 = SI_IF killed %11, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %13:vgpr_32 = COPY %1.sub1
+    %15:sreg_32 = COPY %8
+    ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function.
+    ; CHECK: V_READLANE_B32 killed %{{[0-9]+}}:vgpr_32, %{{[0-9]+}}:sreg_32
+    %14:sreg_32 = V_READLANE_B32 killed %13, %15
+    %16:vgpr_32 = COPY %1.sub0
+    %18:sreg_32 = COPY %8
+    ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function.
+    ; CHECK: V_READLANE_B32 killed %{{[0-9]+}}:vgpr_32, %{{[0-9]+}}:sreg_32
+    %17:sreg_32 = V_READLANE_B32 killed %16, %18
+    %19:sreg_64 = REG_SEQUENCE killed %17, %subreg.sub0, killed %14, %subreg.sub1
+    %3:sreg_64 = COPY %19
+
+  bb.2:
+    %4:vreg_64 = PHI %1, %bb.0, %3, %bb.1
+    SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %20:vgpr_32 = COPY %4.sub0
+    %21:vgpr_32 = COPY %4.sub1
+    $vgpr0 = COPY %20
+    $vgpr1 = COPY %21
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+...
+
diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-writelane64-xfail.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-writelane64-xfail.mir
new file mode 100644
index 0000000000000..d0caf9e7b34d0
--- /dev/null
+++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/convergence-writelane64-xfail.mir
@@ -0,0 +1,58 @@
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
+---
+name:            basic_branch_i64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    %10:vgpr_32 = COPY $vgpr5
+    %9:vgpr_32 = COPY $vgpr4
+    %8:vgpr_32 = COPY $vgpr3
+    %7:vgpr_32 = COPY $vgpr2
+    %6:vgpr_32 = COPY $vgpr1
+    %5:vgpr_32 = COPY $vgpr0
+    %12:sreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1
+    %13:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+    %14:vgpr_32 = V_AND_B32_e64 1, %7, implicit $exec
+    %15:sreg_32 = V_CMP_EQ_U32_e64 killed %14, 1, implicit $exec
+    %11:vreg_64 = COPY %12
+    %0:sreg_64 = CONVERGENCECTRL_ANCHOR
+    %16:sreg_64 = S_MOV_B64 1
+    %1:vreg_64 = V_ADD_U64_PSEUDO killed %13, killed %16, implicit-def dead $vcc, implicit $exec
+    %2:sreg_32 = SI_IF killed %15, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    %17:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 0, 0, implicit $exec
+    %18:vgpr_32 = COPY %1.sub1
+    %19:vgpr_32 = COPY %17.sub1
+    %21:sreg_32 = COPY %18
+    %22:sreg_32 = COPY %8
+    ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function.
+    ; CHECK: V_WRITELANE_B32 killed %{{[0-9]+}}:sreg_32, %{{[0-9]+}}:sreg_32, %{{[0-9]+}}:vgpr_32(tied-def 0)
+    %20:vgpr_32 = V_WRITELANE_B32 killed %21, %22, %19
+    %23:vgpr_32 = COPY %1.sub0
+    %24:vgpr_32 = COPY %17.sub0
+    %26:sreg_32 = COPY %23
+    %27:sreg_32 = COPY %8
+    ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function.
+    ; CHECK: V_WRITELANE_B32 killed %{{[0-9]+}}:sreg_32, %{{[0-9]+}}:sreg_32, %{{[0-9]+}}:vgpr_32(tied-def 0)
+    %25:vgpr_32 = V_WRITELANE_B32 killed %26, %27, %24
+    %28:sreg_64 = REG_SEQUENCE killed %25, %subreg.sub0, killed %20, %subreg.sub1
+    %3:vreg_64 = COPY %28
+
+  bb.2:
+    %4:vreg_64 = PHI %1, %bb.0, %3, %bb.1
+    SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %29:vgpr_32 = COPY %4.sub0
+    %30:vgpr_32 = COPY %4.sub1
+    $vgpr0 = COPY %29
+    $vgpr1 = COPY %30
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+
+...
+
diff --git a/llvm/test/Verifier/convergence-laneops-xfail.ll b/llvm/test/Verifier/convergence-laneops-xfail.ll
deleted file mode 100644
index 370592194c0fd..0000000000000
--- a/llvm/test/Verifier/convergence-laneops-xfail.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: not --crash llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s 2>&1 | FileCheck %s
-
-; CHECK: *** Bad machine code: Cannot mix controlled and uncontrolled convergence in the same function. ***
-; CHECK: function:    basic_branch_i64
-define i64 @basic_branch_i64(i64 %src, i1 %cond) #0 {
-entry:
-  %t = call token @llvm.experimental.convergence.anchor()
-  %x = add i64 %src, 1
-  br i1 %cond, label %then, label %else
-
-then:
-  %r = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %x) [ "convergencectrl"(token %t) ]
-  br label %else
-
-else:
-  %p = phi i64 [%r, %then], [%x, %entry]
-  ret i64 %p
-}
-



More information about the cfe-commits mailing list