[llvm-branch-commits] [llvm] [AMDGPU] Add gfx1251 V_PK_LSHL_ADD_U64 (PR #203612)
Stanislav Mekhanoshin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jun 12 12:22:32 PDT 2026
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/203612
None
>From 94c3745351c5d821b79c67ade9a14555bf7357c0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Fri, 12 Jun 2026 12:21:47 -0700
Subject: [PATCH] [AMDGPU] Add gfx1251 V_PK_LSHL_ADD_U64
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 +
llvm/lib/Target/AMDGPU/SOPInstructions.td | 14 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 46 ++
llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll | 241 ++++++
llvm/test/CodeGen/AMDGPU/shl.v2i64.ll | 736 ++++++++++++++++++
llvm/test/MC/AMDGPU/gfx1251_asm_vop3p.s | 52 ++
llvm/test/MC/AMDGPU/gfx1251_err.s | 34 +
.../AMDGPU/gfx1251_dasm_vop3p.txt | 39 +
10 files changed, 1167 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/shl.v2i64.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f4f2f9e3632d8..7e2ad6ffeeebb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -920,8 +920,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasPackedU64Ops()) {
- setOperationAction({ISD::ADD, ISD::SUB}, MVT::v2i64, Legal);
- setOperationAction({ISD::ADD, ISD::SUB},
+ setOperationAction({ISD::ADD, ISD::SUB, ISD::SHL}, MVT::v2i64, Legal);
+ setOperationAction({ISD::ADD, ISD::SUB, ISD::SHL},
{MVT::v4i64, MVT::v8i64, MVT::v16i64, MVT::v32i64},
Custom);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b3fffba0bd1a3..8c30e53e9b4e4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -3186,6 +3186,7 @@ def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
def VOP_V2F64_V2F64_V2F64 : VOPProfile <[v2f64, v2f64, v2f64, untyped]>;
def VOP_V2F64_V2F64_V2F64_V2F64 : VOPProfile <[v2f64, v2f64, v2f64, v2f64]>;
def VOP_V2I64_V2I64_V2I64 : VOPProfile <[v2i64, v2i64, v2i64, untyped]>;
+def VOP_V2I64_V2I64_V2I32_V2I64 : VOPProfile <[v2i64, v2i64, v2i32, v2i64]>;
def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index ef4333ef055cd..01dd444c477e2 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2132,6 +2132,20 @@ def : GCNPat<
>;
}
+// v2i64 shl is legal, so uniform shifts need a pattern that selects scalar code.
+let SubtargetPredicate = HasPackedU64Ops in {
+def : GCNPat<
+ (v2i64 (UniformBinFrag<cshl_64> v2i64:$src0, v2i64:$src1)),
+ (v2i64 (REG_SEQUENCE SReg_128,
+ (i64 (COPY_TO_REGCLASS
+ (S_LSHL_B64 (i64 (EXTRACT_SUBREG $src0, sub0_sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub0))), SGPR_64)), sub0_sub1,
+ (i64 (COPY_TO_REGCLASS
+ (S_LSHL_B64 (i64 (EXTRACT_SUBREG $src0, sub2_sub3)),
+ (i32 (EXTRACT_SUBREG $src1, sub2))), SGPR_64)), sub2_sub3))
+>;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c1e9bc6336eb6..7c283be411b20 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3857,6 +3857,8 @@ bool isPacked64BitInst(unsigned Opc) {
case AMDGPU::V_PK_ADD_NC_U64_gfx1250:
case AMDGPU::V_PK_SUB_NC_U64:
case AMDGPU::V_PK_SUB_NC_U64_gfx1250:
+ case AMDGPU::V_PK_LSHL_ADD_U64:
+ case AMDGPU::V_PK_LSHL_ADD_U64_gfx1250:
return true;
default:
return false;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d6eb02d9c9c36..79250421f5df7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -148,6 +148,18 @@ def V_PK_VOP2_I64_Profile : VOP3P_Profile<VOP_V2I64_V2I64_V2I64, VOP3_PACKED> {
let Src1RC64 = VSrc_v2b64;
}
+def V_PK_LSHL_ADD_U64_Profile : VOP3P_Profile<VOP_V2I64_V2I64_V2I32_V2I64, VOP3_PACKED> {
+ let HasModifiers = 0;
+ let HasOpSel = 0;
+ let HasClamp = 0;
+ let Src0RC64 = VSrc_v2b64;
+ let Src1RC64 = VSrc_v2b32;
+ let Src2RC64 = VSrc_v2b64;
+ let InsVOP3P = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
+}
+
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
@@ -193,6 +205,7 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2
let SubtargetPredicate = HasPackedU64Ops, SchedRW = [Write64Bit] in {
defm V_PK_SUB_NC_U64 : VOP3PInst<"v_pk_sub_nc_u64", V_PK_VOP2_I64_Profile, sub>;
+defm V_PK_LSHL_ADD_U64 : VOP3PInst<"v_pk_lshl_add_u64", V_PK_LSHL_ADD_U64_Profile>;
} // End SubtargetPredicate = HasPackedU64Ops, , SchedRW = [Write64Bit]
} // End isReMaterializable = 1
@@ -556,6 +569,38 @@ def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>;
def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>;
}
+let SubtargetPredicate = HasPackedU64Ops in {
+let AddedComplexity = 5 in
+def : GCNPat<
+ (v2i64 (DivergentBinFrag<shl_0_to_4> v2i64:$src0, v2i64:$src1)),
+ (v2i64 (V_PK_LSHL_ADD_U64 $src0,
+ (v2i32 (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src1, sub0)), sub0,
+ (i32 (EXTRACT_SUBREG $src1, sub2)), sub1)),
+ (v2i64 (as_i64imm (i32 0)))))
+>;
+
+def : GCNPat<
+ (v2i64 (DivergentBinFrag<cshl_64> v2i64:$src0, v2i64:$src1)),
+ (v2i64 (REG_SEQUENCE VReg_128,
+ (V_LSHLREV_B64_pseudo_e64 (i32 (EXTRACT_SUBREG $src1, sub0)),
+ (EXTRACT_SUBREG $src0, sub0_sub1)),
+ sub0_sub1,
+ (V_LSHLREV_B64_pseudo_e64 (i32 (EXTRACT_SUBREG $src1, sub2)),
+ (EXTRACT_SUBREG $src0, sub2_sub3)),
+ sub2_sub3))
+>;
+
+def : GCNPat<
+ (ThreeOpFrag<shl_0_to_4, add> v2i64:$src0, v2i64:$src1, v2i64:$src2),
+ (V_PK_LSHL_ADD_U64 VSrc_v2b64:$src0,
+ (v2i32 (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src1, sub0)), sub0,
+ (i32 (EXTRACT_SUBREG $src1, sub2)), sub1)),
+ VSrc_v2b64:$src2)
+>;
+} // End SubtargetPredicate = HasPackedU64Ops
+
// Defines patterns that extract signed 4bit from each Idx[0].
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
@@ -2727,6 +2772,7 @@ defm V_PK_ADD_NC_U64 : VOP3P_Real_gfx1250<0x4c>;
defm V_PK_SUB_NC_U64 : VOP3P_Real_gfx1250<0x4d>;
defm V_PK_MAX_NUM_F64 : VOP3P_Real_gfx1250<0x4e>;
defm V_PK_MIN_NUM_F64 : VOP3P_Real_gfx1250<0x4f>;
+defm V_PK_LSHL_ADD_U64 : VOP3P_Real_gfx1250<0x7e>;
let AssemblerPredicate = isGFX1250Plus in
def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
diff --git a/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll
new file mode 100644
index 0000000000000..a045572322ee8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pk-lshl-add-u64.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -check-prefix=GFX1251 %s
+
+define <2 x i64> @pk_lshl_add_u64_v1v(<2 x i64> %v, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_v1v:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_mov_b32 s0, 1
+; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1251-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s0
+; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[0:3], v[8:9], v[4:7]
+; GFX1251-NEXT: s_set_pc_i64 s[30:31]
+ %shl = shl <2 x i64> %v, <i64 1, i64 1>
+ %add = add <2 x i64> %shl, %a
+ ret <2 x i64> %add
+}
+
+define <2 x i64> @pk_lshl_add_u64_v4_5v(<2 x i64> %v, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_v4_5v:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], 5, v[2:3]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], 4, v[0:1]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
+; GFX1251-NEXT: s_set_pc_i64 s[30:31]
+ %shl = shl <2 x i64> %v, <i64 4, i64 5>
+ %add = add <2 x i64> %shl, %a
+ ret <2 x i64> %add
+}
+
+define <2 x i64> @pk_lshl_add_u64_vvv(<2 x i64> %v, <2 x i64> %s, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_vvv:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v4, v[0:1]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[8:11]
+; GFX1251-NEXT: s_set_pc_i64 s[30:31]
+ %shl = shl <2 x i64> %v, %s
+ %add = add <2 x i64> %shl, %a
+ ret <2 x i64> %add
+}
+
+define amdgpu_kernel void @pk_lshl_add_u64_s2v(<2 x i64> %v) {
+; GFX1251-LABEL: pk_lshl_add_u64_s2v:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: flat_load_b128 v[0:3], v[0:1]
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_wait_xcnt 0x0
+; GFX1251-NEXT: s_mov_b32 s4, 2
+; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1251-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s4
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3]
+; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
+; GFX1251-NEXT: s_endpgm
+ %a = load <2 x i64>, ptr poison
+ %shl = shl <2 x i64> %v, <i64 2, i64 2>
+ %add = add <2 x i64> %shl, %a
+ store <2 x i64> %add, ptr poison
+ ret void
+}
+
+define amdgpu_kernel void @pk_lshl_add_u64_v2s(<2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_v2s:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: flat_load_b128 v[0:3], v[0:1]
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_wait_xcnt 0x0
+; GFX1251-NEXT: s_mov_b32 s4, 2
+; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1251-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s4
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[0:3], v[8:9], v[4:7]
+; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
+; GFX1251-NEXT: s_endpgm
+ %v = load <2 x i64>, ptr poison
+ %shl = shl <2 x i64> %v, <i64 2, i64 2>
+ %add = add <2 x i64> %shl, %a
+ store <2 x i64> %add, ptr poison
+ ret void
+}
+
+define amdgpu_kernel void @pk_lshl_add_u64_s2s(<2 x i64> %v, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_s2s:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 2
+; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
+; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
+; GFX1251-NEXT: s_endpgm
+ %shl = shl <2 x i64> %v, <i64 2, i64 2>
+ %add = add <2 x i64> %shl, %a
+ store <2 x i64> %add, ptr poison
+ ret void
+}
+
+define i32 @pk_lshl_add_u64_gep(<2 x ptr> %p, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_gep:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_mov_b32 s0, 2
+; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1251-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s0
+; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3]
+; GFX1251-NEXT: flat_load_b32 v4, v[0:1]
+; GFX1251-NEXT: flat_load_b32 v5, v[2:3]
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_xcnt 0x1
+; GFX1251-NEXT: v_add_nc_u32_e32 v0, v4, v5
+; GFX1251-NEXT: s_set_pc_i64 s[30:31]
+ %gep = getelementptr inbounds i32, <2 x ptr> %p, <2 x i64> %a
+ %gep0 = extractelement <2 x ptr> %gep, i32 0
+ %gep1 = extractelement <2 x ptr> %gep, i32 1
+ %v0 = load i32, ptr %gep0
+ %v1 = load i32, ptr %gep1
+ %v = add i32 %v0, %v1
+ ret i32 %v
+}
+
+define i32 @pk_lshl_add_u64_maybe_oob(<2 x ptr> %p, <2 x i32> %i) {
+; GFX1251-LABEL: pk_lshl_add_u64_maybe_oob:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_ashrrev_i32 v5, 31, v4
+; GFX1251-NEXT: s_mov_b32 s0, 2
+; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1251-NEXT: v_dual_mov_b32 v8, 12 :: v_dual_mov_b32 v11, s0
+; GFX1251-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_ashrrev_i32 v7, 31, v6
+; GFX1251-NEXT: v_mov_b32_e32 v9, 0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[10:11], v[0:3]
+; GFX1251-NEXT: v_dual_mov_b32 v10, v8 :: v_dual_mov_b32 v11, v9
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[8:11]
+; GFX1251-NEXT: flat_load_b32 v4, v[0:1]
+; GFX1251-NEXT: flat_load_b32 v5, v[2:3]
+; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1251-NEXT: s_wait_xcnt 0x1
+; GFX1251-NEXT: v_add_nc_u32_e32 v0, v4, v5
+; GFX1251-NEXT: s_set_pc_i64 s[30:31]
+ %idx = add nsw <2 x i32> %i, <i32 3, i32 3>
+ %gep = getelementptr i32, <2 x ptr> %p, <2 x i32> %idx
+ %gep0 = extractelement <2 x ptr> %gep, i32 0
+ %gep1 = extractelement <2 x ptr> %gep, i32 1
+ %v0 = load i32, ptr %gep0
+ %v1 = load i32, ptr %gep1
+ %v = add i32 %v0, %v1
+ ret i32 %v
+}
+
+define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_3(<2 x i64> %v, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_3:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
+; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
+; GFX1251-NEXT: s_endpgm
+ %shl = shl <2 x i64> %v, <i64 2, i64 3>
+ %add = add <2 x i64> %shl, %a
+ store <2 x i64> %add, ptr poison
+ ret void
+}
+
+; FIXME: It should be possible to use v_pk_lshl_add_u64 here,
+; but ComputeKnownBits does not understand this vector of shift amounts.
+define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_4(<2 x i64> %v, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_4:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 4
+; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
+; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
+; GFX1251-NEXT: s_endpgm
+ %shl = shl <2 x i64> %v, <i64 2, i64 4>
+ %add = add <2 x i64> %shl, %a
+ store <2 x i64> %add, ptr poison
+ ret void
+}
+
+define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_5(<2 x i64> %v, <2 x i64> %a) {
+; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_5:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 5
+; GFX1251-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX1251-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
+; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
+; GFX1251-NEXT: s_endpgm
+ %shl = shl <2 x i64> %v, <i64 2, i64 5>
+ %add = add <2 x i64> %shl, %a
+ store <2 x i64> %add, ptr poison
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll
new file mode 100644
index 0000000000000..ecbb578957232
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i64.ll
@@ -0,0 +1,736 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX1250 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX1251 %s
+
+define amdgpu_kernel void @s_shl_v2i64(ptr addrspace(1) %out, <2 x i64> %lhs, <2 x i64> %rhs) #0 {
+; GFX1250-LABEL: s_shl_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshl_b64 s[2:3], s[8:9], s12
+; GFX1250-NEXT: s_lshl_b64 s[4:5], s[10:11], s14
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_mov_b32_e32 v3, s5
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1251-LABEL: s_shl_v2i64:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_clause 0x1
+; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
+; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_mov_b32_e32 v4, 0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[8:9], s12
+; GFX1251-NEXT: s_lshl_b64 s[4:5], s[10:11], s14
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1251-NEXT: s_endpgm
+ %result = shl <2 x i64> %lhs, %rhs
+ store <2 x i64> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_shl_v2i64_s_imm(ptr addrspace(1) %out, <2 x i64> %lhs) #0 {
+; GFX1250-LABEL: s_shl_v2i64_s_imm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX1250-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[6:7]
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1251-LABEL: s_shl_v2i64_s_imm:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_clause 0x1
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
+; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_mov_b32_e32 v4, 0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[6:7]
+; GFX1251-NEXT: s_endpgm
+ %result = shl <2 x i64> %lhs, <i64 1, i64 2>
+ store <2 x i64> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_shl_v2i64_imm_s(ptr addrspace(1) %out, <2 x i64> %rhs) #0 {
+; GFX1250-LABEL: s_shl_v2i64_imm_s:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshl_b64 s[0:1], 0x4d2, s0
+; GFX1250-NEXT: s_lshl_b64 s[2:3], 0x162e, s2
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[6:7]
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1251-LABEL: s_shl_v2i64_imm_s:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_clause 0x1
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
+; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
+; GFX1251-NEXT: s_mov_b64 s[8:9], 0x4d2
+; GFX1251-NEXT: s_wait_xcnt 0x0
+; GFX1251-NEXT: s_movk_i32 s4, 0x162e
+; GFX1251-NEXT: s_mov_b32 s5, s9
+; GFX1251-NEXT: v_mov_b32_e32 v4, 0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], s0
+; GFX1251-NEXT: s_lshl_b64 s[2:3], s[4:5], s2
+; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[6:7]
+; GFX1251-NEXT: s_endpgm
+ %result = shl <2 x i64> <i64 1234, i64 5678>, %rhs
+ store <2 x i64> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: v_shl_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 scale_offset
+; GCN-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7]
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5]
+; GCN-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in.gep, i32 1
+ %a = load <2 x i64>, ptr addrspace(1) %in.gep
+ %b = load <2 x i64>, ptr addrspace(1) %b_ptr
+ %result = shl <2 x i64> %a, %b
+ store <2 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @shl_v_s_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i64> %sgpr) #0 {
+; GCN-LABEL: shl_v_s_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_load_b128 v[0:3], v4, s[10:11] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], s14, v[2:3]
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], s12, v[0:1]
+; GCN-NEXT: global_store_b128 v4, v[0:3], s[8:9] scale_offset
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep
+ %result = shl <2 x i64> %vgpr, %sgpr
+ store <2 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @shl_s_v_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i64> %sgpr) #0 {
+; GCN-LABEL: shl_s_v_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_load_b128 v[0:3], v4, s[10:11] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e64 v[2:3], v2, s[14:15]
+; GCN-NEXT: v_lshlrev_b64_e64 v[0:1], v0, s[12:13]
+; GCN-NEXT: global_store_b128 v4, v[0:3], s[8:9] scale_offset
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep
+ %result = shl <2 x i64> %sgpr, %vgpr
+ store <2 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @shl_imm_v_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GFX1250-LABEL: shl_imm_v_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b64_e64 v[2:3], v2, 8
+; GFX1250-NEXT: v_lshlrev_b64_e64 v[0:1], v0, 8
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1251-LABEL: shl_imm_v_v2i64:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GFX1251-NEXT: s_wait_xcnt 0x0
+; GFX1251-NEXT: s_mov_b64 s[2:3], 8
+; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1251-NEXT: s_mov_b32 s4, s2
+; GFX1251-NEXT: s_mov_b32 s5, s3
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_lshlrev_b64_e64 v[2:3], v2, s[4:5]
+; GFX1251-NEXT: v_lshlrev_b64_e64 v[0:1], v0, s[2:3]
+; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1251-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep
+ %result = shl <2 x i64> <i64 8, i64 8>, %vgpr
+ store <2 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @shl_v_imm_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: shl_v_imm_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], 8, v[2:3]
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], 8, v[0:1]
+; GCN-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep
+ %result = shl <2 x i64> %vgpr, <i64 8, i64 8>
+ store <2 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @v_shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: v_shl_v4i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_clause 0x3
+; GCN-NEXT: global_load_b128 v[0:3], v16, s[2:3] offset:48
+; GCN-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:32
+; GCN-NEXT: global_load_b128 v[8:11], v16, s[2:3]
+; GCN-NEXT: global_load_b128 v[12:15], v16, s[2:3] offset:16
+; GCN-NEXT: s_wait_loadcnt 0x1
+; GCN-NEXT: v_lshlrev_b64_e32 v[6:7], v6, v[10:11]
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[14:15]
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[12:13]
+; GCN-NEXT: v_lshlrev_b64_e32 v[4:5], v4, v[8:9]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:16
+; GCN-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in.gep, i32 1
+ %a = load <4 x i64>, ptr addrspace(1) %in.gep
+ %b = load <4 x i64>, ptr addrspace(1) %b_ptr
+ %result = shl <4 x i64> %a, %b
+ store <4 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @shl_v_imm_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: shl_v_imm_v4i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GCN-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GCN-NEXT: s_wait_loadcnt 0x1
+; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], 8, v[2:3]
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[6:7], 8, v[6:7]
+; GCN-NEXT: v_lshlrev_b64_e32 v[4:5], 8, v[4:5]
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], 8, v[0:1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %vgpr = load <4 x i64>, ptr addrspace(1) %in.gep
+ %result = shl <4 x i64> %vgpr, <i64 8, i64 8, i64 8, i64 8>
+ store <4 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @v_shl_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GCN-LABEL: v_shl_v8i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 6, v0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_clause 0x7
+; GCN-NEXT: global_load_b128 v[0:3], v32, s[2:3]
+; GCN-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16
+; GCN-NEXT: global_load_b128 v[8:11], v32, s[2:3] offset:64
+; GCN-NEXT: global_load_b128 v[12:15], v32, s[2:3] offset:80
+; GCN-NEXT: global_load_b128 v[16:19], v32, s[2:3] offset:32
+; GCN-NEXT: global_load_b128 v[20:23], v32, s[2:3] offset:48
+; GCN-NEXT: global_load_b128 v[24:27], v32, s[2:3] offset:112
+; GCN-NEXT: global_load_b128 v[28:31], v32, s[2:3] offset:96
+; GCN-NEXT: s_wait_loadcnt 0x5
+; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3]
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
+; GCN-NEXT: s_wait_loadcnt 0x4
+; GCN-NEXT: v_lshlrev_b64_e32 v[6:7], v14, v[6:7]
+; GCN-NEXT: v_lshlrev_b64_e32 v[4:5], v12, v[4:5]
+; GCN-NEXT: s_wait_loadcnt 0x1
+; GCN-NEXT: v_lshlrev_b64_e32 v[22:23], v26, v[22:23]
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[18:19], v30, v[18:19]
+; GCN-NEXT: v_lshlrev_b64_e32 v[16:17], v28, v[16:17]
+; GCN-NEXT: v_lshlrev_b64_e32 v[20:21], v24, v[20:21]
+; GCN-NEXT: s_clause 0x3
+; GCN-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:32
+; GCN-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48
+; GCN-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GCN-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <8 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <8 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %b_ptr = getelementptr <8 x i64>, ptr addrspace(1) %in.gep, i32 1
+ %a = load <8 x i64>, ptr addrspace(1) %in.gep
+ %b = load <8 x i64>, ptr addrspace(1) %b_ptr
+ %result = shl <8 x i64> %a, %b
+ store <8 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @v_shl_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GFX1250-LABEL: v_shl_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v52, 7, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0xc
+; GFX1250-NEXT: global_load_b128 v[0:3], v52, s[2:3] offset:144
+; GFX1250-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:16
+; GFX1250-NEXT: global_load_b128 v[8:11], v52, s[2:3]
+; GFX1250-NEXT: global_load_b128 v[12:15], v52, s[2:3] offset:128
+; GFX1250-NEXT: global_load_b128 v[16:19], v52, s[2:3] offset:64
+; GFX1250-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:192
+; GFX1250-NEXT: global_load_b128 v[24:27], v52, s[2:3] offset:96
+; GFX1250-NEXT: global_load_b128 v[28:31], v52, s[2:3] offset:112
+; GFX1250-NEXT: global_load_b128 v[32:35], v52, s[2:3] offset:80
+; GFX1250-NEXT: global_load_b128 v[36:39], v52, s[2:3] offset:32
+; GFX1250-NEXT: global_load_b128 v[40:43], v52, s[2:3] offset:48
+; GFX1250-NEXT: global_load_b128 v[44:47], v52, s[2:3] offset:224
+; GFX1250-NEXT: global_load_b128 v[48:51], v52, s[2:3] offset:240
+; GFX1250-NEXT: s_wait_loadcnt 0xb
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5]
+; GFX1250-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:208
+; GFX1250-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[10:11], v14, v[10:11]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[8:9], v12, v[8:9]
+; GFX1250-NEXT: global_load_b128 v[12:15], v52, s[2:3] offset:176
+; GFX1250-NEXT: s_wait_loadcnt 0x9
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[18:19], v22, v[18:19]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[16:17], v20, v[16:17]
+; GFX1250-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:160
+; GFX1250-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[26:27], v46, v[26:27]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[24:25], v44, v[24:25]
+; GFX1250-NEXT: s_wait_loadcnt 0x3
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[30:31], v50, v[30:31]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[28:29], v48, v[28:29]
+; GFX1250-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v6, v[34:35]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v4, v[32:33]
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[14:15], v14, v[42:43]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[12:13], v12, v[40:41]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[22:23], v22, v[38:39]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[20:21], v20, v[36:37]
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v52, v[24:27], s[0:1] offset:96
+; GFX1250-NEXT: global_store_b128 v52, v[28:31], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v52, v[16:19], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v52, v[4:7], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v52, v[20:23], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v52, v[12:15], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v52, v[8:11], s[0:1]
+; GFX1250-NEXT: global_store_b128 v52, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1251-LABEL: v_shl_v16i64:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_lshlrev_b32_e32 v52, 7, v0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_clause 0xb
+; GFX1251-NEXT: global_load_b128 v[0:3], v52, s[2:3] offset:144
+; GFX1251-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:16
+; GFX1251-NEXT: global_load_b128 v[8:11], v52, s[2:3]
+; GFX1251-NEXT: global_load_b128 v[12:15], v52, s[2:3] offset:128
+; GFX1251-NEXT: global_load_b128 v[16:19], v52, s[2:3] offset:176
+; GFX1251-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:48
+; GFX1251-NEXT: global_load_b128 v[24:27], v52, s[2:3] offset:32
+; GFX1251-NEXT: global_load_b128 v[28:31], v52, s[2:3] offset:160
+; GFX1251-NEXT: global_load_b128 v[32:35], v52, s[2:3] offset:96
+; GFX1251-NEXT: global_load_b128 v[36:39], v52, s[2:3] offset:112
+; GFX1251-NEXT: global_load_b128 v[40:43], v52, s[2:3] offset:64
+; GFX1251-NEXT: global_load_b128 v[44:47], v52, s[2:3] offset:80
+; GFX1251-NEXT: s_wait_loadcnt 0xa
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5]
+; GFX1251-NEXT: s_clause 0x1
+; GFX1251-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:224
+; GFX1251-NEXT: global_load_b128 v[48:51], v52, s[2:3] offset:240
+; GFX1251-NEXT: s_wait_loadcnt 0xa
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[10:11], v14, v[10:11]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[8:9], v12, v[8:9]
+; GFX1251-NEXT: s_wait_loadcnt 0x8
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[14:15], v18, v[22:23]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[12:13], v16, v[20:21]
+; GFX1251-NEXT: s_clause 0x1
+; GFX1251-NEXT: global_load_b128 v[16:19], v52, s[2:3] offset:208
+; GFX1251-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:192
+; GFX1251-NEXT: s_wait_loadcnt 0x8
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[26:27], v30, v[26:27]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[24:25], v28, v[24:25]
+; GFX1251-NEXT: s_wait_loadcnt 0x3
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[6:7], v6, v[34:35]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[4:5], v4, v[32:33]
+; GFX1251-NEXT: s_wait_loadcnt 0x2
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[38:39], v50, v[38:39]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[36:37], v48, v[36:37]
+; GFX1251-NEXT: s_wait_loadcnt 0x1
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[18:19], v18, v[46:47]
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[22:23], v22, v[42:43]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[20:21], v20, v[40:41]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[16:17], v16, v[44:45]
+; GFX1251-NEXT: s_clause 0x7
+; GFX1251-NEXT: global_store_b128 v52, v[4:7], s[0:1] offset:96
+; GFX1251-NEXT: global_store_b128 v52, v[36:39], s[0:1] offset:112
+; GFX1251-NEXT: global_store_b128 v52, v[20:23], s[0:1] offset:64
+; GFX1251-NEXT: global_store_b128 v52, v[16:19], s[0:1] offset:80
+; GFX1251-NEXT: global_store_b128 v52, v[24:27], s[0:1] offset:32
+; GFX1251-NEXT: global_store_b128 v52, v[12:15], s[0:1] offset:48
+; GFX1251-NEXT: global_store_b128 v52, v[8:11], s[0:1]
+; GFX1251-NEXT: global_store_b128 v52, v[0:3], s[0:1] offset:16
+; GFX1251-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <16 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <16 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %b_ptr = getelementptr <16 x i64>, ptr addrspace(1) %in.gep, i32 1
+ %a = load <16 x i64>, ptr addrspace(1) %in.gep
+ %b = load <16 x i64>, ptr addrspace(1) %b_ptr
+ %result = shl <16 x i64> %a, %b
+ store <16 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+define amdgpu_kernel void @v_shl_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GFX1250-LABEL: v_shl_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 8, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x13
+; GFX1250-NEXT: global_load_b128 v[0:3], v24, s[2:3] offset:400
+; GFX1250-NEXT: global_load_b128 v[4:7], v24, s[2:3] offset:144
+; GFX1250-NEXT: global_load_b128 v[8:11], v24, s[2:3] offset:128
+; GFX1250-NEXT: global_load_b128 v[12:15], v24, s[2:3] offset:384
+; GFX1250-NEXT: global_load_b128 v[16:19], v24, s[2:3] offset:432
+; GFX1250-NEXT: global_load_b128 v[20:23], v24, s[2:3] offset:176
+; GFX1250-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:160
+; GFX1250-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:416
+; GFX1250-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:464
+; GFX1250-NEXT: global_load_b128 v[38:41], v24, s[2:3] offset:208
+; GFX1250-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:192
+; GFX1250-NEXT: global_load_b128 v[46:49], v24, s[2:3] offset:448
+; GFX1250-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:496
+; GFX1250-NEXT: global_load_b128 v[54:57], v24, s[2:3] offset:240
+; GFX1250-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:224
+; GFX1250-NEXT: global_load_b128 v[62:65], v24, s[2:3] offset:480
+; GFX1250-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:16
+; GFX1250-NEXT: global_load_b128 v[70:73], v24, s[2:3]
+; GFX1250-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:272
+; GFX1250-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:256
+; GFX1250-NEXT: s_wait_loadcnt 0x12
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5]
+; GFX1250-NEXT: s_wait_loadcnt 0x10
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v14, v[10:11]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v12, v[8:9]
+; GFX1250-NEXT: s_wait_loadcnt 0xe
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[10:11], v18, v[22:23]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[8:9], v16, v[20:21]
+; GFX1250-NEXT: s_wait_loadcnt 0xc
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[14:15], v32, v[28:29]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[12:13], v30, v[26:27]
+; GFX1250-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0xb
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[18:19], v36, v[40:41]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[16:17], v34, v[38:39]
+; GFX1250-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:288
+; GFX1250-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[22:23], v48, v[44:45]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[20:21], v46, v[42:43]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:320
+; GFX1250-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:64
+; GFX1250-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[40:41], v52, v[56:57]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[38:39], v50, v[54:55]
+; GFX1250-NEXT: s_wait_loadcnt 0x8
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[48:49], v64, v[60:61]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[46:47], v62, v[58:59]
+; GFX1250-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:80
+; GFX1250-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[56:57], v76, v[68:69]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[54:55], v74, v[66:67]
+; GFX1250-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:48
+; GFX1250-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[64:65], v80, v[72:73]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[62:63], v78, v[70:71]
+; GFX1250-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:304
+; GFX1250-NEXT: s_wait_loadcnt 0x5
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[28:29], v32, v[28:29]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[26:27], v30, v[26:27]
+; GFX1250-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:336
+; GFX1250-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[36:37], v36, v[44:45]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[34:35], v34, v[42:43]
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:352
+; GFX1250-NEXT: global_load_b128 v[70:73], v24, s[2:3] offset:96
+; GFX1250-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:112
+; GFX1250-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:368
+; GFX1250-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[32:33], v32, v[52:53]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[30:31], v30, v[50:51]
+; GFX1250-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[44:45], v44, v[72:73]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[42:43], v42, v[70:71]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[72:73], v80, v[76:77]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[70:71], v78, v[74:75]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[52:53], v68, v[60:61]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[50:51], v66, v[58:59]
+; GFX1250-NEXT: s_clause 0xf
+; GFX1250-NEXT: global_store_b128 v24, v[42:45], s[0:1] offset:96
+; GFX1250-NEXT: global_store_b128 v24, v[70:73], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[34:37], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[30:33], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[26:29], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[50:53], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[62:65], s[0:1]
+; GFX1250-NEXT: global_store_b128 v24, v[54:57], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[46:49], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[38:41], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:192
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:144
+; GFX1250-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1251-LABEL: v_shl_v32i64:
+; GFX1251: ; %bb.0:
+; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
+; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1251-NEXT: v_lshlrev_b32_e32 v24, 8, v0
+; GFX1251-NEXT: s_wait_kmcnt 0x0
+; GFX1251-NEXT: s_clause 0x13
+; GFX1251-NEXT: global_load_b128 v[0:3], v24, s[2:3] offset:400
+; GFX1251-NEXT: global_load_b128 v[4:7], v24, s[2:3] offset:144
+; GFX1251-NEXT: global_load_b128 v[8:11], v24, s[2:3] offset:128
+; GFX1251-NEXT: global_load_b128 v[12:15], v24, s[2:3] offset:384
+; GFX1251-NEXT: global_load_b128 v[16:19], v24, s[2:3] offset:432
+; GFX1251-NEXT: global_load_b128 v[20:23], v24, s[2:3] offset:176
+; GFX1251-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:160
+; GFX1251-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:416
+; GFX1251-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:464
+; GFX1251-NEXT: global_load_b128 v[38:41], v24, s[2:3] offset:208
+; GFX1251-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:192
+; GFX1251-NEXT: global_load_b128 v[46:49], v24, s[2:3] offset:448
+; GFX1251-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:496
+; GFX1251-NEXT: global_load_b128 v[54:57], v24, s[2:3] offset:240
+; GFX1251-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:224
+; GFX1251-NEXT: global_load_b128 v[62:65], v24, s[2:3] offset:480
+; GFX1251-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:16
+; GFX1251-NEXT: global_load_b128 v[70:73], v24, s[2:3]
+; GFX1251-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:272
+; GFX1251-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:256
+; GFX1251-NEXT: s_wait_loadcnt 0x12
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5]
+; GFX1251-NEXT: s_wait_loadcnt 0x10
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[6:7], v14, v[10:11]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[4:5], v12, v[8:9]
+; GFX1251-NEXT: s_wait_loadcnt 0xe
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[10:11], v18, v[22:23]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[8:9], v16, v[20:21]
+; GFX1251-NEXT: s_wait_loadcnt 0xc
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[14:15], v32, v[28:29]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[12:13], v30, v[26:27]
+; GFX1251-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:48
+; GFX1251-NEXT: s_wait_loadcnt 0xb
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[18:19], v36, v[40:41]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[16:17], v34, v[38:39]
+; GFX1251-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:304
+; GFX1251-NEXT: s_wait_loadcnt 0xa
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[22:23], v48, v[44:45]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[20:21], v46, v[42:43]
+; GFX1251-NEXT: s_clause 0x1
+; GFX1251-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:80
+; GFX1251-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:336
+; GFX1251-NEXT: s_wait_loadcnt 0xa
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[40:41], v52, v[56:57]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[38:39], v50, v[54:55]
+; GFX1251-NEXT: s_wait_loadcnt 0x8
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[48:49], v64, v[60:61]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[46:47], v62, v[58:59]
+; GFX1251-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:64
+; GFX1251-NEXT: s_wait_loadcnt 0x6
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[56:57], v76, v[68:69]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[54:55], v74, v[66:67]
+; GFX1251-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:32
+; GFX1251-NEXT: s_wait_loadcnt 0x6
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[64:65], v80, v[72:73]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[62:63], v78, v[70:71]
+; GFX1251-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:288
+; GFX1251-NEXT: s_wait_loadcnt 0x5
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[28:29], v32, v[28:29]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[26:27], v30, v[26:27]
+; GFX1251-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:320
+; GFX1251-NEXT: s_wait_loadcnt 0x4
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[36:37], v44, v[36:37]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[34:35], v42, v[34:35]
+; GFX1251-NEXT: s_clause 0x3
+; GFX1251-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:368
+; GFX1251-NEXT: global_load_b128 v[70:73], v24, s[2:3] offset:112
+; GFX1251-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:96
+; GFX1251-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:352
+; GFX1251-NEXT: s_wait_loadcnt 0x4
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[32:33], v32, v[52:53]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[30:31], v30, v[50:51]
+; GFX1251-NEXT: s_wait_loadcnt 0x2
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[44:45], v44, v[72:73]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[42:43], v42, v[70:71]
+; GFX1251-NEXT: s_wait_loadcnt 0x0
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[72:73], v80, v[76:77]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[70:71], v78, v[74:75]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[52:53], v68, v[60:61]
+; GFX1251-NEXT: v_lshlrev_b64_e32 v[50:51], v66, v[58:59]
+; GFX1251-NEXT: s_clause 0xf
+; GFX1251-NEXT: global_store_b128 v24, v[70:73], s[0:1] offset:96
+; GFX1251-NEXT: global_store_b128 v24, v[42:45], s[0:1] offset:112
+; GFX1251-NEXT: global_store_b128 v24, v[30:33], s[0:1] offset:64
+; GFX1251-NEXT: global_store_b128 v24, v[34:37], s[0:1] offset:80
+; GFX1251-NEXT: global_store_b128 v24, v[50:53], s[0:1] offset:32
+; GFX1251-NEXT: global_store_b128 v24, v[26:29], s[0:1] offset:48
+; GFX1251-NEXT: global_store_b128 v24, v[62:65], s[0:1]
+; GFX1251-NEXT: global_store_b128 v24, v[54:57], s[0:1] offset:16
+; GFX1251-NEXT: global_store_b128 v24, v[46:49], s[0:1] offset:224
+; GFX1251-NEXT: global_store_b128 v24, v[38:41], s[0:1] offset:240
+; GFX1251-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:192
+; GFX1251-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:208
+; GFX1251-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:160
+; GFX1251-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:176
+; GFX1251-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:128
+; GFX1251-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:144
+; GFX1251-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1251-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <32 x i64>, ptr addrspace(1) %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <32 x i64>, ptr addrspace(1) %out, i64 %tid.ext
+ %b_ptr = getelementptr <32 x i64>, ptr addrspace(1) %in.gep, i32 1
+ %a = load <32 x i64>, ptr addrspace(1) %in.gep
+ %b = load <32 x i64>, ptr addrspace(1) %b_ptr
+ %result = shl <32 x i64> %a, %b
+ store <32 x i64> %result, ptr addrspace(1) %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/MC/AMDGPU/gfx1251_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx1251_asm_vop3p.s
index 24ffbb59c985d..159643e6b78c8 100644
--- a/llvm/test/MC/AMDGPU/gfx1251_asm_vop3p.s
+++ b/llvm/test/MC/AMDGPU/gfx1251_asm_vop3p.s
@@ -349,3 +349,55 @@ v_pk_sub_nc_u64 v[4:7], 101, v[8:11]
v_pk_sub_nc_u64 v[4:7], v[8:11], 101
// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX1251: v_pk_sub_nc_u64 v[4:7], v[8:11], 0x65 ; encoding: [0x04,0x40,0x4d,0xcc,0x08,0xff,0x01,0x1a,0x65,0x00,0x00,0x00]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x42,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], s[8:11], s[12:13], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], s[8:11], s[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x18,0x40,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], s[12:13], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], s[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x40,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], s[8:11], v[12:13], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], s[8:11], v[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x18,0x42,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], null, v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], null, v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0xf9,0x40,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], 1, v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], 1, v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x03,0x41,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], 1, v[8:9], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], 1, v[8:9], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x81,0x10,0x42,0x1c]
+
+v_pk_lshl_add_u64 v[4:7], 101, v[8:9], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], 0x65, v[8:9], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0xff,0x10,0x42,0x1c,0x65,0x00,0x00,0x00]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], 101, v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], 0x65, v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0xff,0x41,0x1c,0x65,0x00,0x00,0x00]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], s[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], s[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x42,0x18]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], null
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], null ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0xf2,0x19]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], 1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], 1 ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x06,0x1a]
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[16:17], 101
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[16:17], 0x65 ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x21,0xfe,0x1b,0x65,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1251_err.s b/llvm/test/MC/AMDGPU/gfx1251_err.s
index 63b5cc5b09c2c..4a0e99ba6b0fd 100644
--- a/llvm/test/MC/AMDGPU/gfx1251_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1251_err.s
@@ -252,3 +252,37 @@ v_pk_sub_nc_u64 v[4:7], v[8:11], v[12:15] op_sel_hi:[1,0]
v_pk_sub_nc_u64 v[4:7], v[5:8], null
// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
// GFX1251-ERR: v_pk_sub_nc_u64 v[4:7], v[5:8], null
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] row_share:2
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] row_share:2
+// GFX1251-ERR: ^
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[16:17], lit64(0x12345678a)
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[8:11], v[16:17], lit64(0x12345678a)
+// GFX1251-ERR: ^
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] op_sel:[1,0,0]
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] op_sel:[1,0,0]
+// GFX1251-ERR: ^
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] op_sel_hi:[1,0,0]
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] op_sel_hi:[1,0,0]
+// GFX1251-ERR: ^
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] clamp
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] clamp
+// GFX1251-ERR: ^
+
+v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] neg_lo:[1,0,0]
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] neg_lo:[1,0,0]
+// GFX1251-ERR: ^
+
+v_pk_lshl_add_u64 v[4:7], v[17:20], v[6:7], null
+// GFX1251-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+// GFX1251-ERR: v_pk_lshl_add_u64 v[4:7], v[17:20], v[6:7], null
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1251_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1251_dasm_vop3p.txt
index 51ec116464398..4d19ed09f9b6e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1251_dasm_vop3p.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1251_dasm_vop3p.txt
@@ -261,3 +261,42 @@
0x04,0x40,0x4d,0xcc,0x08,0xff,0x01,0x18,0x65,0x00,0x00,0x00
# GFX1251: v_pk_sub_nc_u64 v[4:7], v[8:11], 0x65 ; encoding: [0x04,0x40,0x4d,0xcc,0x08,0xff,0x01,0x1a,0x65,0x00,0x00,0x00]
+
+0x04,0x40,0x7e,0xcc,0x08,0x19,0x42,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x42,0x1c]
+
+0x04,0x40,0x7e,0xcc,0x08,0x18,0x40,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], s[8:11], s[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x18,0x40,0x1c]
+
+0x04,0x40,0x7e,0xcc,0x08,0x19,0x40,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], s[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x40,0x1c]
+
+0x04,0x40,0x7e,0xcc,0x08,0x18,0x42,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], s[8:11], v[12:13], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x18,0x42,0x1c]
+
+0x04,0x40,0x7e,0xcc,0x08,0xf9,0x40,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], null, v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0xf9,0x40,0x1c]
+
+0x04,0x40,0x7e,0xcc,0x08,0x03,0x41,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], 1, v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x03,0x41,0x1c]
+
+0x04,0x40,0x7e,0xcc,0x81,0x10,0x42,0x1c
+# GFX1251: v_pk_lshl_add_u64 v[4:7], 1, v[8:9], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x81,0x10,0x42,0x1c]
+
+0x04,0x40,0x7e,0xcc,0xff,0x10,0x42,0x1c,0x65,0x00,0x00,0x00
+# GFX1251: v_pk_lshl_add_u64 v[4:7], 0x65, v[8:9], v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0xff,0x10,0x42,0x1c,0x65,0x00,0x00,0x00]
+
+0x04,0x40,0x7e,0xcc,0x08,0xff,0x41,0x1c,0x65,0x00,0x00,0x00
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], 0x65, v[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0xff,0x41,0x1c,0x65,0x00,0x00,0x00]
+
+0x04,0x40,0x7e,0xcc,0x08,0x19,0x42,0x18
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], s[16:19] ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x42,0x18]
+
+0x04,0x40,0x7e,0xcc,0x08,0x19,0xf2,0x19
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], null ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0xf2,0x19]
+
+0x04,0x40,0x7e,0xcc,0x08,0x19,0x06,0x1a
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[12:13], 1 ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x19,0x06,0x1a]
+
+0x04,0x40,0x7e,0xcc,0x08,0x21,0xfe,0x1b,0x65,0x00,0x00,0x00
+# GFX1251: v_pk_lshl_add_u64 v[4:7], v[8:11], v[16:17], 0x65 ; encoding: [0x04,0x40,0x7e,0xcc,0x08,0x21,0xfe,0x1b,0x65,0x00,0x00,0x00]
More information about the llvm-branch-commits
mailing list