[llvm] e7b362d - [AMDGPU] Add v_mov_b64 gfx940 opcode
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 7 12:07:20 PST 2022
Author: Stanislav Mekhanoshin
Date: 2022-03-07T12:07:12-08:00
New Revision: e7b362d75d2a3fdf67550b88738c708a33eec3cc
URL: https://github.com/llvm/llvm-project/commit/e7b362d75d2a3fdf67550b88738c708a33eec3cc
DIFF: https://github.com/llvm/llvm-project/commit/e7b362d75d2a3fdf67550b88738c708a33eec3cc.diff
LOG: [AMDGPU] Add v_mov_b64 gfx940 opcode
Differential Revision: https://reviews.llvm.org/D121023
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/VOP1Instructions.td
llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
llvm/test/MC/AMDGPU/gfx940_asm_features.s
llvm/test/MC/AMDGPU/gfx940_err.s
llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index bf90007afddd3..df9b2c8b6e9f3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -892,6 +892,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasMadF16() const;
+ bool hasMovB64() const { return GFX940Insts; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 034358319b190..f2b835d95c774 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -490,6 +490,8 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_e32:
+ case AMDGPU::V_MOV_B64_e64:
// Do not fold into an indirect mov.
return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9076f9e60296b..87e0a93efea71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -908,6 +908,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
+ if (ST.hasMovB64()) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
if (ST.hasPackedFP32Ops()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
.addImm(SISrcMods::OP_SEL_1)
@@ -951,7 +956,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
(RI.isProperlyAlignedRC(*RC) &&
(SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
// TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
- if (ST.hasPackedFP32Ops()) {
+ if (ST.hasMovB64()) {
+ Opcode = AMDGPU::V_MOV_B64_e32;
+ EltSize = 8;
+ } else if (ST.hasPackedFP32Ops()) {
Opcode = AMDGPU::V_PK_MOV_B32;
EltSize = 8;
}
@@ -1833,6 +1841,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
+ if (ST.hasMovB64()) {
+ MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+ if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm()))
+ break;
+ }
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
@@ -2823,6 +2836,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_e32:
+ case AMDGPU::V_MOV_B64_e64:
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 0ba6139d4e3d0..a151699cc66a6 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,8 @@ class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
// We only want to set this on the basic, non-SDWA or DPP forms.
- defvar should_mov_imm = !eq(opName, "v_mov_b32");
+ defvar should_mov_imm = !or(!eq(opName, "v_mov_b32"),
+ !eq(opName, "v_mov_b64"));
let isMoveImm = should_mov_imm in {
def _e32 : VOP1_Pseudo <opName, P>;
@@ -170,6 +171,9 @@ defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
+
+let SubtargetPredicate = isGFX940Plus in
+defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1
// FIXME: Specify SchedRW for READFIRSTLANE_B32
@@ -949,6 +953,9 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
+let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
+defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
+
//===----------------------------------------------------------------------===//
// GFX10
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
index 4dcf488c31b8c..21e963f507e3f 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s
+# RUN: llc -march=amdgcn -mcpu=gfx940 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX940 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
---
@@ -16,6 +17,9 @@ body: |
; GFX90A-LABEL: name: copy_v64_to_v64
; GFX90A: liveins: $vgpr2_vgpr3
; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_v64_to_v64
+ ; GFX940: liveins: $vgpr2_vgpr3
+ ; GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec
; GFX10-LABEL: name: copy_v64_to_v64
; GFX10: liveins: $vgpr2_vgpr3
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
@@ -36,6 +40,9 @@ body: |
; GFX90A-LABEL: name: copy_s64_to_v64
; GFX90A: liveins: $sgpr2_sgpr3
; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_s64_to_v64
+ ; GFX940: liveins: $sgpr2_sgpr3
+ ; GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr2_sgpr3, implicit $exec, implicit $exec
; GFX10-LABEL: name: copy_s64_to_v64
; GFX10: liveins: $sgpr2_sgpr3
; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
@@ -57,6 +64,10 @@ body: |
; GFX90A: liveins: $agpr2_agpr3
; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_a64_to_v64
+ ; GFX940: liveins: $agpr2_agpr3
+ ; GFX940: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+ ; GFX940: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
; GFX10-LABEL: name: copy_a64_to_v64
; GFX10: liveins: $agpr2_agpr3
; GFX10: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
@@ -80,6 +91,10 @@ body: |
; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+ ; GFX940-LABEL: name: copy_v128_to_v128_fwd
+ ; GFX940: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
+ ; GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+ ; GFX940: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
; GFX10-LABEL: name: copy_v128_to_v128_fwd
; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
@@ -105,6 +120,10 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_v128_to_v128_back
+ ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX940: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX940: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; GFX10-LABEL: name: copy_v128_to_v128_back
; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
@@ -130,6 +149,11 @@ body: |
; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
+ ; GFX940-LABEL: name: copy_v96_to_v96
+ ; GFX940: liveins: $vgpr4_vgpr5_vgpr6
+ ; GFX940: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
; GFX10-LABEL: name: copy_v96_to_v96
; GFX10: liveins: $vgpr4_vgpr5_vgpr6
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
@@ -151,6 +175,9 @@ body: |
; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX90A: liveins: $vgpr3
; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_v64_to_v64_undef_sub0
+ ; GFX940: liveins: $vgpr3
+ ; GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec
; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX10: liveins: $vgpr3
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
@@ -171,6 +198,9 @@ body: |
; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX90A: liveins: $vgpr2
; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_v64_to_v64_undef_sub1
+ ; GFX940: liveins: $vgpr2
+ ; GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec
; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX10: liveins: $vgpr2
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
@@ -194,6 +224,10 @@ body: |
; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX940-LABEL: name: copy_s128_to_v128_killed
+ ; GFX940: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX940: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-LABEL: name: copy_s128_to_v128_killed
; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
@@ -217,6 +251,10 @@ body: |
; GFX90A: liveins: $vgpr2_vgpr3
; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
+ ; GFX940-LABEL: name: copy_v64_to_v64_unaligned
+ ; GFX940: liveins: $vgpr2_vgpr3
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
; GFX10-LABEL: name: copy_v64_to_v64_unaligned
; GFX10: liveins: $vgpr2_vgpr3
; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
@@ -238,6 +276,10 @@ body: |
; GFX90A: liveins: $vgpr3_vgpr4
; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
+ ; GFX940-LABEL: name: copy_v64_unaligned_to_v64
+ ; GFX940: liveins: $vgpr3_vgpr4
+ ; GFX940: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
; GFX10-LABEL: name: copy_v64_unaligned_to_v64
; GFX10: liveins: $vgpr3_vgpr4
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
@@ -263,6 +305,12 @@ body: |
; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
+ ; GFX940-LABEL: name: copy_v128_to_v128_unaligned
+ ; GFX940: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX940: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX940: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
; GFX10-LABEL: name: copy_v128_to_v128_unaligned
; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11
@@ -290,6 +338,12 @@ body: |
; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
+ ; GFX940-LABEL: name: copy_v128_unaligned_to_v128
+ ; GFX940: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
; GFX10-LABEL: name: copy_v128_unaligned_to_v128
; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10
@@ -313,6 +367,10 @@ body: |
; GFX90A: liveins: $sgpr8_sgpr9
; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
+ ; GFX940-LABEL: name: copy_s64_to_v64_unaligned
+ ; GFX940: liveins: $sgpr8_sgpr9
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
; GFX10-LABEL: name: copy_s64_to_v64_unaligned
; GFX10: liveins: $sgpr8_sgpr9
; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
@@ -338,6 +396,12 @@ body: |
; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
+ ; GFX940-LABEL: name: copy_s128_to_v128_unaligned
+ ; GFX940: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX940: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX940: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
; GFX10-LABEL: name: copy_s128_to_v128_unaligned
; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11
@@ -363,6 +427,11 @@ body: |
; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
+ ; GFX940-LABEL: name: copy_v96_to_v96_unaligned
+ ; GFX940: liveins: $vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
+ ; GFX940: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
; GFX10-LABEL: name: copy_v96_to_v96_unaligned
; GFX10: liveins: $vgpr8_vgpr9_vgpr10
; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
@@ -387,6 +456,11 @@ body: |
; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
+ ; GFX940-LABEL: name: copy_v96_unaligned_to_v96
+ ; GFX940: liveins: $vgpr7_vgpr8_vgpr9
+ ; GFX940: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
; GFX10-LABEL: name: copy_v96_unaligned_to_v96
; GFX10: liveins: $vgpr7_vgpr8_vgpr9
; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
@@ -411,6 +485,11 @@ body: |
; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ; GFX940-LABEL: name: copy_s96_to_v96
+ ; GFX940: liveins: $sgpr0_sgpr1_sgpr2
+ ; GFX940: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
; GFX10-LABEL: name: copy_s96_to_v96
; GFX10: liveins: $sgpr0_sgpr1_sgpr2
; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
@@ -435,6 +514,11 @@ body: |
; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ; GFX940-LABEL: name: copy_s96_to_v96_unaligned
+ ; GFX940: liveins: $sgpr0_sgpr1_sgpr2
+ ; GFX940: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
; GFX10-LABEL: name: copy_s96_to_v96_unaligned
; GFX10: liveins: $sgpr0_sgpr1_sgpr2
; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 1fc72422edf57..bb4f5a8bc305c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -1,10 +1,12 @@
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX900 %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX90A %s
+# RUN: llc -march=amdgcn -mcpu=gfx940 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX940 %s
# GCN-LABEL: name: v_mov_b64_from_vgpr
# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec
+# GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec
name: v_mov_b64_from_vgpr
body: |
bb.0:
@@ -15,6 +17,7 @@ body: |
# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+# GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr2_sgpr3, implicit $exec
name: v_mov_b64_from_sgpr
body: |
bb.0:
@@ -26,6 +29,7 @@ body: |
# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 -2, implicit $exec
name: v_mov_b64_from_sext_inline_imm
body: |
bb.0:
@@ -63,6 +67,7 @@ body: |
# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, -1, 8, -1, 0, 0, 0, 0, 0, implicit $exec
+# GFX940: $vgpr0_vgpr1 = V_MOV_B64_e32 -1, implicit $exec
name: v_mov_b64_from_same_sext_inline_imm
body: |
bb.0:
@@ -73,6 +78,7 @@ body: |
# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1
# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
+# GFX940: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
name: v_mov_b64_from_same_fp_inline_imm
body: |
bb.0:
diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index 01fa9e1319c39..f70662bb6cb83 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -33,6 +33,26 @@ s_load_dword s2, s[2:3], 0x0 glc
// GFX940: buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 ; encoding: [0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03]
buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_mov_b64_e32 v[2:3], v[4:5] ; encoding: [0x04,0x71,0x04,0x7e]
+v_mov_b64 v[2:3], v[4:5]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_mov_b64_dpp v[2:3], v[4:5] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x04,0x7e,0x04,0x51,0x01,0xff]
+v_mov_b64 v[2:3], v[4:5] row_newbcast:1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_mov_b64_e32 v[2:3], s[4:5] ; encoding: [0x04,0x70,0x04,0x7e]
+v_mov_b64 v[2:3], s[4:5]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_mov_b64_e32 v[2:3], 1 ; encoding: [0x81,0x70,0x04,0x7e]
+v_mov_b64 v[2:3], 1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_mov_b64_e32 v[2:3], 0x64 ; encoding: [0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00]
+v_mov_b64 v[2:3], 0x64
+
// NOT-GFX940: error: invalid operand for instruction
// GFX940: buffer_atomic_swap v5, off, s[8:11], s3 sc0 ; encoding: [0x00,0x40,0x00,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_swap v5, off, s[8:11], s3 sc0
diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s
index 1b2297f263f5d..b5ca08d381aad 100644
--- a/llvm/test/MC/AMDGPU/gfx940_err.s
+++ b/llvm/test/MC/AMDGPU/gfx940_err.s
@@ -16,6 +16,21 @@ v_madmk_f32 v0, v1, 0, v2
v_mad_legacy_f32 v0, v1, v2, v3
// GFX940: error: instruction not supported on this GPU
+v_mov_b64 v[2:3], v[4:5] row_shl:1
+// GFX940: error: 64 bit dpp only supports row_newbcast
+
+v_mov_b64 v[2:3], -v[4:5]
+// GFX940: error: not a valid operand.
+
+v_mov_b64 v[2:3], |v[4:5]|
+// GFX940: error: not a valid operand.
+
+v_mov_b64 v[2:3], v[4:5] dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// GFX940: error: not a valid operand.
+
+v_mov_b64_sdwa v[2:3], v[4:5]
+// GFX940: error: sdwa variant of this instruction is not supported
+
global_load_dword v2, v[2:3], off glc
// GFX940: error: invalid operand for instruction
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
index 70b877ec91ab3..18dfff0df4e32 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
@@ -15,6 +15,21 @@
# GFX940: buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 ; encoding: [0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03]
0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03
+# GFX940: v_mov_b64_e32 v[2:3], v[4:5] ; encoding: [0x04,0x71,0x04,0x7e]
+0x04,0x71,0x04,0x7e
+
+# GFX940: v_mov_b64_dpp v[2:3], v[4:5] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x04,0x7e,0x04,0x51,0x01,0xff]
+0xfa,0x70,0x04,0x7e,0x04,0x51,0x01,0xff
+
+# GFX940: v_mov_b64_e32 v[2:3], s[4:5] ; encoding: [0x04,0x70,0x04,0x7e]
+0x04,0x70,0x04,0x7e
+
+# GFX940: v_mov_b64_e32 v[2:3], 1 ; encoding: [0x81,0x70,0x04,0x7e]
+0x81,0x70,0x04,0x7e
+
+# GFX940: v_mov_b64_e32 v[2:3], 0x64 ; encoding: [0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00]
+0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00
+
# GFX940: buffer_atomic_swap v5, off, s[8:11], s3 sc0 ; encoding: [0x00,0x40,0x00,0xe1,0x00,0x05,0x02,0x03]
0x00,0x40,0x00,0xe1,0x00,0x05,0x02,0x03
More information about the llvm-commits mailing list