[llvm] r359696 - [AMDGPU] gfx1010 DS implementation
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 09:11:11 PDT 2019
Author: rampitec
Date: Wed May 1 09:11:11 2019
New Revision: 359696
URL: http://llvm.org/viewvc/llvm-project?rev=359696&view=rev
Log:
[AMDGPU] gfx1010 DS implementation
Differential Revision: https://reviews.llvm.org/D61332
Added:
llvm/trunk/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
llvm/trunk/test/MC/AMDGPU/mubuf-gfx10.s
Modified:
llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp?rev=359696&r1=359695&r2=359696&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp Wed May 1 09:11:11 2019
@@ -4047,7 +4047,8 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &
}
AMDGPUOperand::ImmTy OffsetType =
- (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si ||
+ (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx10 ||
+ Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 ||
Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
AMDGPUOperand::ImmTyOffset;
@@ -5164,7 +5165,7 @@ void AMDGPUAsmParser::cvtMubufImpl(MCIns
continue;
}
- HasLdsModifier = Op.isLDS();
+ HasLdsModifier |= Op.isLDS();
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
Modified: llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/DSInstructions.td?rev=359696&r1=359695&r2=359696&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/DSInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/DSInstructions.td Wed May 1 09:11:11 2019
@@ -808,175 +808,219 @@ def : Pat <
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Base ENC_DS for GFX6, GFX7.
+// Base ENC_DS for GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class DS_Real_si <bits<8> op, DS_Pseudo ds> :
- DS_Real <ds>,
- SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> {
- let AssemblerPredicates=[isGFX6GFX7];
- let DecoderNamespace="GFX6GFX7";
-
- // encoding
- let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
- let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
- let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue);
+class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
+ DS_Real<ps>, SIMCInstr <ps.Mnemonic, ef> {
+
+ let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
+ let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{25-18} = op;
- let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, 0);
- let Inst{47-40} = !if(ds.has_data0, data0, 0);
- let Inst{55-48} = !if(ds.has_data1, data1, 0);
- let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+ let Inst{31-26} = 0x36;
+ let Inst{39-32} = !if(ps.has_addr, addr, 0);
+ let Inst{47-40} = !if(ps.has_data0, data0, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst, 0);
}
-def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>;
-def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>;
-def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>;
-def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>;
-def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>;
-def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>;
-def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>;
-def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>;
-def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>;
-def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>;
-def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>;
-def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>;
-def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>;
-def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>;
-def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>;
-def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>;
-def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
-def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
-def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
-def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
-def DS_NOP_si : DS_Real_si<0x14, DS_NOP>;
-def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>;
-def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>;
-def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>;
-def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>;
-def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>;
-def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>;
-def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>;
-def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>;
-def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>;
-def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>;
-def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>;
-def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>;
-def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>;
-def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>;
-def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>;
-def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>;
-def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>;
-def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>;
-def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>;
-def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>;
-def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
-def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
-def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
-
-// These instruction are CI/VI only
-def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>;
-def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>;
-def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>;
-
-def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
-def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
-def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>;
-def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>;
-def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>;
-def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>;
-def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>;
-def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>;
-def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>;
-def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>;
-def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>;
-def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>;
-def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>;
-def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>;
-def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>;
-def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>;
-def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>;
-def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>;
-def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>;
-def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>;
-def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>;
-def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>;
-def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>;
-def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>;
-def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>;
-def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>;
-def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>;
-def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>;
-def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>;
-def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>;
-def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>;
-
-def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>;
-def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>;
-def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>;
-def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>;
-def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>;
-def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>;
-def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>;
-def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>;
-def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>;
-def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>;
-def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>;
-def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>;
-def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>;
-def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>;
-def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>;
-def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>;
-def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>;
-def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>;
-def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>;
-def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>;
-
-def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>;
-def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>;
-def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>;
-
-def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>;
-def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>;
-def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>;
-def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>;
-def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>;
-def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>;
-def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>;
-def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>;
-def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>;
-def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>;
-def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>;
-def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>;
-def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>;
-
-def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>;
-def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>;
-
-def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>;
-def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>;
-def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>;
-def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>;
-def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>;
-def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>;
-def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>;
-def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>;
-def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>;
-def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>;
-def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>;
-def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>;
-def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
-
-def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
-def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
-def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>;
-def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>;
-def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>;
-def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>;
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass DS_Real_gfx10<bits<8> op> {
+ def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.GFX10>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm DS_ADD_F32 : DS_Real_gfx10<0x015>;
+defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>;
+defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
+defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>;
+defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>;
+defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>;
+defm DS_READ_U8_D16_HI : DS_Real_gfx10<0x0a3>;
+defm DS_READ_I8_D16 : DS_Real_gfx10<0x0a4>;
+defm DS_READ_I8_D16_HI : DS_Real_gfx10<0x0a5>;
+defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>;
+defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>;
+defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>;
+defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>;
+defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>;
+defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>;
+
+//===----------------------------------------------------------------------===//
+// GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass DS_Real_gfx7<bits<8> op> {
+ def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.SI>;
+ }
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+multiclass DS_Real_gfx7_gfx10<bits<8> op> :
+ DS_Real_gfx7<op>, DS_Real_gfx10<op>;
+
+// FIXME-GFX7: Add tests when upstreaming this part.
+defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>;
+defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>;
+defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>;
+defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>;
+defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>;
+defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>;
+defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>;
+
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass DS_Real_gfx6_gfx7<bits<8> op> {
+ def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.SI>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
+ DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
+
+defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>;
+defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>;
+defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>;
+defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>;
+defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>;
+defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>;
+defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>;
+defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>;
+defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>;
+defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>;
+defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>;
+defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>;
+defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>;
+defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>;
+defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>;
+defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>;
+defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>;
+defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>;
+defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>;
+defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>;
+defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>;
+defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>;
+defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>;
+defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>;
+defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>;
+defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>;
+defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>;
+defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>;
+defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>;
+defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>;
+defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>;
+defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>;
+defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>;
+defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>;
+defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>;
+defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>;
+defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>;
+defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>;
+defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>;
+defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>;
+defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>;
+defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>;
+defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>;
+defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>;
+defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>;
+defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>;
+defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>;
+defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>;
+defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>;
+defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>;
+defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>;
+defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>;
+defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>;
+defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>;
+defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>;
+defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>;
+defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>;
+defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>;
+defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>;
+defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>;
+defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>;
+defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>;
+defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>;
+defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>;
+defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>;
+defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>;
+defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>;
+defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>;
+defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>;
+defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>;
+defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>;
+defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>;
+defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>;
+defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>;
+defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>;
+defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>;
+defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>;
+defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>;
+defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>;
+defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>;
+defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>;
+defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>;
+defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>;
+defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>;
+defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>;
+defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>;
+defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>;
+defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>;
+defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>;
+defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>;
+defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>;
+defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>;
+defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>;
+defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>;
+defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>;
+defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>;
+defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>;
+defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>;
+defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>;
+defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>;
+defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>;
+defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>;
+defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>;
+defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>;
+defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082>;
+defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>;
+defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>;
+defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>;
+defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>;
+defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>;
+defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>;
+defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>;
+defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>;
+defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>;
+defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>;
+defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>;
+defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>;
+defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>;
+defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>;
+defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>;
+defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>;
+defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>;
+defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>;
+defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8>;
+defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9>;
+defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca>;
+defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb>;
+defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd>;
+defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=359696&r1=359695&r2=359696&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed May 1 09:11:11 2019
@@ -6649,6 +6649,11 @@ SDValue SITargetLowering::LowerLOAD(SDVa
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
}
+ if (Subtarget->hasLDSMisalignedBug() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
+ Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
+ return SplitVectorLoad(Op, DAG);
+ }
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -7110,6 +7115,12 @@ SDValue SITargetLowering::LowerSTORE(SDV
return expandUnalignedStore(Store, DAG);
}
+ if (Subtarget->hasLDSMisalignedBug() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
+ Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
+ return SplitVectorStore(Op, DAG);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
Added: llvm/trunk/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lds-misaligned-bug.ll?rev=359696&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds-misaligned-bug.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/lds-misaligned-bug.ll Wed May 1 09:11:11 2019
@@ -0,0 +1,262 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
+
+; GCN-LABEL: test_local_misaligned_v2:
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_write2_b32
+define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
+ %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
+ %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
+ %v1 = extractelement <2 x i32> %load, i32 0
+ %v2 = extractelement <2 x i32> %load, i32 1
+ %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
+ %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
+ store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
+ ret void
+}
+
+; GCN-LABEL: test_local_misaligned_v4:
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write2_b32
+define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
+ %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
+ %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
+ %v1 = extractelement <4 x i32> %load, i32 0
+ %v2 = extractelement <4 x i32> %load, i32 1
+ %v3 = extractelement <4 x i32> %load, i32 2
+ %v4 = extractelement <4 x i32> %load, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
+ %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
+ %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
+ %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
+ store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
+ ret void
+}
+
+; GCN-LABEL: test_local_misaligned_v3:
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write_b32
+define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
+ %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
+ %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
+ %v1 = extractelement <3 x i32> %load, i32 0
+ %v2 = extractelement <3 x i32> %load, i32 1
+ %v3 = extractelement <3 x i32> %load, i32 2
+ %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
+ %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
+ %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
+ store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
+ ret void
+}
+
+; GCN-LABEL: test_flat_misaligned_v2:
+; VECT-DAG: flat_load_dwordx2 v
+; VECT-DAG: flat_store_dwordx2 v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_store_dword v
+; SPLIT-DAG: flat_store_dword v
+define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
+ %ptr = bitcast i32* %gep to <2 x i32>*
+ %load = load <2 x i32>, <2 x i32>* %ptr, align 4
+ %v1 = extractelement <2 x i32> %load, i32 0
+ %v2 = extractelement <2 x i32> %load, i32 1
+ %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
+ %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
+ store <2 x i32> %v4, <2 x i32>* %ptr, align 4
+ ret void
+}
+
+; GCN-LABEL: test_flat_misaligned_v4:
+; VECT-DAG: flat_load_dwordx4 v
+; VECT-DAG: flat_store_dwordx4 v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_store_dword v
+; SPLIT-DAG: flat_store_dword v
+; SPLIT-DAG: flat_store_dword v
+; SPLIT-DAG: flat_store_dword v
+define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
+ %ptr = bitcast i32* %gep to <4 x i32>*
+ %load = load <4 x i32>, <4 x i32>* %ptr, align 4
+ %v1 = extractelement <4 x i32> %load, i32 0
+ %v2 = extractelement <4 x i32> %load, i32 1
+ %v3 = extractelement <4 x i32> %load, i32 2
+ %v4 = extractelement <4 x i32> %load, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
+ %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
+ %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
+ %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
+ store <4 x i32> %v8, <4 x i32>* %ptr, align 4
+ ret void
+}
+
+; GCN-LABEL: test_flat_misaligned_v3:
+; VECT-DAG: flat_load_dwordx3 v
+; VECT-DAG: flat_store_dwordx3 v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_load_dword v
+; SPLIT-DAG: flat_store_dword v
+; SPLIT-DAG: flat_store_dword v
+; SPLIT-DAG: flat_store_dword v
+define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
+ %ptr = bitcast i32* %gep to <3 x i32>*
+ %load = load <3 x i32>, <3 x i32>* %ptr, align 4
+ %v1 = extractelement <3 x i32> %load, i32 0
+ %v2 = extractelement <3 x i32> %load, i32 1
+ %v3 = extractelement <3 x i32> %load, i32 2
+ %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
+ %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
+ %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
+ store <3 x i32> %v7, <3 x i32>* %ptr, align 4
+ ret void
+}
+
+; GCN-LABEL: test_local_aligned_v2:
+; GCN-DAG: ds_read_b64
+; GCN-DAG: ds_write_b64
+define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
+ %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
+ %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
+ %v1 = extractelement <2 x i32> %load, i32 0
+ %v2 = extractelement <2 x i32> %load, i32 1
+ %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
+ %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
+ store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
+ ret void
+}
+
+; GCN-LABEL: test_local_aligned_v3:
+; GCN-DAG: ds_read_b64
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write_b64
+; GCN-DAG: ds_write_b32
+define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
+ %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
+ %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
+ %v1 = extractelement <3 x i32> %load, i32 0
+ %v2 = extractelement <3 x i32> %load, i32 1
+ %v3 = extractelement <3 x i32> %load, i32 2
+ %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
+ %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
+ %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
+ store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
+ ret void
+}
+
+; GCN-LABEL: test_flat_aligned_v2:
+; GCN-DAG: flat_load_dwordx2 v
+; GCN-DAG: flat_store_dwordx2 v
+define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
+ %ptr = bitcast i32* %gep to <2 x i32>*
+ %load = load <2 x i32>, <2 x i32>* %ptr, align 8
+ %v1 = extractelement <2 x i32> %load, i32 0
+ %v2 = extractelement <2 x i32> %load, i32 1
+ %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
+ %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
+ store <2 x i32> %v4, <2 x i32>* %ptr, align 8
+ ret void
+}
+
+; GCN-LABEL: test_flat_aligned_v4:
+; GCN-DAG: flat_load_dwordx4 v
+; GCN-DAG: flat_store_dwordx4 v
+define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
+ %ptr = bitcast i32* %gep to <4 x i32>*
+ %load = load <4 x i32>, <4 x i32>* %ptr, align 16
+ %v1 = extractelement <4 x i32> %load, i32 0
+ %v2 = extractelement <4 x i32> %load, i32 1
+ %v3 = extractelement <4 x i32> %load, i32 2
+ %v4 = extractelement <4 x i32> %load, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
+ %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
+ %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
+ %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
+ store <4 x i32> %v8, <4 x i32>* %ptr, align 16
+ ret void
+}
+
+; GCN-LABEL: test_local_v4_aligned8:
+; GCN-DAG: ds_read2_b64
+; GCN-DAG: ds_write2_b64
+define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
+ %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
+ %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
+ %v1 = extractelement <4 x i32> %load, i32 0
+ %v2 = extractelement <4 x i32> %load, i32 1
+ %v3 = extractelement <4 x i32> %load, i32 2
+ %v4 = extractelement <4 x i32> %load, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
+ %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
+ %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
+ %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
+ store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
+ ret void
+}
+
+; GCN-LABEL: test_flat_v4_aligned8:
+; VECT-DAG: flat_load_dwordx4 v
+; VECT-DAG: flat_store_dwordx4 v
+; SPLIT-DAG: flat_load_dwordx2 v
+; SPLIT-DAG: flat_load_dwordx2 v
+; SPLIT-DAG: flat_store_dwordx2 v
+; SPLIT-DAG: flat_store_dwordx2 v
+define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
+bb:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
+ %ptr = bitcast i32* %gep to <4 x i32>*
+ %load = load <4 x i32>, <4 x i32>* %ptr, align 8
+ %v1 = extractelement <4 x i32> %load, i32 0
+ %v2 = extractelement <4 x i32> %load, i32 1
+ %v3 = extractelement <4 x i32> %load, i32 2
+ %v4 = extractelement <4 x i32> %load, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
+ %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
+ %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
+ %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
+ store <4 x i32> %v8, <4 x i32>* %ptr, align 8
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
Added: llvm/trunk/test/MC/AMDGPU/mubuf-gfx10.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/MC/AMDGPU/mubuf-gfx10.s?rev=359696&view=auto
==============================================================================
--- llvm/trunk/test/MC/AMDGPU/mubuf-gfx10.s (added)
+++ llvm/trunk/test/MC/AMDGPU/mubuf-gfx10.s Wed May 1 09:11:11 2019
@@ -0,0 +1,10 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s
+
+buffer_load_sbyte v5, off, s[8:11], s3 glc slc lds
+// GFX10: buffer_load_sbyte v5, off, s[8:11], s3 glc slc lds ; encoding: [0x00,0x40,0x25,0xe0,0x00,0x05,0x42,0x03]
+
+buffer_load_sbyte v5, off, s[8:11], s3 glc slc lds dlc
+// GFX10: buffer_load_sbyte v5, off, s[8:11], s3 glc slc lds dlc ; encoding: [0x00,0xc0,0x25,0xe0,0x00,0x05,0x42,0x03]
+
+buffer_load_sbyte v5, off, s[8:11], s3 glc slc dlc
+// GFX10: buffer_load_sbyte v5, off, s[8:11], s3 glc slc dlc ; encoding: [0x00,0xc0,0x24,0xe0,0x00,0x05,0x42,0x03]
More information about the llvm-commits
mailing list