[llvm] r313716 - AMDGPU: Match load d16 hi instructions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 19 22:01:53 PDT 2017
Author: arsenm
Date: Tue Sep 19 22:01:53 2017
New Revision: 313716
URL: http://llvm.org/viewvc/llvm-project?rev=313716&view=rev
Log:
AMDGPU: Match load d16 hi instructions
Also starts selecting global loads for constant addresses
in some cases. Some still select to mubuf, which requires
further investigation.
We still get sub-optimal register allocation and extra waitcnts
inserted because the liveness of the two register halves is not
tracked separately.
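
For reference, a minimal example of the pattern now matched, adapted
from the added load-hi16.ll test: a 16-bit load inserted into the high
half of a <2 x i16> now selects to a single ds_read_u16_d16_hi on gfx9,
which writes only the high 16 bits of the destination register:

define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  ret <2 x i16> %build
}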
Added:
llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td
llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll
llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Tue Sep 19 22:01:53 2017
@@ -126,10 +126,10 @@ private:
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
- bool SelectMUBUFScratchOffen(SDNode *Root,
+ bool SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
- bool SelectMUBUFScratchOffset(SDNode *Root,
+ bool SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
@@ -1107,7 +1107,7 @@ std::pair<SDValue, SDValue> AMDGPUDAGToD
MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
@@ -1130,7 +1130,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScra
// In a call sequence, stores to the argument stack area are relative to the
// stack pointer.
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
@@ -1160,7 +1160,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScra
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
@@ -1175,7 +1175,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScra
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td Tue Sep 19 22:01:53 2017
@@ -252,6 +252,11 @@ class GlobalAddress : CodePatPred<[{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
+class GlobalLoadAddress : CodePatPred<[{
+ auto AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS;
+}]>;
+
class FlatLoadAddress : CodePatPred<[{
const auto AS = cast<MemSDNode>(N)->getAddressSpace();
return AS == AMDGPUASI.FLAT_ADDRESS ||
@@ -292,7 +297,7 @@ class PrivateStore <SDPatternOperator op
class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress;
-class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalAddress;
+class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress;
class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress;
class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress;
Modified: llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td Tue Sep 19 22:01:53 2017
@@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9,
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>;
-def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
@@ -425,16 +425,18 @@ class MUBUF_SetupAddr<int addrKind> {
class MUBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
+ bit HasTiedDest = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs vdataClass:$vdata),
- getMUBUFIns<addrKindCopy>.ret,
+ !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
let mayLoad = 1;
let mayStore = 0;
let maybeAtomic = 1;
@@ -444,27 +446,30 @@ class MUBUF_Load_Pseudo <string opName,
// opcode because it needs an N+1 register class dest register.
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
ValueType load_vt = i32,
- SDPatternOperator ld = null_frag> {
+ SDPatternOperator ld = null_frag,
+ bit TiedDest = 0> {
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ TiedDest,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
MUBUFAddr64Table<0>;
def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ TiedDest,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
MUBUFAddr64Table<1>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>;
}
}
@@ -812,7 +817,7 @@ defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseud
>;
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16_hi", VGPR_32, i32
+ "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
@@ -820,7 +825,7 @@ defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseud
>;
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16_hi", VGPR_32, i32
+ "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
@@ -828,7 +833,7 @@ defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseud
>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16_hi", VGPR_32, i32
+ "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1
>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
@@ -1149,6 +1154,34 @@ multiclass MUBUFScratchLoadPat <MUBUF_Ps
>;
}
+// XXX - Is it possible to have a complex pattern in a PatFrag?
+multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag ld> {
+ def : Pat <
+ (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset)))),
+ (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+
+ def : Pat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset)))))),
+ (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+
+
+ def : Pat <
+ (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
+ (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+
+ def : Pat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
+ (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
+ >;
+}
+
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
@@ -1160,6 +1193,12 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
+let Predicates = [HasD16LoadStore] in {
+defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
+defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
+}
+
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt,
MUBUF_Pseudo offset,
Modified: llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/DSInstructions.td?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/DSInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/DSInstructions.td Tue Sep 19 22:01:53 2017
@@ -145,16 +145,22 @@ class DS_1A2D_Off8_RET<string opName,
let hasPostISelHook = 1;
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
: DS_Pseudo<opName,
(outs rc:$vdst),
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
+ !if(HasTiedOutput,
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
"$vdst, $addr$offset$gds"> {
-
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
let has_data0 = 0;
let has_data1 = 0;
}
+class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
+ DS_1A_RET<opName, rc, 1>;
+
class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
@@ -450,7 +456,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
}
let mayStore = 0 in {
@@ -468,12 +474,12 @@ def DS_READ2_B64 : DS_1A_Off8_RET<"d
def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
let SubtargetPredicate = HasD16LoadStore in {
-def DS_READ_U8_D16 : DS_1A_RET<"ds_read_u8_d16">;
-def DS_READ_U8_D16_HI : DS_1A_RET<"ds_read_u8_d16_hi">;
-def DS_READ_I8_D16 : DS_1A_RET<"ds_read_i8_d16">;
-def DS_READ_I8_D16_HI : DS_1A_RET<"ds_read_i8_d16_hi">;
-def DS_READ_U16_D16 : DS_1A_RET<"ds_read_u16_d16">;
-def DS_READ_U16_D16_HI : DS_1A_RET<"ds_read_u16_d16_hi">;
+def DS_READ_U8_D16 : DS_1A_RET_Tied<"ds_read_u8_d16">;
+def DS_READ_U8_D16_HI : DS_1A_RET_Tied<"ds_read_u8_d16_hi">;
+def DS_READ_I8_D16 : DS_1A_RET_Tied<"ds_read_i8_d16">;
+def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
+def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">;
+def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
let SubtargetPredicate = HasDSAddTid in {
@@ -543,6 +549,18 @@ class DSReadPat <DS_Pseudo inst, ValueTy
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
+multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
+ def : Pat <
+ (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
+ (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+ >;
+
+ def : Pat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
+ (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+ >;
+}
+
def : DSReadPat <DS_READ_I8, i32, sextloadi8_local_m0>;
def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local_m0>;
def : DSReadPat <DS_READ_I8, i16, sextloadi8_local_m0>;
@@ -565,6 +583,15 @@ def : Pat <
(DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
>;
+
+let Predicates = [HasD16LoadStore] in {
+let AddedComplexity = 100 in {
+defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
+defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
+defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
+}
+}
+
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
Modified: llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td Tue Sep 19 22:01:53 2017
@@ -125,15 +125,18 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo
// same encoding value as exec_hi, so it isn't possible to use that if
// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
+ bit HasTiedOutput = 0,
bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
!con(
!con(
- !con((ins VReg_64:$vaddr),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, slc:$slc)),
+ !con(
+ !con((ins VReg_64:$vaddr),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
+ (ins GLC:$glc, slc:$slc)),
+ !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
" $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
@@ -141,6 +144,9 @@ class FLAT_Load_Pseudo <string opName, R
let enabled_saddr = EnableSaddr;
let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", "");
let maybeAtomic = 1;
+
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
@@ -163,10 +169,10 @@ class FLAT_Store_Pseudo <string opName,
let maybeAtomic = 1;
}
-multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
let is_flat_global = 1 in {
- def "" : FLAT_Load_Pseudo<opName, regClass, 1, 1>;
- def _SADDR : FLAT_Load_Pseudo<opName, regClass, 1, 1, 1>;
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>;
}
}
@@ -360,12 +366,12 @@ def FLAT_STORE_DWORDX4 : FLAT_Store_Pseu
def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
-def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32>;
-def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32>;
-def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32>;
-def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32>;
-def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32>;
-def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32>;
+def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
+def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
+def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
+def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
@@ -483,12 +489,12 @@ defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_
defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
-defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32>;
-defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32>;
-defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32>;
-defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32>;
-defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32>;
-defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32>;
+defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32, 1>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>;
+defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32, 1>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
+defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
@@ -624,6 +630,30 @@ class FlatLoadPat <FLAT_Pseudo inst, SDP
(inst $vaddr, $offset, 0, $slc)
>;
+multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
+ def : Pat <
+ (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
+ (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+
+ def : Pat <
+ (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
+ (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+}
+
+multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
+ def : Pat <
+ (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
+ (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+
+ def : Pat <
+ (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
+ (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
+ >;
+}
+
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
(inst $vaddr, $offset, 0, $slc)
@@ -729,6 +759,12 @@ def : FlatStorePat <FLAT_STORE_SHORT, st
let Predicates = [HasD16LoadStore] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
+
+let AddedComplexity = 3 in {
+defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
+defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
+defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
+}
}
} // End Predicates = [HasFlatAddressSpace]
@@ -761,6 +797,10 @@ def : FlatStoreSignedPat <GLOBAL_STORE_D
let Predicates = [HasD16LoadStore] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
+
+defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
+defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
+defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
}
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll Tue Sep 19 22:01:53 2017
@@ -92,14 +92,18 @@ define amdgpu_kernel void @extract_vecto
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short
+; GFX9: buffer_load_ushort
+; GFX9: buffer_load_ushort
+; GFX9: global_load_short_d16_hi
+
; GFX9: buffer_store_dword
; GFX9: buffer_store_dword
Modified: llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll Tue Sep 19 22:01:53 2017
@@ -7,7 +7,7 @@
; unless isFabsFree returns true
; GCN-LABEL: {{^}}s_fabs_free_f16:
-; GCN: flat_load_ushort [[VAL:v[0-9]+]],
+; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]],
; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -75,8 +75,8 @@ define amdgpu_kernel void @s_fabs_v4f16(
}
; GCN-LABEL: {{^}}fabs_fold_f16:
-; GCN: flat_load_ushort [[IN0:v[0-9]+]]
-; GCN: flat_load_ushort [[IN1:v[0-9]+]]
+; GCN: {{flat|global}}_load_ushort [[IN0:v[0-9]+]]
+; GCN: {{flat|global}}_load_ushort [[IN1:v[0-9]+]]
; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
Added: llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll?rev=313716&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll Tue Sep 19 22:01:53 2017
@@ -0,0 +1,506 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+
+; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16_hi v0, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build = insertelement <2 x i16> undef, i16 %load, i32 1
+ ret <2 x i16> %build
+}
+
+; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ ret <2 x i16> %build1
+}
+
+; Show that we get reasonable regalloc without physreg constraints.
+; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
+ ret <2 x i16> %build
+}
+
+; FIXME: Remove m0 initialization
+; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
+; GCN: s_waitcnt
+; GFX9-NEXT: s_mov_b32 m0, -1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+; VI: v_lshlrev_b32_e32 v0, 16, v0
+define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %zext = zext i16 %load to i32
+ %shift = shl i32 %zext, 16
+ ret i32 %shift
+}
+
+; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
+entry:
+ %load = load half, half addrspace(3)* %in
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %load, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u8_d16_hi v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u8
+define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_i8_d16_hi v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_i8
+define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
+ %load = load i16, i16 addrspace(1)* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
+entry:
+ %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
+ %load = load half, half addrspace(1)* %gep
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %load, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
+ %load = load i8, i8 addrspace(1)* %gep
+ %ext = zext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
+ %load = load i8, i8 addrspace(1)* %gep
+ %ext = sext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort v{{[0-9]+}}
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_sdwa
+define void @load_flat_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
+entry:
+ %load = load i16, i16 addrspace(4)* %in
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort v{{[0-9]+}}
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_sdwa
+define void @load_flat_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
+entry:
+ %load = load half, half addrspace(4)* %in
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %load, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ubyte v{{[0-9]+}}
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_sdwa
+define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i16 %reg) #0 {
+entry:
+ %load = load i8, i8 addrspace(4)* %in
+ %ext = zext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_sbyte v{{[0-9]+}}
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_sdwa
+define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i16 %reg) #0 {
+entry:
+ %load = load i8, i8 addrspace(4)* %in
+ %ext = sext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
+define void @load_private_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16* %in, i64 2047
+ %load = load i16, i16* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
+define void @load_private_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
+entry:
+ %gep = getelementptr inbounds half, half* %in, i64 2047
+ %load = load half, half* %gep
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %load, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* %in, i16 %reg) #0 {
+entry:
+ %load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_hi_v2f16_reglo_vreg_nooff(half* %in, half %reg) #0 {
+entry:
+ %load = load volatile half, half* inttoptr (i32 4094 to half*)
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %load, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
+define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i8, i8* %in, i64 2047
+ %load = load i8, i8* %gep
+ %ext = zext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
+define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i8, i8* %in, i64 2047
+ %load = load i8, i8* %gep
+ %ext = sext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i16 %reg) #0 {
+entry:
+ %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
+ %ext = zext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i16 %reg) #0 {
+entry:
+ %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
+ %ext = sext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8* %in, half %reg) #0 {
+entry:
+ %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
+ %ext = zext i8 %load to i16
+ %bc.ext = bitcast i16 %ext to half
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort
+define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(2)* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047
+ %load = load i16, i16 addrspace(2)* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: load_constant_hi_v2f16_reglo_vreg
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort
+define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(2)* %in, half %reg) #0 {
+entry:
+ %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047
+ %load = load half, half addrspace(2)* %gep
+ %build0 = insertelement <2 x half> undef, half %reg, i32 0
+ %build1 = insertelement <2 x half> %build0, half %load, i32 1
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }
Modified: llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll Tue Sep 19 22:01:53 2017
@@ -228,15 +228,13 @@ bb:
ret void
}
+; FIXME: Can we avoid waitcnt between the two halves?
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
-
-; FIXME: Remove and
-; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
-; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
+; GCN-NEXT: s_waitcnt
+; GCN: ds_read_u16_d16_hi [[PACKED]]
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
Modified: llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg.ll?rev=313716&r1=313715&r2=313716&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg.ll Tue Sep 19 22:01:53 2017
@@ -663,10 +663,10 @@ define amdgpu_kernel void @sext_in_reg_v
; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16:
; GFX9: v_pk_add_u16
-; GFX9: v_pk_add_u16
-; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
@@ -702,10 +702,11 @@ define amdgpu_kernel void @sext_in_reg_v
; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16:
; GFX9: v_pk_add_u16
-; GFX9: v_pk_add_u16
-; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
%c = add <3 x i16> %a, %b ; add to prevent folding into extload