[PATCH] D22838: AMDGPU/SI: Implement amdgcn image intrinsics

Tue Jul 26 18:05:10 PDT 2016

arsenm added inline comments.

================
Comment at: include/llvm/IR/IntrinsicsAMDGPU.td:203
@@ -205,2 +202,3 @@
 def int_amdgcn_image_load_mip : AMDGPUImageLoad;
+def int_amdgcn_getresinfo : AMDGPUImageLoad;
 
----------------
The full instruction name is image_get_resinfo, so the intrinsic should be int_amdgcn_image_getresinfo

================
Comment at: include/llvm/IR/IntrinsicsAMDGPU.td:211
@@ -212,6 +210,3 @@
    llvm_i32_ty,       // dmask(imm)
-   llvm_i1_ty,        // r128(imm)
-   llvm_i1_ty,        // da(imm)
-   llvm_i1_ty,        // glc(imm)
-   llvm_i1_ty],       // slc(imm)
+   llvm_i32_ty],      // flags (imm)
   []>;
----------------
This requires a descriptive comment (including the values for which bits)

================
Comment at: include/llvm/IR/IntrinsicsAMDGPU.td:276-307
@@ +275,34 @@
+// Basic gather4
+def int_amdgcn_gather4 : AMDGPUSampleRaw;
+def int_amdgcn_gather4_cl : AMDGPUSampleRaw;
+def int_amdgcn_gather4_l : AMDGPUSampleRaw;
+def int_amdgcn_gather4_b : AMDGPUSampleRaw;
+def int_amdgcn_gather4_b_cl : AMDGPUSampleRaw;
+def int_amdgcn_gather4_lz : AMDGPUSampleRaw;
+
+// Gather4 with comparison
+def int_amdgcn_gather4_c : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_cl : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_l : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_b : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_b_cl : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_lz : AMDGPUSampleRaw;
+
+// Gather4 with offsets
+def int_amdgcn_gather4_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_cl_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_l_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_b_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_b_cl_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_lz_o : AMDGPUSampleRaw;
+
+// Gather4 with comparison and offsets
+def int_amdgcn_gather4_c_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_cl_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_l_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_b_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_b_cl_o : AMDGPUSampleRaw;
+def int_amdgcn_gather4_c_lz_o : AMDGPUSampleRaw;
+
+def int_amdgcn_getlod : AMDGPUSampleRaw;
+
----------------
These are also missing the image part of the name as well

================
Comment at: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp:156-162
@@ -155,1 +155,9 @@
 
+  bool SelectImageFlagBits(SDValue ArgNode, SDValue &unorm,
+                                            SDValue &glc,
+                                            SDValue &slc,
+                                            SDValue &r128,
+                                            SDValue &tfe,
+                                            SDValue &lwe,
+                                            SDValue &da);
+
----------------
Param names should be capitalized, the function name itself should be lowercase

================
Comment at: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp:1103-1116
@@ +1102,16 @@
+  int64_t ArgImm = C->getZExtValue();
+  int UNORMBit=0, GLCBit=0, SLCBit=0, R128Bit=0,
+      TFEBit=0,   LWEBit=0, DABit=0;
+
+  if (ArgImm & llvm::MIMGFlags::UNORM)
+    UNORMBit = 1;
+  if (ArgImm & llvm::MIMGFlags::GLC)
+    GLCBit = 1;
+  if (ArgImm & llvm::MIMGFlags::SLC)
+    SLCBit = 1;
+  if (ArgImm & llvm::MIMGFlags::R128)
+    R128Bit = 1;
+  if (ArgImm & llvm::MIMGFlags::TFE)
+    TFEBit = 1;
+  if (ArgImm & llvm::MIMGFlags::LWE)
+    LWEBit = 1;
----------------
You don't need these intermediate variables, you can just check the bit as the argument to getTargetConstant directly

================
Comment at: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp:1134-1135
@@ +1133,4 @@
+  da = CurDAG->getTargetConstant(DABit, SL, MVT::i1);
+ // d16 = CurDAG->getTargetConstant(D16Bit, SL, MVT::i1);
+ // rsvd = CurDAG->getTargetConstant(RSVDBit, SL, MVT::i1);
+
----------------
Commented out code

================
Comment at: lib/Target/AMDGPU/SIDefines.h:111-114
@@ +110,6 @@
+  UNORM = 1 << 0,
+  GLC   = 1 << 1,
+  SLC   = 1 << 2,
+  R128  = 1 << 3,
+  TFE   = 1 << 4,
+  LWE   = 1 << 5,
----------------
Not all of these bits should be exposed. GLC, SLC, and TFE definitely should not be. TFE changes the register class of the output. This should strictly be the ones for controlling the sampling

================
Comment at: lib/Target/AMDGPU/SIInstructions.td:2738
@@ -2717,1 +2737,3 @@
 
+//======================== update for amdgcn =================================
+// Basic sample
----------------
Not sure what this means

================
Comment at: lib/Target/AMDGPU/SIInstructions.td:2740-2749
@@ +2739,12 @@
+// Basic sample
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample,           "IMAGE_SAMPLE">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl,        "IMAGE_SAMPLE_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d,         "IMAGE_SAMPLE_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl,      "IMAGE_SAMPLE_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l,         "IMAGE_SAMPLE_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b,         "IMAGE_SAMPLE_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl,      "IMAGE_SAMPLE_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz,        "IMAGE_SAMPLE_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd,        "IMAGE_SAMPLE_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl,     "IMAGE_SAMPLE_CD_CL">;
+
----------------
I think you can reduce the number of repeated lines by passing in the suffix, and then concat + cast to the intrinsic (Similar to how MUBUF_LoadIntrinsicPat does it)

================
Comment at: test/CodeGen/AMDGPU/llvm.amdgcn.gather4.ll:4-5
@@ +3,4 @@
+
+;CHECK-LABEL: {{^}}gather4_v2:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define void @gather4_v2() {
----------------
Space between ; and start of comment. Also should probably switch to GCN prefix because it seems likely we'll need to distinguish subtargets here

================
Comment at: test/CodeGen/AMDGPU/llvm.amdgcn.gather4.ll:9-13
@@ +8,7 @@
+  %r = call <4 x float> @llvm.amdgcn.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 64)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
----------------
It should work to just store the output directly into an output pointer rather than having to scalarize and export since this is a compute shader


https://reviews.llvm.org/D22838