[llvm] f99580c - [DAGCombine] Fix bug in load scalarization

Wed Nov 4 11:09:00 PST 2020

Author: Fraser Cormack
Date: 2020-11-04T19:02:40Z
New Revision: f99580c1e57885c8337a7deb08459095f4c16422

URL: https://github.com/llvm/llvm-project/commit/f99580c1e57885c8337a7deb08459095f4c16422
DIFF: https://github.com/llvm/llvm-project/commit/f99580c1e57885c8337a7deb08459095f4c16422.diff

LOG: [DAGCombine] Fix bug in load scalarization

Summary:
For vector element types which are not byte-sized, we would generate
incorrect scalar offsets and produce incorrect codegen.

This optimization could potentially be supported in the future, e.g. by
loading in bytes, then shifting and masking out the remaining bits of
the vector element. However, without an upstream target to test against
it's best to avoid the bad codegen in the simplest possible way.

Related to this bug:

  https://bugs.llvm.org/show_bug.cgi?id=27600

Reviewed by: foad

Differential Revision: https://reviews.llvm.org/D78568

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/extract-load-i1.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fbb5262ae905..b44471d533bf 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17780,6 +17780,13 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 
   EVT ResultVT = EVE->getValueType(0);
   EVT VecEltVT = InVecVT.getVectorElementType();
+
+  // If the vector element type is not a multiple of a byte then we are unable
+  // to correctly compute an address to load only the extracted element as a
+  // scalar.
+  if (!VecEltVT.isByteSized())
+    return SDValue();
+
   Align Alignment = OriginalLoad->getAlign();
   Align NewAlign = DAG.getDataLayout().getABITypeAlign(
       VecEltVT.getTypeForEVT(*DAG.getContext()));

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 14bfe398a12e..5a4c83d478b0 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -1,16 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
 
-; FIXME: Incorrect codegen during an optimization of load + extractelement when
-; the vector element type is not byte-sized: this test loads from %idx-many
-; bytes from address %ptr, instead of that many bits.
+; FIXME: Inefficient codegen which skips an optimization of load +
+; extractelement when the vector element type is not byte-sized.
 define i1 @extractloadi1(<8 x i1> *%ptr, i32 %idx) {
 ; CHECK-LABEL: extractloadi1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, 1, v[0:1]
 ; CHECK-NEXT:    flat_load_ubyte v0, v[0:1]
+; CHECK-NEXT:    v_and_b32_e32 v1, 7, v2
+; CHECK-NEXT:    v_lshr_b32_e64 v9, s32, 6
+; CHECK-NEXT:    v_or_b32_e32 v1, v9, v1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v2, v0, 1, 1
+; CHECK-NEXT:    v_bfe_u32 v3, v0, 2, 2
+; CHECK-NEXT:    v_bfe_u32 v4, v0, 3, 1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 4, v0
+; CHECK-NEXT:    v_bfe_u32 v6, v0, 5, 1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 6, v0
+; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 7, v0
+; CHECK-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; CHECK-NEXT:    buffer_store_byte v8, off, s[0:3], s32 offset:7
+; CHECK-NEXT:    buffer_store_byte v7, off, s[0:3], s32 offset:6
+; CHECK-NEXT:    buffer_store_byte v6, off, s[0:3], s32 offset:5
+; CHECK-NEXT:    buffer_store_byte v5, off, s[0:3], s32 offset:4
+; CHECK-NEXT:    buffer_store_byte v4, off, s[0:3], s32 offset:3
+; CHECK-NEXT:    buffer_store_byte v3, off, s[0:3], s32 offset:2
+; CHECK-NEXT:    buffer_store_byte v2, off, s[0:3], s32 offset:1
+; CHECK-NEXT:    buffer_load_ubyte v0, v1, s[0:3], 0 offen
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %val = load <8 x i1>, <8 x i1> *%ptr
   %ret = extractelement <8 x i1> %val, i32 %idx