[llvm] 80ab06c - [InstCombine] fold fake vector insert to bit-logic

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 20 11:21:44 PDT 2021


Author: Sanjay Patel
Date: 2021-10-20T14:21:40-04:00
New Revision: 80ab06c599a0f5a90951c36a57b2a9b492b19d61

URL: https://github.com/llvm/llvm-project/commit/80ab06c599a0f5a90951c36a57b2a9b492b19d61
DIFF: https://github.com/llvm/llvm-project/commit/80ab06c599a0f5a90951c36a57b2a9b492b19d61.diff

LOG: [InstCombine] fold fake vector insert to bit-logic

bitcast (inselt (bitcast X), Y, 0) --> or (and X, MaskC), (zext Y)

https://alive2.llvm.org/ce/z/Ux-662

Similar to D111082 / db231ebdb07f :
We want to avoid relatively opaque vector ops on types that are
likely supported by the backend as scalar integers. The bitwise
logic ops are more likely to allow further combining.

We probably want to generalize this to allow a shift too, but
that would oppose instcombine's general rule of not creating
extra instructions, so that's left as a potential follow-up.
Alternatively, we could do that transform in VectorCombine
with the help of the TTI cost model.

This is part of solving:
https://llvm.org/PR52057

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
    llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 44059fcba73a4..4937b48acdec6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2763,6 +2763,30 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
       if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
         return new BitCastInst(InsElt->getOperand(1), DestTy);
     }
+
+    // Convert an artificial vector insert into more analyzable bitwise logic.
+    unsigned BitWidth = DestTy->getScalarSizeInBits();
+    Value *X, *Y;
+    uint64_t IndexC;
+    if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))),
+                                        m_Value(Y), m_ConstantInt(IndexC)))) &&
+        DestTy->isIntegerTy() && X->getType() == DestTy &&
+        isDesirableIntType(BitWidth)) {
+      // Adjust for big endian - the LSBs are at the high index.
+      if (DL.isBigEndian())
+        IndexC = SrcVTy->getNumElements() - 1 - IndexC;
+
+      // We only handle (endian-normalized) insert to index 0. Any other insert
+      // would require a left-shift, so that is an extra instruction.
+      if (IndexC == 0) {
+        // bitcast (inselt (bitcast X), Y, 0) --> or (and X, MaskC), (zext Y)
+        unsigned EltWidth = Y->getType()->getScalarSizeInBits();
+        APInt MaskC = APInt::getHighBitsSet(BitWidth, BitWidth - EltWidth);
+        Value *AndX = Builder.CreateAnd(X, MaskC);
+        Value *ZextY = Builder.CreateZExt(Y, DestTy);
+        return BinaryOperator::CreateOr(AndX, ZextY);
+      }
+    }
   }
 
   if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {

diff --git a/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
index 2c3c59fdc68b6..464a438f286f7 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
@@ -1,15 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -instcombine -S -data-layout="e-n64" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ALL,BE
+; RUN: opt < %s -instcombine -S -data-layout="e-n64" | FileCheck %s --check-prefixes=ALL,LE
 
 declare void @use(<2 x i8>)
 
+; i16 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i16 @insert0_v2i8(i16 %x, i8 %y) {
-; ALL-LABEL: @insert0_v2i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
-; ALL-NEXT:    ret i16 [[R]]
+; BE-LABEL: @insert0_v2i8(
+; BE-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
+; BE-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
+; BE-NEXT:    ret i16 [[R]]
+;
+; LE-LABEL: @insert0_v2i8(
+; LE-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], -256
+; LE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i16
+; LE-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i16 [[R]]
 ;
   %v = bitcast i16 %x to <2 x i8>
   %i = insertelement <2 x i8> %v, i8 %y, i8 0
@@ -17,12 +26,21 @@ define i16 @insert0_v2i8(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; i16 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i16 @insert1_v2i8(i16 %x, i8 %y) {
-; ALL-LABEL: @insert1_v2i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 1
-; ALL-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
-; ALL-NEXT:    ret i16 [[R]]
+; BE-LABEL: @insert1_v2i8(
+; BE-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], -256
+; BE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i16
+; BE-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[TMP2]]
+; BE-NEXT:    ret i16 [[R]]
+;
+; LE-LABEL: @insert1_v2i8(
+; LE-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
+; LE-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 1
+; LE-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
+; LE-NEXT:    ret i16 [[R]]
 ;
   %v = bitcast i16 %x to <2 x i8>
   %i = insertelement <2 x i8> %v, i8 %y, i8 1
@@ -30,12 +48,21 @@ define i16 @insert1_v2i8(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; i32 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i32 @insert0_v4i8(i32 %x, i8 %y) {
-; ALL-LABEL: @insert0_v4i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i32 [[X:%.*]] to <4 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i8> [[V]], i8 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[I]] to i32
-; ALL-NEXT:    ret i32 [[R]]
+; BE-LABEL: @insert0_v4i8(
+; BE-NEXT:    [[V:%.*]] = bitcast i32 [[X:%.*]] to <4 x i8>
+; BE-NEXT:    [[I:%.*]] = insertelement <4 x i8> [[V]], i8 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[I]] to i32
+; BE-NEXT:    ret i32 [[R]]
+;
+; LE-LABEL: @insert0_v4i8(
+; LE-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], -256
+; LE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
+; LE-NEXT:    [[R:%.*]] = or i32 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i32 [[R]]
 ;
   %v = bitcast i32 %x to <4 x i8>
   %i = insertelement <4 x i8> %v, i8 %y, i8 0
@@ -43,12 +70,21 @@ define i32 @insert0_v4i8(i32 %x, i8 %y) {
   ret i32 %r
 }
 
+; i64 is a legal type, so we can convert based on the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i64 @insert0_v4i16(i64 %x, i16 %y) {
-; ALL-LABEL: @insert0_v4i16(
-; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
-; ALL-NEXT:    ret i64 [[R]]
+; BE-LABEL: @insert0_v4i16(
+; BE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
+; BE-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
+; BE-NEXT:    ret i64 [[R]]
+;
+; LE-LABEL: @insert0_v4i16(
+; LE-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], -65536
+; LE-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i64
+; LE-NEXT:    [[R:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i64 [[R]]
 ;
   %v = bitcast i64 %x to <4 x i16>
   %i = insertelement <4 x i16> %v, i16 %y, i8 0
@@ -56,6 +92,8 @@ define i64 @insert0_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; Negative test - shifts needed for both endians.
+
 define i64 @insert1_v4i16(i64 %x, i16 %y) {
 ; ALL-LABEL: @insert1_v4i16(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
@@ -69,12 +107,21 @@ define i64 @insert1_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; i64 is a legal type, so we can convert based on the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i64 @insert3_v4i16(i64 %x, i16 %y) {
-; ALL-LABEL: @insert3_v4i16(
-; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 3
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
-; ALL-NEXT:    ret i64 [[R]]
+; BE-LABEL: @insert3_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], -65536
+; BE-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i64
+; BE-NEXT:    [[R:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; BE-NEXT:    ret i64 [[R]]
+;
+; LE-LABEL: @insert3_v4i16(
+; LE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
+; LE-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 3
+; LE-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
+; LE-NEXT:    ret i64 [[R]]
 ;
   %v = bitcast i64 %x to <4 x i16>
   %i = insertelement <4 x i16> %v, i16 %y, i8 3
@@ -82,6 +129,8 @@ define i64 @insert3_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; Negative test - i128 is not a legal type, so we do not convert based on the data layout.
+
 define i128 @insert0_v4i32(i128 %x, i32 %y) {
 ; ALL-LABEL: @insert0_v4i32(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i128 [[X:%.*]] to <4 x i32>
@@ -95,6 +144,8 @@ define i128 @insert0_v4i32(i128 %x, i32 %y) {
   ret i128 %r
 }
 
+; Negative test - extra use requires more instructions.
+
 define i16 @insert0_v2i8_use1(i16 %x, i8 %y) {
 ; ALL-LABEL: @insert0_v2i8_use1(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
@@ -110,6 +161,8 @@ define i16 @insert0_v2i8_use1(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; Negative test - extra use requires more instructions.
+
 define i16 @insert0_v2i8_use2(i16 %x, i8 %y) {
 ; ALL-LABEL: @insert0_v2i8_use2(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>


        


More information about the llvm-commits mailing list