[llvm] 04e5e64 - [VectorCombine] Generalize foldBitOpOfBitcasts to support more cast operations (#148350)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 21 09:15:00 PDT 2025


Author: Rahul Yadav
Date: 2025-07-21T17:14:56+01:00
New Revision: 04e5e643f526090ec872c0e505c487918992e21d

URL: https://github.com/llvm/llvm-project/commit/04e5e643f526090ec872c0e505c487918992e21d
DIFF: https://github.com/llvm/llvm-project/commit/04e5e643f526090ec872c0e505c487918992e21d.diff

LOG: [VectorCombine] Generalize foldBitOpOfBitcasts to support more cast operations (#148350)

This patch generalizes the existing foldBitOpOfBitcasts optimization in the VectorCombine pass to handle additional cast operations beyond just bitcast.

Fixes: [#146037](https://github.com/llvm/llvm-project/issues/146037)

Summary

The optimization now supports folding bitwise operations (AND/OR/XOR)
with the following cast operations:
- bitcast (original functionality)
- trunc (truncate)
- sext (sign extend)
- zext (zero extend)

The transformation pattern is:
  bitop(castop(x), castop(y)) -> castop(bitop(x, y))

This reduces the number of cast instructions from 2 to 1, improving
performance on targets where cast operations are expensive or where
performing bitwise operations on narrower types is beneficial.
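
As an illustration, a minimal before/after sketch for the zext + and case
(the value names are made up for this example; the new tests below cover
the full set of cast/bitop combinations):

  ; Before: two casts feed the bitwise op
  %x.ext = zext <4 x i16> %x to <4 x i32>
  %y.ext = zext <4 x i16> %y to <4 x i32>
  %r = and <4 x i32> %x.ext, %y.ext

  ; After: one bitwise op on the source type, a single cast of the result
  %narrow = and <4 x i16> %x, %y
  %r = zext <4 x i16> %narrow to <4 x i32>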
  
Implementation Details

- Renamed foldBitOpOfBitcasts to foldBitOpOfCastops to reflect the broader
  functionality
- Extended pattern matching to handle any CastInst operation
- Added validation of each cast type's constraints (e.g., trunc requires the
  source type to be wider than the destination type)
- Updated the cost model to use the actual cast opcode
- Preserves IR flags from the original cast instructions
- Handles multi-use casts appropriately (see the sketch after this list)
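
The multi-use handling mirrors the and_zext_multiuse test added below: a cast
with other users is kept alive, and its cost is added back into the new cost,
so the fold only triggers when it remains profitable. Roughly:

  ; %x.ext has a second user, so it stays; the and is still narrowed
  %x.ext = zext <4 x i16> %x to <4 x i32>
  %narrow = and <4 x i16> %x, %y
  %r = zext <4 x i16> %narrow to <4 x i32>
  %sum = add <4 x i32> %x.ext, %r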

Testing

- Added comprehensive tests in
  llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll
- Tests cover all supported cast types with all bitwise operations
- Includes negative tests for unsupported patterns
- All existing VectorCombine tests pass

Added: 
    llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fe8d74c43dfdc..82adc34fdbd84 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,7 +115,7 @@ class VectorCombine {
   bool foldInsExtFNeg(Instruction &I);
   bool foldInsExtBinop(Instruction &I);
   bool foldInsExtVectorToShuffle(Instruction &I);
-  bool foldBitOpOfBitcasts(Instruction &I);
+  bool foldBitOpOfCastops(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
   bool scalarizeOpOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
@@ -808,48 +808,87 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
   return true;
 }
 
-bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
-  // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
-  Value *LHSSrc, *RHSSrc;
-  if (!match(&I, m_BitwiseLogic(m_BitCast(m_Value(LHSSrc)),
-                                m_BitCast(m_Value(RHSSrc)))))
+/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
+/// Supports: bitcast, trunc, sext, zext
+bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
+  // Check if this is a bitwise logic operation
+  auto *BinOp = dyn_cast<BinaryOperator>(&I);
+  if (!BinOp || !BinOp->isBitwiseLogicOp())
     return false;
 
+  // Get the cast instructions
+  auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
+  auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
+  if (!LHSCast || !RHSCast) {
+    LLVM_DEBUG(dbgs() << "  One or both operands are not cast instructions\n");
+    return false;
+  }
+
+  // Both casts must be the same type
+  Instruction::CastOps CastOpcode = LHSCast->getOpcode();
+  if (CastOpcode != RHSCast->getOpcode())
+    return false;
+
+  // Only handle supported cast operations
+  switch (CastOpcode) {
+  case Instruction::BitCast:
+  case Instruction::Trunc:
+  case Instruction::SExt:
+  case Instruction::ZExt:
+    break;
+  default:
+    return false;
+  }
+
+  Value *LHSSrc = LHSCast->getOperand(0);
+  Value *RHSSrc = RHSCast->getOperand(0);
+
   // Source types must match
   if (LHSSrc->getType() != RHSSrc->getType())
     return false;
-  if (!LHSSrc->getType()->getScalarType()->isIntegerTy())
-    return false;
 
-  // Only handle vector types
+  // Only handle vector types with integer elements
   auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
   auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
   if (!SrcVecTy || !DstVecTy)
     return false;
 
-  // Same total bit width
-  assert(SrcVecTy->getPrimitiveSizeInBits() ==
-             DstVecTy->getPrimitiveSizeInBits() &&
-         "Bitcast should preserve total bit width");
+  if (!SrcVecTy->getScalarType()->isIntegerTy() ||
+      !DstVecTy->getScalarType()->isIntegerTy())
+    return false;
 
   // Cost Check :
-  // OldCost = bitlogic + 2*bitcasts
-  // NewCost = bitlogic + bitcast
-  auto *BinOp = cast<BinaryOperator>(&I);
+  // OldCost = bitlogic + 2*casts
+  // NewCost = bitlogic + cast
+
+  // Calculate specific costs for each cast with instruction context
+  InstructionCost LHSCastCost =
+      TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
+                           TTI::CastContextHint::None, CostKind, LHSCast);
+  InstructionCost RHSCastCost =
+      TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
+                           TTI::CastContextHint::None, CostKind, RHSCast);
+
   InstructionCost OldCost =
-      TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy) +
-      TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, LHSSrc->getType(),
-                           TTI::CastContextHint::None) +
-      TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, RHSSrc->getType(),
-                           TTI::CastContextHint::None);
+      TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
+      LHSCastCost + RHSCastCost;
+
+  // For new cost, we can't provide an instruction (it doesn't exist yet)
+  InstructionCost GenericCastCost = TTI.getCastInstrCost(
+      CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
+
   InstructionCost NewCost =
-      TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy) +
-      TTI.getCastInstrCost(Instruction::BitCast, DstVecTy, SrcVecTy,
-                           TTI::CastContextHint::None);
+      TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
+      GenericCastCost;
 
-  LLVM_DEBUG(dbgs() << "Found a bitwise logic op of bitcasted values: " << I
-                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
-                    << "\n");
+  // Account for multi-use casts using specific costs
+  if (!LHSCast->hasOneUse())
+    NewCost += LHSCastCost;
+  if (!RHSCast->hasOneUse())
+    NewCost += RHSCastCost;
+
+  LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
+                    << " NewCost=" << NewCost << "\n");
 
   if (NewCost > OldCost)
     return false;
@@ -862,8 +901,16 @@ bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
 
   Worklist.pushValue(NewOp);
 
-  // Bitcast the result back
-  Value *Result = Builder.CreateBitCast(NewOp, I.getType());
+  // Create the cast operation directly to ensure we get a new instruction
+  Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+
+  // Preserve cast instruction flags
+  NewCast->copyIRFlags(LHSCast);
+  NewCast->andIRFlags(RHSCast);
+
+  // Insert the new instruction
+  Value *Result = Builder.Insert(NewCast);
+
   replaceValue(I, *Result);
   return true;
 }
@@ -3773,7 +3820,7 @@ bool VectorCombine::run() {
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
-        MadeChange |= foldBitOpOfBitcasts(I);
+        MadeChange |= foldBitOpOfCastops(I);
         break;
       default:
         MadeChange |= shrinkType(I);

diff --git a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll
new file mode 100644
index 0000000000000..220556c8c38c3
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll
@@ -0,0 +1,262 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- | FileCheck %s
+
+; Negative test: bitcast from float to int (optimization should not apply)
+define <4 x i32> @and_bitcast_v4f32_to_v4i32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @and_bitcast_v4f32_to_v4i32(
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[BC1]], [[BC2]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %bc1 = bitcast <4 x float> %a to <4 x i32>
+  %bc2 = bitcast <4 x float> %b to <4 x i32>
+  %and = and <4 x i32> %bc1, %bc2
+  ret <4 x i32> %and
+}
+
+; Test bitwise operations with integer-to-integer bitcast
+define <2 x i32> @or_bitcast_v4i16_to_v2i32(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @or_bitcast_v4i16_to_v2i32(
+; CHECK-NEXT:    [[B:%.*]] = or <4 x i16> [[A:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <4 x i16> [[B]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[BC2]]
+;
+  %bc1 = bitcast <4 x i16> %a to <2 x i32>
+  %bc2 = bitcast <4 x i16> %b to <2 x i32>
+  %or = or <2 x i32> %bc1, %bc2
+  ret <2 x i32> %or
+}
+
+define <16 x i8> @xor_bitcast_v2i64_to_v16i8(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @xor_bitcast_v2i64_to_v16i8(
+; CHECK-NEXT:    [[B:%.*]] = xor <2 x i64> [[A:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[BC2]]
+;
+  %bc1 = bitcast <2 x i64> %a to <16 x i8>
+  %bc2 = bitcast <2 x i64> %b to <16 x i8>
+  %xor = xor <16 x i8> %bc1, %bc2
+  ret <16 x i8> %xor
+}
+
+; Test bitwise operations with truncate
+define <4 x i16> @and_trunc_v4i32_to_v4i16(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @and_trunc_v4i32_to_v4i16(
+; CHECK-NEXT:    [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[AND]]
+;
+  %t1 = trunc <4 x i32> %a to <4 x i16>
+  %t2 = trunc <4 x i32> %b to <4 x i16>
+  %and = and <4 x i16> %t1, %t2
+  ret <4 x i16> %and
+}
+
+define <8 x i8> @or_trunc_v8i16_to_v8i8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @or_trunc_v8i16_to_v8i8(
+; CHECK-NEXT:    [[OR_INNER:%.*]] = or <8 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = trunc <8 x i16> [[OR_INNER]] to <8 x i8>
+; CHECK-NEXT:    ret <8 x i8> [[OR]]
+;
+  %t1 = trunc <8 x i16> %a to <8 x i8>
+  %t2 = trunc <8 x i16> %b to <8 x i8>
+  %or = or <8 x i8> %t1, %t2
+  ret <8 x i8> %or
+}
+
+define <2 x i32> @xor_trunc_v2i64_to_v2i32(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @xor_trunc_v2i64_to_v2i32(
+; CHECK-NEXT:    [[XOR_INNER:%.*]] = xor <2 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[XOR:%.*]] = trunc <2 x i64> [[XOR_INNER]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[XOR]]
+;
+  %t1 = trunc <2 x i64> %a to <2 x i32>
+  %t2 = trunc <2 x i64> %b to <2 x i32>
+  %xor = xor <2 x i32> %t1, %t2
+  ret <2 x i32> %xor
+}
+
+; Test bitwise operations with zero extend
+define <4 x i32> @and_zext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_v4i16_to_v4i32(
+; CHECK-NEXT:    [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %z1 = zext <4 x i16> %a to <4 x i32>
+  %z2 = zext <4 x i16> %b to <4 x i32>
+  %and = and <4 x i32> %z1, %z2
+  ret <4 x i32> %and
+}
+
+define <8 x i16> @or_zext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @or_zext_v8i8_to_v8i16(
+; CHECK-NEXT:    [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = zext <8 x i8> [[OR_INNER]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[OR]]
+;
+  %z1 = zext <8 x i8> %a to <8 x i16>
+  %z2 = zext <8 x i8> %b to <8 x i16>
+  %or = or <8 x i16> %z1, %z2
+  ret <8 x i16> %or
+}
+
+define <2 x i64> @xor_zext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @xor_zext_v2i32_to_v2i64(
+; CHECK-NEXT:    [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[XOR:%.*]] = zext <2 x i32> [[XOR_INNER]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[XOR]]
+;
+  %z1 = zext <2 x i32> %a to <2 x i64>
+  %z2 = zext <2 x i32> %b to <2 x i64>
+  %xor = xor <2 x i64> %z1, %z2
+  ret <2 x i64> %xor
+}
+
+; Test bitwise operations with sign extend
+define <4 x i32> @and_sext_v4i16_to_v4i32(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_sext_v4i16_to_v4i32(
+; CHECK-NEXT:    [[AND_INNER:%.*]] = and <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = sext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %s1 = sext <4 x i16> %a to <4 x i32>
+  %s2 = sext <4 x i16> %b to <4 x i32>
+  %and = and <4 x i32> %s1, %s2
+  ret <4 x i32> %and
+}
+
+define <8 x i16> @or_sext_v8i8_to_v8i16(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @or_sext_v8i8_to_v8i16(
+; CHECK-NEXT:    [[OR_INNER:%.*]] = or <8 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = sext <8 x i8> [[OR_INNER]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[OR]]
+;
+  %s1 = sext <8 x i8> %a to <8 x i16>
+  %s2 = sext <8 x i8> %b to <8 x i16>
+  %or = or <8 x i16> %s1, %s2
+  ret <8 x i16> %or
+}
+
+define <2 x i64> @xor_sext_v2i32_to_v2i64(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @xor_sext_v2i32_to_v2i64(
+; CHECK-NEXT:    [[XOR_INNER:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[XOR:%.*]] = sext <2 x i32> [[XOR_INNER]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[XOR]]
+;
+  %s1 = sext <2 x i32> %a to <2 x i64>
+  %s2 = sext <2 x i32> %b to <2 x i64>
+  %xor = xor <2 x i64> %s1, %s2
+  ret <2 x i64> %xor
+}
+
+; Negative test: mismatched cast types (zext and sext)
+define <4 x i32> @and_zext_sext_mismatch(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_sext_mismatch(
+; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[S2:%.*]] = sext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[Z1]], [[S2]]
+; CHECK-NEXT:    ret <4 x i32> [[AND]]
+;
+  %z1 = zext <4 x i16> %a to <4 x i32>
+  %s2 = sext <4 x i16> %b to <4 x i32>
+  %and = and <4 x i32> %z1, %s2
+  ret <4 x i32> %and
+}
+
+; Negative test: mismatched source types
+define <4 x i32> @or_zext_different_src_types(<4 x i16> %a, <4 x i8> %b) {
+; CHECK-LABEL: @or_zext_different_src_types(
+; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[Z2:%.*]] = zext <4 x i8> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[OR:%.*]] = or <4 x i32> [[Z1]], [[Z2]]
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %z1 = zext <4 x i16> %a to <4 x i32>
+  %z2 = zext <4 x i8> %b to <4 x i32>
+  %or = or <4 x i32> %z1, %z2
+  ret <4 x i32> %or
+}
+
+; Negative test: scalar types (not vectors)
+define i32 @xor_zext_scalar(i16 %a, i16 %b) {
+; CHECK-LABEL: @xor_zext_scalar(
+; CHECK-NEXT:    [[Z1:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[Z2:%.*]] = zext i16 [[B:%.*]] to i32
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Z1]], [[Z2]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %z1 = zext i16 %a to i32
+  %z2 = zext i16 %b to i32
+  %xor = xor i32 %z1, %z2
+  ret i32 %xor
+}
+
+; Test multi-use: one cast has multiple uses
+define <4 x i32> @and_zext_multiuse(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @and_zext_multiuse(
+; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[AND_INNER:%.*]] = and <4 x i16> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = zext <4 x i16> [[AND_INNER]] to <4 x i32>
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[Z1]], [[AND]]
+; CHECK-NEXT:    ret <4 x i32> [[ADD]]
+;
+  %z1 = zext <4 x i16> %a to <4 x i32>
+  %z2 = zext <4 x i16> %b to <4 x i32>
+  %and = and <4 x i32> %z1, %z2
+  %add = add <4 x i32> %z1, %and  ; z1 has multiple uses
+  ret <4 x i32> %add
+}
+
+; Test with different vector sizes
+define <16 x i16> @or_zext_v16i8_to_v16i16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @or_zext_v16i8_to_v16i16(
+; CHECK-NEXT:    [[OR_INNER:%.*]] = or <16 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = zext <16 x i8> [[OR_INNER]] to <16 x i16>
+; CHECK-NEXT:    ret <16 x i16> [[OR]]
+;
+  %z1 = zext <16 x i8> %a to <16 x i16>
+  %z2 = zext <16 x i8> %b to <16 x i16>
+  %or = or <16 x i16> %z1, %z2
+  ret <16 x i16> %or
+}
+
+; Test bitcast with different element counts
+define <8 x i16> @xor_bitcast_v4i32_to_v8i16(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @xor_bitcast_v4i32_to_v8i16(
+; CHECK-NEXT:    [[XOR_INNER:%.*]] = xor <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[XOR:%.*]] = bitcast <4 x i32> [[XOR_INNER]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[XOR]]
+;
+  %bc1 = bitcast <4 x i32> %a to <8 x i16>
+  %bc2 = bitcast <4 x i32> %b to <8 x i16>
+  %xor = xor <8 x i16> %bc1, %bc2
+  ret <8 x i16> %xor
+}
+
+; Test truncate with flag preservation
+define <4 x i16> @and_trunc_nuw_nsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @and_trunc_nuw_nsw(
+; CHECK-NEXT:    [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = trunc nuw nsw <4 x i32> [[AND_INNER]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[AND]]
+;
+  %t1 = trunc nuw nsw <4 x i32> %a to <4 x i16>
+  %t2 = trunc nuw nsw <4 x i32> %b to <4 x i16>
+  %and = and <4 x i16> %t1, %t2
+  ret <4 x i16> %and
+}
+
+; Test zero extend with nneg flag
+define <4 x i32> @or_zext_nneg(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @or_zext_nneg(
+; CHECK-NEXT:    [[OR_INNER:%.*]] = or <4 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = zext nneg <4 x i16> [[OR_INNER]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[OR]]
+;
+  %z1 = zext nneg <4 x i16> %a to <4 x i32>
+  %z2 = zext nneg <4 x i16> %b to <4 x i32>
+  %or = or <4 x i32> %z1, %z2
+  ret <4 x i32> %or
+}


        


More information about the llvm-commits mailing list