[flang-commits] [flang] [llvm] [GlobalISel] Port over `simplifyDemandedBits` to GlobalISel (PR #198808)

Fri Jun 12 05:09:40 PDT 2026

https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/198808

>From 344d8200b169d210b77e3a5d523a04106c788994 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Thu, 11 Jun 2026 17:12:51 -0700
Subject: [PATCH 1/4] [flang] Relax offload image suffix check in
 omp-driver-offload.f90

The driver now emits a .s file for the AMDGPU offload image in this
path, while the test still expected .bc. Drop the suffix from the
FileCheck pattern; the test's intent is to verify that
llvm-offload-binary is invoked with the correct triple/arch/kind,
not the image file type.
---
 flang/test/Driver/omp-driver-offload.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90
index a4184f11bf3b0..217ced988ceac 100644
--- a/flang/test/Driver/omp-driver-offload.f90
+++ b/flang/test/Driver/omp-driver-offload.f90
@@ -61,7 +61,7 @@
 ! OPENMP-OFFLOAD-ARGS-SAME:  "-fopenmp"
 ! OPENMP-OFFLOAD-ARGS-SAME:  "-fopenmp-host-ir-file-path" "{{.*}}.bc" "-fopenmp-is-target-device"
 ! OPENMP-OFFLOAD-ARGS-SAME:  {{.*}}.f90"
-! OPENMP-OFFLOAD-ARGS: "{{[^"]*}}llvm-offload-binary{{.*}}" {{.*}} "--image=file={{.*}}.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"
+! OPENMP-OFFLOAD-ARGS: "{{[^"]*}}llvm-offload-binary{{.*}}" {{.*}} "--image=file={{.*}},triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"
 ! OPENMP-OFFLOAD-ARGS-NEXT: "{{[^"]*}}flang" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
 ! OPENMP-OFFLOAD-ARGS-SAME:  "-fopenmp"
 ! OPENMP-OFFLOAD-ARGS-SAME:  "-fembed-offload-object={{.*}}.out" {{.*}}.bc"

>From 2c0ad7672c245340854ccc3ffeae301d0b227396 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Thu, 11 Jun 2026 17:12:51 -0700
Subject: [PATCH 2/4] [AArch64][GlobalISel] Look through G_UBFX in the test-bit
 walker

Bit B of (ubfx X, lsb, width) is bit B + lsb of X whenever B < width,
so the compare-branch bit-test fusion can keep walking through a UBFX
the same way it walks shifts. Without this, combines that canonicalize
and/shift chains into G_UBFX split single tbz/tbnz instructions into
ubfx + tbnz pairs.
---
 .../GISel/AArch64InstructionSelector.cpp      |  22 ++++
 .../GlobalISel/opt-fold-ubfx-tbz-tbnz.mir     | 120 ++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ubfx-tbz-tbnz.mir

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index cf650fd5c4e72..852a45b38173f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1483,6 +1483,19 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
         C = VRegAndVal->Value.getSExtValue();
       break;
     }
+    case TargetOpcode::G_UBFX: {
+      // For G_UBFX, only walk through when the tested bit lies within the
+      // extracted field; bits at or above the width are known zero, which we
+      // don't handle here.
+      TestReg = MI->getOperand(1).getReg();
+      auto Lsb =
+          getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
+      auto Width =
+          getIConstantVRegValWithLookThrough(MI->getOperand(3).getReg(), MRI);
+      if (Lsb && Width && Width->Value.ugt(Bit))
+        C = Lsb->Value.getZExtValue();
+      break;
+    }
     }
 
     // Didn't find a constant or viable register. Bail out of the loop.
@@ -1524,6 +1537,15 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
         Bit = Bit + *C;
       }
       break;
+    case TargetOpcode::G_UBFX:
+      // (tbz (ubfx x, lsb, width), b) -> (tbz x, b+lsb) when b < width, since
+      // bit b of the extract is bit b+lsb of x. (b >= width was rejected when
+      // matching the constant above.)
+      if ((Bit + *C) < TestRegSize) {
+        NextReg = TestReg;
+        Bit = Bit + *C;
+      }
+      break;
     case TargetOpcode::G_XOR:
       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
       // appropriate.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ubfx-tbz-tbnz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ubfx-tbz-tbnz.mir
new file mode 100644
index 0000000000000..9ac86277dd814
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ubfx-tbz-tbnz.mir
@@ -0,0 +1,120 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Check folding a G_UBFX into a G_BRCOND which has been matched as a TB(N)Z.
+...
+---
+name:            fold_ubfx_in_field
+alignment:       4
+legalized:       true
+regBankSelected: true
+body:             |
+  ; CHECK-LABEL: name: fold_ubfx_in_field
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %copy:gpr64all = COPY $x0
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
+  ; CHECK-NEXT:   TBNZW [[COPY1]], 4, %bb.1
+  ; CHECK-NEXT:   B %bb.0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $x0
+
+    ; G_UBFX %x, lsb=3, width=4 — testing bit 1 of the extract.
+    ; Bit 1 is within the field (1 < 4), so fold: tbnz x, 1+3=4
+    %copy:gpr(s64) = COPY $x0
+    %lsb:gpr(s64) = G_CONSTANT i64 3
+    %width:gpr(s64) = G_CONSTANT i64 4
+    %ubfx:gpr(s64) = G_UBFX %copy, %lsb, %width
+
+    %bit:gpr(s64) = G_CONSTANT i64 2
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %ubfx, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
+    G_BRCOND %cmp, %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            dont_fold_bit_ge_width
+alignment:       4
+legalized:       true
+regBankSelected: true
+body:             |
+  ; CHECK-LABEL: name: dont_fold_bit_ge_width
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %copy:gpr64 = COPY $x0
+  ; CHECK-NEXT:   %ubfx:gpr64 = UBFMXri %copy, 3, 6
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32all = COPY %ubfx.sub_32
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
+  ; CHECK-NEXT:   TBNZW [[COPY1]], 5, %bb.1
+  ; CHECK-NEXT:   B %bb.0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $x0
+
+    ; G_UBFX %x, lsb=3, width=4 — testing bit 5 of the extract (>= width 4).
+    ; Walker must NOT fold; the UBFX survives in the output.
+    %copy:gpr(s64) = COPY $x0
+    %lsb:gpr(s64) = G_CONSTANT i64 3
+    %width:gpr(s64) = G_CONSTANT i64 4
+    %ubfx:gpr(s64) = G_UBFX %copy, %lsb, %width
+
+    %bit:gpr(s64) = G_CONSTANT i64 32
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %ubfx, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
+    G_BRCOND %cmp, %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            fold_ubfx_max_bit
+alignment:       4
+legalized:       true
+regBankSelected: true
+body:             |
+  ; CHECK-LABEL: name: fold_ubfx_max_bit
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %copy:gpr64 = COPY $x0
+  ; CHECK-NEXT:   TBNZX %copy, 59, %bb.1
+  ; CHECK-NEXT:   B %bb.0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $x0
+
+    ; G_UBFX %x:s64, lsb=40, width=20 — testing bit 19 of the extract.
+    ; Bit 19 is within the field (19 < 20), so fold: tbnz x, 19+40=59
+    %copy:gpr(s64) = COPY $x0
+    %lsb:gpr(s64) = G_CONSTANT i64 40
+    %width:gpr(s64) = G_CONSTANT i64 20
+    %ubfx:gpr(s64) = G_UBFX %copy, %lsb, %width
+
+    %bit:gpr(s64) = G_CONSTANT i64 524288
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %ubfx, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
+    G_BRCOND %cmp, %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR

>From 4ac950bc63c90aae70c677beca54d2aa6ea70b12 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Thu, 11 Jun 2026 17:13:08 -0700
Subject: [PATCH 3/4] [GlobalISel] Port SimplifyDemandedBits to GlobalISel

Add a demanded-bits simplifier to CombinerHelper, structured after
InstCombine's SimplifyDemandedBits and SelectionDAG's
TargetLowering::SimplifyDemandedBits:

* simplifyDemandedBits(MI, OpNo, DemandedBits, Known, Depth) mirrors
  InstCombine's entry point: it simplifies the use operand under the
  demanded mask and returns the known bits computed during the same
  recursive walk.
* A per-opcode switch handles G_AND/G_OR (constant elimination,
  RHS-known demand shrinking) and constant-amount G_SHL/G_LSHR/G_ASHR
  (demand transfer through the shift; ASHR becomes LSHR when the
  sign-fill bits are not demanded or the source is known non-negative),
  falling back to getKnownBits elsewhere, just as SDAG falls back to
  computeKnownBits.
* Mirroring SDAG's one-use clause, the walk only recurses into defs
  with a single non-debug use unless every bit is demanded; rewriting
  a shared def under partial demand would corrupt its other users.
* Single-use defs are rewritten or erased; multi-use defs only have
  the visited use rerouted (InstCombine's
  SimplifyMultipleUseDemandedBits behavior, folded into the same walk).
* New combine rule simplify_demanded_bits (wired into the AArch64
  post-legalizer combiner) with G_AND/G_OR/shift roots, plus
  narrow_trunc_shr_const (added to all_combines) for trunc-of-shift
  narrowing.
* applyCombineTruncOfShift uses the simplifier on the shift source so
  the narrowed shift does not inherit dead high-bit masking.

Deliberate deviations from SDAG, kept minimal:

* Multi-use look-through is fused into the one walk instead of a
  second SimplifyMultipleUseDemandedBits function - same behavior,
  less duplication.
* Only constant shift amounts transfer demand; variable amounts fall
  back conservatively. SDAG's variable-amount handling is left to a
  follow-up with its own soundness review.

GISelValueTracking is untouched. AArch64 codegen test updates are
mask-elimination improvements; AMDGPU sext_inreg improvements come
from the trunc-of-shift narrowing rule.

Tracks #150515.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  40 ++
 .../include/llvm/Target/GlobalISel/Combine.td |  37 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 356 ++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64Combine.td     |   3 +-
 .../GlobalISel/combine-narrow-trunc-shr.mir   | 205 ++++++++++
 .../combine-simplify-demanded-bits.mir        | 334 ++++++++++++++++
 .../combine-trunc-shift-demanded-and.mir      |  33 ++
 .../form-bitfield-extract-from-shr-and.mir    |   6 +-
 .../GlobalISel/split-wide-shifts-multiway.ll  |   2 +-
 llvm/test/CodeGen/AArch64/arm64-csel.ll       |   4 +-
 llvm/test/CodeGen/AArch64/arm64-srl-and.ll    |   7 +-
 llvm/test/CodeGen/AArch64/arm64-vhadd.ll      |   6 +-
 llvm/test/CodeGen/AArch64/bswap.ll            |  27 +-
 llvm/test/CodeGen/AArch64/combine-sdiv.ll     |  29 +-
 .../CodeGen/AArch64/hadd-combine-scalar.ll    |  21 +-
 llvm/test/CodeGen/AArch64/hadd-combine.ll     |   6 +-
 .../AArch64/signed-truncation-check.ll        |   4 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll |   9 +-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |  24 +-
 .../CodeGen/GlobalISel/KnownBitsTest.cpp      | 328 ++++++++++++++--
 20 files changed, 1373 insertions(+), 108 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-trunc-shr.mir
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc-shift-demanded-and.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index aa61310994a67..f3ac3b3bc7a35 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -39,6 +39,7 @@ class MachineRegisterInfo;
 class MachineInstr;
 class MachineOperand;
 class GISelValueTracking;
+struct KnownBits;
 class MachineDominatorTree;
 class LegalizerInfo;
 struct LegalityQuery;
@@ -1177,7 +1178,46 @@ class CombinerHelper {
   LLVM_ABI bool matchAVG(MachineInstr &MI, MachineRegisterInfo &MRI, Register X,
                          Register Y, unsigned TargetOpc) const;
 
+  /// Simplify operand \p OpNo of \p MI given that only \p DemandedBits of the
+  /// operand's value are observed, and return the known bits discovered
+  /// during the same recursive walk in \p Known. If the operand's defining
+  /// value has one non-debug use, the defining instruction may be rewritten
+  /// or erased. If the defining value has multiple uses, only this operand
+  /// use may be replaced.
+  LLVM_ABI bool simplifyDemandedBits(MachineInstr &MI, unsigned OpNo,
+                                     const APInt &DemandedBits,
+                                     KnownBits &Known,
+                                     unsigned Depth = 0) const;
+
+  /// Demand transfer for a shift by constant \p ShAmt: which source bits can
+  /// influence the demanded result bits. For G_ASHR the source sign bit is
+  /// demanded whenever any of the top ShAmt result bits (copies of it) are.
+  LLVM_ABI static APInt
+  getDemandedSrcBitsForShiftConst(unsigned Opcode, const APInt &DemandedBits,
+                                  unsigned ShAmt);
+
+  /// Match per-instruction demanded-bits simplification in the current combine
+  /// context. The matcher may simplify a use operand of \p MI; it does not
+  /// compute a union of demands from all users of \p MI's def.
+  LLVM_ABI bool matchSimplifyDemandedBits(MachineInstr &MI,
+                                          BuildFnTy &MatchInfo) const;
+
+  /// Match (G_TRUNC (G_LSHR/G_ASHR X, K-const)) when X's bits beyond DstBW
+  /// (zero for LSHR, sign-bit replicated for ASHR) are provably idle. Rewrites
+  /// to (G_LSHR/G_ASHR (G_TRUNC X), K-trunc), eliminating the outer trunc.
+  LLVM_ABI bool matchNarrowTruncShrConst(MachineInstr &MI,
+                                         BuildFnTy &MatchInfo) const;
+
 private:
+  /// Recursive demanded-bits walk on use operand \p OpNo of \p MI.
+  /// \p DoRewrite false = pure analysis (match phase): reports whether a
+  /// rewrite would happen without touching MIR. \p DoRewrite true = apply
+  /// rewrites at whatever depth they are found (single-use defs may be
+  /// erased; multi-use defs only have this use's operand rerouted).
+  bool simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
+                                const APInt &DemandedBits, KnownBits &Known,
+                                unsigned Depth, bool DoRewrite) const;
+
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f3474eb95c436..615adab1471d4 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1102,6 +1102,41 @@ def trunc_shift: GICombineRule <
   (apply [{ Helper.applyCombineTruncOfShift(*${root}, ${matchinfo}); }])
 >;
 
+// Demanded-bits-driven narrowing of trunc-of-right-shift with a constant
+// shift amount. Fires when X's bits beyond the destination width are
+// provably idle (known-zero for LSHR, sign-bit-replicated for ASHR), in
+// which case the outer truncate can be dropped:
+//
+//   (trunc (lshr X, K))  -> (lshr (trunc X), K)  when X[DstBW..K+DstBW) == 0
+//   (trunc (ashr X, K))  -> (ashr (trunc X), K)  when X has at least
+//                                                SrcBW-DstBW+1 sign bits.
+//
+// K must be a constant less than DstBW so the rewrite never recurses into
+// a re-truncated outer pattern -- guarantees fixpoint termination and
+// avoids the deadloop that a generic trunc-of-shift demand combine has
+// against existing rules like ``trunc_shift``.
+def narrow_trunc_shr_const : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_TRUNC):$root,
+    [{ return Helper.matchNarrowTruncShrConst(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+// Demand-driven elimination of redundant G_AND / disjoint G_OR by a constant,
+// and simplification of operands feeding G_SHL / G_LSHR / G_ASHR with a
+// constant amount, where the mask is a no-op on every source bit the shift
+// can observe (including the case where the demanded bits are entirely
+// outside the mask and the operand folds to the constant).
+// Invokes a demanded-bits operand simplifier in the current combine context;
+// it does not compute a union of demands from all users of the root def.
+// Deadloop-safe: match returns true only when the rewrite will make progress.
+// Wired into the AArch64 post-legalizer combiner; not in the generic groups so
+// other targets opt in explicitly.
+def simplify_demanded_bits : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_AND, G_OR, G_SHL, G_LSHR, G_ASHR):$root,
+         [{ return Helper.matchSimplifyDemandedBits(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
 // Transform (mul x, -1) -> (sub 0, x)
 def mul_by_neg_one: GICombineRule <
   (defs root:$dst),
@@ -2506,7 +2541,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     reassocs, ptr_add_immed_chain, cmp_combines,
     shl_ashr_to_sext_inreg, neg_and_one_to_sext_inreg, sext_inreg_of_load,
     width_reduction_combines, select_combines, select_zero_false, select_not,
-    known_bits_simplifications, trunc_shift,
+    known_bits_simplifications, trunc_shift, narrow_trunc_shr_const,
     not_cmp_fold, opt_brcond_by_inverting_cond,
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 88b68d7685c63..863be388e9361 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
@@ -2852,6 +2853,19 @@ void CombinerHelper::applyCombineTruncOfShift(
 
   Register ShiftAmt = ShiftMI->getOperand(2).getReg();
   Register ShiftSrc = ShiftMI->getOperand(1).getReg();
+
+  // Demanded-bits hook: the upcoming inner G_TRUNC consumes only the
+  // low NewShiftTy bits of ShiftSrc. If this use can see through a redundant
+  // mask/disjoint-or, simplify the shift operand before truncating so we don't
+  // synthesise dead high-bit work that DCE has to clean up later.
+  APInt LowMask =
+      APInt::getLowBitsSet(MRI.getType(ShiftSrc).getScalarSizeInBits(),
+                           NewShiftTy.getScalarSizeInBits());
+  KnownBits Known(MRI.getType(ShiftSrc).getScalarSizeInBits());
+  simplifyDemandedBits(*ShiftMI, /*OpNo=*/1, LowMask, Known);
+  // Re-read the operand: the demanded-bits simplifier may have rerouted it.
+  ShiftSrc = ShiftMI->getOperand(1).getReg();
+
   ShiftSrc = Builder.buildTrunc(NewShiftTy, ShiftSrc).getReg(0);
 
   Register NewShift =
@@ -8816,3 +8830,345 @@ bool CombinerHelper::matchAVG(MachineInstr &MI, MachineRegisterInfo &MRI,
   LLT XTy = MRI.getType(X);
   return XTy == MRI.getType(Y) && isLegal({TargetOpc, {XTy}});
 }
+
+static bool isRegUseOperand(const MachineInstr &MI, unsigned OpNo) {
+  return OpNo < MI.getNumOperands() && MI.getOperand(OpNo).isReg() &&
+         !MI.getOperand(OpNo).isDef();
+}
+
+APInt CombinerHelper::getDemandedSrcBitsForShiftConst(unsigned Opcode,
+                                                      const APInt &DemandedBits,
+                                                      unsigned ShAmt) {
+  unsigned BW = DemandedBits.getBitWidth();
+  assert(ShAmt < BW && "shift amount must be in range");
+  switch (Opcode) {
+  case TargetOpcode::G_SHL:
+    return DemandedBits.lshr(ShAmt);
+  case TargetOpcode::G_LSHR:
+    return DemandedBits.shl(ShAmt);
+  case TargetOpcode::G_ASHR: {
+    APInt Src = DemandedBits.shl(ShAmt);
+    // The top ShAmt result bits are copies of the source sign bit.
+    if (DemandedBits.intersects(APInt::getHighBitsSet(BW, ShAmt)))
+      Src.setSignBit();
+    return Src;
+  }
+  default:
+    llvm_unreachable("not a shift opcode");
+  }
+}
+
+bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
+                                              const APInt &DemandedBits,
+                                              KnownBits &Known, unsigned Depth,
+                                              bool DoRewrite) const {
+  Known = KnownBits(DemandedBits.getBitWidth());
+  if (!isRegUseOperand(MI, OpNo) || DemandedBits.isZero() || !VT)
+    return false;
+
+  Register OpReg = MI.getOperand(OpNo).getReg();
+  LLT OpTy = MRI.getType(OpReg);
+  if (!OpTy.isValid())
+    return false;
+  assert(DemandedBits.getBitWidth() == OpTy.getScalarSizeInBits() &&
+         "DemandedBits width must match the operand scalar type");
+
+  unsigned BW = DemandedBits.getBitWidth();
+  auto GiveUp = [&]() {
+    APInt DemandedElts = OpTy.isFixedVector()
+                             ? APInt::getAllOnes(OpTy.getNumElements())
+                             : APInt(1, 1);
+    Known = VT->getKnownBits(OpReg, DemandedElts, Depth);
+    return false;
+  };
+
+  if (Depth >= MaxAnalysisRecursionDepth)
+    return GiveUp();
+
+  MachineInstr *DefMI = OpReg.isVirtual() ? MRI.getVRegDef(OpReg) : nullptr;
+  if (!DefMI || DefMI->getNumExplicitDefs() != 1 ||
+      !DefMI->getOperand(0).isReg())
+    return GiveUp();
+
+  // Applies \p Repl for this use. Single-use def: full RAUW + erase.
+  // Multi-use def: reroute only this operand (the demand that justified the
+  // replacement covers only this use).
+  auto Rewrite = [&](Register Repl) {
+    if (!DoRewrite || Repl == OpReg)
+      return Repl != OpReg;
+    if (OpReg.isVirtual() && MRI.hasOneNonDBGUse(OpReg)) {
+      replaceRegWith(MRI, OpReg, Repl);
+      eraseInst(*DefMI);
+    } else {
+      replaceRegOpWith(MRI, MI.getOperand(OpNo), Repl);
+    }
+    return true;
+  };
+
+  // Descending into a def that has other users with a partial demand could
+  // rewrite it in ways those users observe (they demand bits we don't).
+  // Mirror SDAG: only recurse into single-use defs, unless every bit is
+  // demanded (then any derived deeper demand is intrinsic to the operators
+  // and value-preserving for all users).
+  bool CanRecurse = MRI.hasOneNonDBGUse(OpReg) || DemandedBits.isAllOnes();
+
+  unsigned Opcode = DefMI->getOpcode();
+  switch (Opcode) {
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR: {
+    if (!isRegUseOperand(*DefMI, 1) || !isRegUseOperand(*DefMI, 2))
+      return GiveUp();
+    Register Dst = DefMI->getOperand(0).getReg();
+    LLT DstTy = MRI.getType(Dst);
+    if (!DstTy.isValid() || BW != DstTy.getScalarSizeInBits())
+      return GiveUp();
+
+    Register LHS = DefMI->getOperand(1).getReg();
+    Register RHS = DefMI->getOperand(2).getReg();
+    auto SimplifyWithConst = [&](Register X,
+                                 Register CReg) -> std::optional<Register> {
+      if (MRI.getType(X) != DstTy || MRI.getType(CReg) != DstTy)
+        return std::nullopt;
+      std::optional<APInt> C = getConstantOrConstantSplatVector(CReg);
+      if (!C || C->getBitWidth() != BW)
+        return std::nullopt;
+      if (Opcode == TargetOpcode::G_AND) {
+        if (DemandedBits.isSubsetOf(*C))
+          return X;
+        if (DemandedBits.isSubsetOf(~*C))
+          return CReg;
+        return std::nullopt;
+      }
+      if (DemandedBits.isSubsetOf(~*C))
+        return X;
+      if (DemandedBits.isSubsetOf(*C))
+        return CReg;
+      return std::nullopt;
+    };
+
+    // Constant elimination on this node wins over recursing deeper: it
+    // replaces the whole def. Compute the replacement's known bits before
+    // rewriting (the rewrite may erase the def).
+    if (std::optional<Register> Repl = SimplifyWithConst(LHS, RHS)) {
+      Known = VT->getKnownBits(*Repl);
+      if (Rewrite(*Repl))
+        return true;
+    }
+    if (std::optional<Register> Repl = SimplifyWithConst(RHS, LHS)) {
+      Known = VT->getKnownBits(*Repl);
+      if (Rewrite(*Repl))
+        return true;
+    }
+
+    // Node-local constant elimination above is multi-use safe (Rewrite
+    // reroutes only this operand for multi-use defs), but recursing into the
+    // def's operands with a partial demand is not.
+    if (!CanRecurse)
+      return GiveUp();
+
+    KnownBits RHSKnown(BW);
+    bool Changed = simplifyDemandedBitsImpl(*DefMI, /*OpNo=*/2, DemandedBits,
+                                            RHSKnown, Depth + 1, DoRewrite);
+    APInt LHSDemand = DemandedBits;
+    if (Opcode == TargetOpcode::G_AND)
+      LHSDemand &= ~RHSKnown.Zero;
+    else
+      LHSDemand &= ~RHSKnown.One;
+    KnownBits LHSKnown(BW);
+    Changed |= simplifyDemandedBitsImpl(*DefMI, /*OpNo=*/1, LHSDemand, LHSKnown,
+                                        Depth + 1, DoRewrite);
+    Known = Opcode == TargetOpcode::G_AND ? LHSKnown & RHSKnown
+                                          : LHSKnown | RHSKnown;
+    return Changed;
+  }
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR: {
+    if (!isRegUseOperand(*DefMI, 1) || !isRegUseOperand(*DefMI, 2))
+      return GiveUp();
+    Register Dst = DefMI->getOperand(0).getReg();
+    LLT DstTy = MRI.getType(Dst);
+    if (!DstTy.isValid() || BW != DstTy.getScalarSizeInBits())
+      return GiveUp();
+    std::optional<APInt> Amt =
+        getConstantOrConstantSplatVector(DefMI->getOperand(2).getReg());
+    if (!Amt || Amt->uge(BW))
+      return GiveUp(); // Variable or out-of-range amount: SDAG-style bail.
+    unsigned ShAmt = Amt->getZExtValue();
+    if (ShAmt == 0)
+      return GiveUp();
+    // Bail entirely for a multi-use def under partial demand (SDAG does the
+    // same): both the recursion and the ASHR->LSHR rewrite below derive from
+    // a demand the other users do not share.
+    if (!CanRecurse)
+      return GiveUp();
+
+    APInt SrcDemand =
+        getDemandedSrcBitsForShiftConst(Opcode, DemandedBits, ShAmt);
+    KnownBits SrcKnown(BW);
+    bool Changed = simplifyDemandedBitsImpl(*DefMI, /*OpNo=*/1, SrcDemand,
+                                            SrcKnown, Depth + 1, DoRewrite);
+    KnownBits AmtKnown = KnownBits::makeConstant(APInt(BW, ShAmt));
+    switch (Opcode) {
+    case TargetOpcode::G_SHL:
+      Known = KnownBits::shl(SrcKnown, AmtKnown);
+      break;
+    case TargetOpcode::G_LSHR:
+      Known = KnownBits::lshr(SrcKnown, AmtKnown);
+      break;
+    case TargetOpcode::G_ASHR:
+      // If none of the sign-fill result bits [BW-ShAmt, BW) are demanded, or
+      // the sign bit is known zero, an unsigned shift computes the same
+      // demanded bits (SDAG's SRA-case rewrite).
+      if (DemandedBits.countLeadingZeros() >= ShAmt ||
+          SrcKnown.isNonNegative()) {
+        if (DoRewrite) {
+          Builder.setInstrAndDebugLoc(*DefMI);
+          auto Lshr = Builder.buildLShr(DstTy, DefMI->getOperand(1).getReg(),
+                                        DefMI->getOperand(2).getReg());
+          Rewrite(Lshr.getReg(0));
+          Known = KnownBits::lshr(SrcKnown, AmtKnown);
+          return true;
+        }
+        return true; // Dry run: a rewrite would happen.
+      }
+      Known = KnownBits::ashr(SrcKnown, AmtKnown);
+      break;
+    }
+    return Changed;
+  }
+  default:
+    return GiveUp();
+  }
+}
+
+bool CombinerHelper::simplifyDemandedBits(MachineInstr &MI, unsigned OpNo,
+                                          const APInt &DemandedBits,
+                                          KnownBits &Known,
+                                          unsigned Depth) const {
+  return simplifyDemandedBitsImpl(MI, OpNo, DemandedBits, Known, Depth,
+                                  /*DoRewrite=*/true);
+}
+
+bool CombinerHelper::matchSimplifyDemandedBits(MachineInstr &MI,
+                                               BuildFnTy &MatchInfo) const {
+  if (MI.getNumExplicitDefs() != 1 || !MI.getOperand(0).isReg() ||
+      !isRegUseOperand(MI, 1) || !isRegUseOperand(MI, 2))
+    return false;
+
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  if (!Ty.isValid() || Ty.isVector())
+    return false;
+
+  APInt RootDemand = APInt::getAllOnes(Ty.getScalarSizeInBits());
+  auto Probe = [&](unsigned OpNo, const APInt &OpDemand, KnownBits &Known) {
+    if (!simplifyDemandedBitsImpl(MI, OpNo, OpDemand, Known, /*Depth=*/0,
+                                  /*DoRewrite=*/false))
+      return false;
+
+    MatchInfo = [this, &MI, OpNo, OpDemand](MachineIRBuilder &) {
+      KnownBits K(OpDemand.getBitWidth());
+      simplifyDemandedBits(MI, OpNo, OpDemand, K);
+    };
+    return true;
+  };
+
+  unsigned Opcode = MI.getOpcode();
+  // Shift roots: operand 2 is the amount, not a data use; probe operand 1
+  // with demand transferred through the constant shift.
+  if (Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR ||
+      Opcode == TargetOpcode::G_ASHR) {
+    std::optional<APInt> Amt =
+        getConstantOrConstantSplatVector(MI.getOperand(2).getReg());
+    if (!Amt || Amt->uge(RootDemand.getBitWidth()) || Amt->isZero())
+      return false;
+    APInt SrcDemand = getDemandedSrcBitsForShiftConst(Opcode, RootDemand,
+                                                      Amt->getZExtValue());
+    KnownBits SrcKnown(RootDemand.getBitWidth());
+    return Probe(/*OpNo=*/1, SrcDemand, SrcKnown);
+  }
+
+  KnownBits RHSKnown(RootDemand.getBitWidth());
+  if (Probe(/*OpNo=*/2, RootDemand, RHSKnown))
+    return true;
+
+  APInt LHSDemand = RootDemand;
+  if (Opcode == TargetOpcode::G_AND)
+    LHSDemand &= ~RHSKnown.Zero;
+  else if (Opcode == TargetOpcode::G_OR)
+    LHSDemand &= ~RHSKnown.One;
+  else
+    return false;
+
+  KnownBits LHSKnown(RootDemand.getBitWidth());
+  return Probe(/*OpNo=*/1, LHSDemand, LHSKnown);
+}
+
+// (trunc (lshr X, K)) with bits [DstBW, DstBW+K) of X known-zero
+//   -> (lshr (trunc X), K)
+// (trunc (ashr X, K)) when X has at least (SrcBW - DstBW + 1) sign bits
+//   -> (ashr (trunc X), K)
+bool CombinerHelper::matchNarrowTruncShrConst(MachineInstr &MI,
+                                              BuildFnTy &MatchInfo) const {
+  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected G_TRUNC");
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  if (!MRI.hasOneNonDBGUse(Src))
+    return false;
+
+  MachineInstr *ShrMI = getDefIgnoringCopies(Src, MRI);
+  if (!ShrMI)
+    return false;
+  unsigned ShrOpc = ShrMI->getOpcode();
+  if (ShrOpc != TargetOpcode::G_LSHR && ShrOpc != TargetOpcode::G_ASHR)
+    return false;
+
+  Register X = ShrMI->getOperand(1).getReg();
+  Register AmtReg = ShrMI->getOperand(2).getReg();
+  LLT SrcTy = MRI.getType(X);
+  LLT DstTy = MRI.getType(Dst);
+  if (SrcTy.isVector() != DstTy.isVector())
+    return false;
+  if (SrcTy.isVector() && SrcTy.getElementCount() != DstTy.getElementCount())
+    return false;
+
+  unsigned SrcBW = SrcTy.getScalarSizeInBits();
+  unsigned DstBW = DstTy.getScalarSizeInBits();
+  if (DstBW >= SrcBW)
+    return false;
+
+  std::optional<APInt> K = getConstantOrConstantSplatVector(AmtReg);
+  if (!K)
+    return false;
+  if (K->uge(DstBW))
+    return false;
+  unsigned KVal = K->getZExtValue();
+  if (KVal + DstBW > SrcBW)
+    return false;
+
+  if (!VT)
+    return false;
+
+  if (ShrOpc == TargetOpcode::G_LSHR) {
+    KnownBits Known = VT->getKnownBits(X);
+    APInt HiZeroes = Known.Zero.extractBits(KVal, DstBW);
+    if (!HiZeroes.isAllOnes())
+      return false;
+  } else {
+    unsigned SignBits = VT->computeNumSignBits(X);
+    if (SignBits < SrcBW - DstBW + 1)
+      return false;
+  }
+
+  LLT AmtTy = getTargetLowering().getPreferredShiftAmountTy(DstTy);
+  if (!isLegalOrBeforeLegalizer({ShrOpc, {DstTy, AmtTy}}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    Register NarrowX = B.buildTrunc(DstTy, X).getReg(0);
+    Register NarrowAmt = B.buildConstant(AmtTy, KVal).getReg(0);
+    B.buildInstr(ShrOpc, {Dst}, {NarrowX, NarrowAmt});
+  };
+  return true;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index a9c447336cd5e..f2d12f46e0fca 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -397,5 +397,6 @@ def AArch64PostLegalizerCombiner
                         combine_mul_cmlt, combine_use_vector_truncate,
                         extmultomull, subaddmulreassoc, truncsat_combines,
                         lshr_of_trunc_of_lshr,
-                        funnel_shift_from_or_shift_constants_are_legal]> {
+                        funnel_shift_from_or_shift_constants_are_legal,
+                        simplify_demanded_bits]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-trunc-shr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-trunc-shr.mir
new file mode 100644
index 0000000000000..256e10d9b054c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-trunc-shr.mir
@@ -0,0 +1,205 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -aarch64prelegalizercombiner-only-enable-rule=narrow_trunc_shr_const -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK,ISO
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK,FULL
+#
+# Two runs share this file:
+#  - the isolated run enables only narrow_trunc_shr_const, so each test pins
+#    this rule (no trunc_shift / bitfield-extract / narrow_binop can confound
+#    the output) -- the ISO-prefixed checks.
+#  - the full run enables every prelegalizer combiner rule, so
+#    narrow_trunc_shr_const coexists with trunc_shift et al -- the FULL-prefixed
+#    checks. It doubles as an infinite-loop guard: a reintroduced trunc-of-shift
+#    ping-pong trips the combiner iteration limit / hangs rather than diffing.
+
+# narrow_trunc_shr_const: (trunc (lshr Y, K-const)) -> (lshr (trunc Y), K-const), with Y = (and X, low-32-mask)
+# AND proves bits [32..) zero; K=5; K+DstBW=37 <= SrcBW=64. The trunc sinks below the shift.
+---
+name:            narrow_trunc_lshr_const_lowmask
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; ISO-LABEL: name: narrow_trunc_lshr_const_lowmask
+    ; ISO: liveins: $x0
+    ; ISO-NEXT: {{  $}}
+    ; ISO-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; ISO-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; ISO-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; ISO-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; ISO-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; ISO-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C1]](s32)
+    ; ISO-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; ISO-NEXT: RET_ReallyLR implicit $w0
+    ;
+    ; FULL-LABEL: name: narrow_trunc_lshr_const_lowmask
+    ; FULL: liveins: $x0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; FULL-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; FULL-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 27
+    ; FULL-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[COPY]], [[C]](s64), [[C1]]
+    ; FULL-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[UBFX]](s64)
+    ; FULL-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_CONSTANT i64 4294967295
+    %2:_(s64) = G_AND %0, %1
+    %3:_(s64) = G_CONSTANT i64 5
+    %4:_(s64) = G_LSHR %2, %3
+    %5:_(s32) = G_TRUNC %4(s64)
+    $w0 = COPY %5(s32)
+    RET_ReallyLR implicit $w0
+...
+
+# Same as above but with zext-derived source: high bits provably zero.
+---
+name:            narrow_trunc_lshr_const_zext
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; ISO-LABEL: name: narrow_trunc_lshr_const_zext
+    ; ISO: liveins: $w0
+    ; ISO-NEXT: {{  $}}
+    ; ISO-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; ISO-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; ISO-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ZEXT]](s64)
+    ; ISO-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; ISO-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s32)
+    ; ISO-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; ISO-NEXT: RET_ReallyLR implicit $w0
+    ;
+    ; FULL-LABEL: name: narrow_trunc_lshr_const_zext
+    ; FULL: liveins: $w0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; FULL-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; FULL-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; FULL-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_ZEXT %0(s32)
+    %2:_(s64) = G_CONSTANT i64 7
+    %3:_(s64) = G_LSHR %1, %2
+    %4:_(s32) = G_TRUNC %3(s64)
+    $w0 = COPY %4(s32)
+    RET_ReallyLR implicit $w0
+...
+
+# ASHR positive: sext-derived source has 33 sign bits >= 64-32+1.
+---
+name:            narrow_trunc_ashr_const_sext
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; ISO-LABEL: name: narrow_trunc_ashr_const_sext
+    ; ISO: liveins: $w0
+    ; ISO-NEXT: {{  $}}
+    ; ISO-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; ISO-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32)
+    ; ISO-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT]](s64)
+    ; ISO-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; ISO-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[TRUNC]], [[C]](s32)
+    ; ISO-NEXT: $w0 = COPY [[ASHR]](s32)
+    ; ISO-NEXT: RET_ReallyLR implicit $w0
+    ;
+    ; FULL-LABEL: name: narrow_trunc_ashr_const_sext
+    ; FULL: liveins: $w0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; FULL-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; FULL-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32)
+    ; FULL-NEXT: $w0 = COPY [[ASHR]](s32)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_SEXT %0(s32)
+    %2:_(s64) = G_CONSTANT i64 11
+    %3:_(s64) = G_ASHR %1, %2
+    %4:_(s32) = G_TRUNC %3(s64)
+    $w0 = COPY %4(s32)
+    RET_ReallyLR implicit $w0
+...
+
+# Negative: unknown high bits. Must NOT rewrite trunc(lshr) into lshr(trunc).
+---
+name:            negative_unknown_high_bits
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: negative_unknown_high_bits
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_CONSTANT i64 5
+    %2:_(s64) = G_LSHR %0, %1
+    %3:_(s32) = G_TRUNC %2(s64)
+    $w0 = COPY %3(s32)
+    RET_ReallyLR implicit $w0
+...
+
+# Negative: K >= DstBW. Source is an unknown s64 (no constant fold can hide the
+# decision), so lshr+trunc must stay as-is; the known-bits gate also rejects it.
+---
+name:            negative_shift_geq_dstbw
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: negative_shift_geq_dstbw
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_CONSTANT i64 32
+    %2:_(s64) = G_LSHR %0, %1
+    %3:_(s32) = G_TRUNC %2(s64)
+    $w0 = COPY %3(s32)
+    RET_ReallyLR implicit $w0
+...
+
+# Negative: shift has multiple uses. The single-use guard blocks the rewrite.
+---
+name:            negative_multi_use_shift
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: negative_multi_use_shift
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[ZEXT]], [[C]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+    ; CHECK-NEXT: $x0 = COPY [[LSHR]](s64)
+    ; CHECK-NEXT: $w1 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0, implicit $w1
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_ZEXT %0(s32)
+    %2:_(s64) = G_CONSTANT i64 3
+    %3:_(s64) = G_LSHR %1, %2
+    %4:_(s32) = G_TRUNC %3(s64)
+    $x0 = COPY %3(s64)
+    $w1 = COPY %4(s32)
+    RET_ReallyLR implicit $x0, implicit $w1
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir
new file mode 100644
index 0000000000000..aea943716da9f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir
@@ -0,0 +1,334 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -run-pass=aarch64-postlegalizer-combiner -aarch64postlegalizercombiner-only-enable-rule=simplify_demanded_bits -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,RULE
+# RUN: llc -mtriple=aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,FULL
+---
+name:            drop_redundant_outer_mask
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: drop_redundant_outer_mask
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %lowmask:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: %and2:_(s32) = G_AND [[COPY]], %lowmask
+    ; CHECK-NEXT: $w0 = COPY %and2(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %0, %mask
+    %lowmask:_(s32) = G_CONSTANT i32 15
+    %and2:_(s32) = G_AND %and, %lowmask
+    $w0 = COPY %and2(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            keep_and_multiuse_high_bits
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; RULE-LABEL: name: keep_and_multiuse_high_bits
+    ; RULE: liveins: $x0
+    ; RULE-NEXT: {{  $}}
+    ; RULE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; RULE-NEXT: %c:_(s64) = G_CONSTANT i64 4294967295
+    ; RULE-NEXT: %and:_(s64) = G_AND [[COPY]], %c
+    ; RULE-NEXT: %lo:_(s32) = G_TRUNC %and(s64)
+    ; RULE-NEXT: %amt:_(s64) = G_CONSTANT i64 40
+    ; RULE-NEXT: %hi:_(s64) = G_LSHR %c, %amt(s64)
+    ; RULE-NEXT: $w0 = COPY %lo(s32)
+    ; RULE-NEXT: $x1 = COPY %hi(s64)
+    ; RULE-NEXT: RET_ReallyLR implicit $w0, implicit $x1
+    ;
+    ; FULL-LABEL: name: keep_and_multiuse_high_bits
+    ; FULL: liveins: $x0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; FULL-NEXT: %c:_(s64) = G_CONSTANT i64 4294967295
+    ; FULL-NEXT: %and:_(s64) = G_AND [[COPY]], %c
+    ; FULL-NEXT: %lo:_(s32) = G_TRUNC %and(s64)
+    ; FULL-NEXT: %hi:_(s64) = G_CONSTANT i64 0
+    ; FULL-NEXT: $w0 = COPY %lo(s32)
+    ; FULL-NEXT: $x1 = COPY %hi(s64)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0, implicit $x1
+    %0:_(s64) = COPY $x0
+    %c:_(s64) = G_CONSTANT i64 4294967295
+    %and:_(s64) = G_AND %0, %c
+    %lo:_(s32) = G_TRUNC %and(s64)
+    %amt:_(s64) = G_CONSTANT i64 40
+    %hi:_(s64) = G_LSHR %and, %amt(s64)
+    $w0 = COPY %lo(s32)
+    $x1 = COPY %hi(s64)
+    RET_ReallyLR implicit $w0, implicit $x1
+...
+---
+# (or X, C) feeding a use that only demands bits where C is zero: the constant
+# or-bits are entirely undemanded, so the disjoint G_OR is dropped and the use
+# reads X directly.
+name:            drop_disjoint_or_high_const
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: drop_disjoint_or_high_const
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %lowmask:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: %and:_(s32) = G_AND [[COPY]], %lowmask
+    ; CHECK-NEXT: $w0 = COPY %and(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %hc:_(s32) = G_CONSTANT i32 65280
+    %or:_(s32) = G_OR %0, %hc
+    %lowmask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %or, %lowmask
+    $w0 = COPY %and(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+# Negative: the demanded window overlaps the OR constant's set bits, so the OR
+# is observable and must be kept.
+name:            keep_or_demanded_const
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: keep_or_demanded_const
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %hc:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: %or:_(s32) = G_OR [[COPY]], %hc
+    ; CHECK-NEXT: %lowmask:_(s32) = G_CONSTANT i32 4095
+    ; CHECK-NEXT: %and:_(s32) = G_AND %or, %lowmask
+    ; CHECK-NEXT: $w0 = COPY %and(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %hc:_(s32) = G_CONSTANT i32 255
+    %or:_(s32) = G_OR %0, %hc
+    %lowmask:_(s32) = G_CONSTANT i32 4095
+    %and:_(s32) = G_AND %or, %lowmask
+    $w0 = COPY %and(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+# A shift root transfers its demand through the constant amount: shl-by-4
+# observes only source bits [0,28), exactly the bits the mask keeps, so the
+# single-use inner G_AND is redundant and must be erased.
+name:            shl_root_drops_inner_mask
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: shl_root_drops_inner_mask
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: %shl:_(s32) = G_SHL [[COPY]], %amt(s32)
+    ; CHECK-NEXT: $w0 = COPY %shl(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %mask:_(s32) = G_CONSTANT i32 268435455
+    %and:_(s32) = G_AND %0, %mask
+    %amt:_(s32) = G_CONSTANT i32 4
+    %shl:_(s32) = G_SHL %and, %amt
+    $w0 = COPY %shl(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+# AND root with demand propagated through shl-by-4: outer mask demands [0,12),
+# which translates to src demand [0,8). Inner mask 255 covers [0,8), so the
+# inner G_AND is redundant and gets dropped.
+name:            and_root_sees_through_shl
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; RULE-LABEL: name: and_root_sees_through_shl
+    ; RULE: liveins: $w0
+    ; RULE-NEXT: {{  $}}
+    ; RULE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; RULE-NEXT: %amt:_(s32) = G_CONSTANT i32 4
+    ; RULE-NEXT: %shl:_(s32) = G_SHL [[COPY]], %amt(s32)
+    ; RULE-NEXT: %lowmask:_(s32) = G_CONSTANT i32 4095
+    ; RULE-NEXT: %and2:_(s32) = G_AND %shl, %lowmask
+    ; RULE-NEXT: $w0 = COPY %and2(s32)
+    ; RULE-NEXT: RET_ReallyLR implicit $w0
+    ;
+    ; FULL-LABEL: name: and_root_sees_through_shl
+    ; FULL: liveins: $w0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; FULL-NEXT: %mask:_(s32) = G_CONSTANT i32 255
+    ; FULL-NEXT: %and:_(s32) = G_AND [[COPY]], %mask
+    ; FULL-NEXT: %amt:_(s32) = G_CONSTANT i32 4
+    ; FULL-NEXT: %shl:_(s32) = G_SHL %and, %amt(s32)
+    ; FULL-NEXT: $w0 = COPY %shl(s32)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %0, %mask
+    %amt:_(s32) = G_CONSTANT i32 4
+    %shl:_(s32) = G_SHL %and, %amt
+    %lowmask:_(s32) = G_CONSTANT i32 4095
+    %and2:_(s32) = G_AND %shl, %lowmask
+    $w0 = COPY %and2(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+# G_ASHR-by-8 under an AND that only demands bits [0,16). The sign-fill region
+# [24,32) is undemanded, so the arithmetic shift converts to a logical shift.
+name:            ashr_to_lshr_low_demand
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; RULE-LABEL: name: ashr_to_lshr_low_demand
+    ; RULE: liveins: $w0
+    ; RULE-NEXT: {{  $}}
+    ; RULE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; RULE-NEXT: %amt:_(s32) = G_CONSTANT i32 8
+    ; RULE-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], %amt(s32)
+    ; RULE-NEXT: %lowmask:_(s32) = G_CONSTANT i32 65535
+    ; RULE-NEXT: %and:_(s32) = G_AND [[LSHR]], %lowmask
+    ; RULE-NEXT: $w0 = COPY %and(s32)
+    ; RULE-NEXT: RET_ReallyLR implicit $w0
+    ;
+    ; FULL-LABEL: name: ashr_to_lshr_low_demand
+    ; FULL: liveins: $w0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; FULL-NEXT: %amt:_(s32) = G_CONSTANT i32 8
+    ; FULL-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; FULL-NEXT: %and:_(s32) = G_UBFX [[COPY]], %amt(s32), [[C]]
+    ; FULL-NEXT: $w0 = COPY %and(s32)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %amt:_(s32) = G_CONSTANT i32 8
+    %ashr:_(s32) = G_ASHR %0, %amt
+    %lowmask:_(s32) = G_CONSTANT i32 65535
+    %and:_(s32) = G_AND %ashr, %lowmask
+    $w0 = COPY %and(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+# G_ASHR is the root (all bits demanded). The sign-fill region is demanded, so
+# ASHR must survive. Guards against unsound unconditional conversion.
+name:            keep_ashr_sign_demanded
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: keep_ashr_sign_demanded
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: %ashr:_(s32) = G_ASHR [[COPY]], %amt(s32)
+    ; CHECK-NEXT: $w0 = COPY %ashr(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %amt:_(s32) = G_CONSTANT i32 8
+    %ashr:_(s32) = G_ASHR %0, %amt
+    $w0 = COPY %ashr(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+# %and is multi-use (feeds both the G_SHL and the G_ADD). The combiner walks
+# from %and2 through the single-use %shl, then reaches %and with partial
+# demand. Node-local const-elim on %and is allowed (reroutes only the shl's
+# operand), but the G_ADD still observes the masked value so %and must remain.
+name:            multiuse_shift_inner_mask_survives
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; RULE-LABEL: name: multiuse_shift_inner_mask_survives
+    ; RULE: liveins: $w0
+    ; RULE-NEXT: {{  $}}
+    ; RULE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; RULE-NEXT: %mask:_(s32) = G_CONSTANT i32 255
+    ; RULE-NEXT: %and:_(s32) = G_AND [[COPY]], %mask
+    ; RULE-NEXT: %amt:_(s32) = G_CONSTANT i32 4
+    ; RULE-NEXT: %shl:_(s32) = G_SHL [[COPY]], %amt(s32)
+    ; RULE-NEXT: %lowmask:_(s32) = G_CONSTANT i32 4095
+    ; RULE-NEXT: %and2:_(s32) = G_AND %shl, %lowmask
+    ; RULE-NEXT: %other:_(s32) = G_ADD %and, %and
+    ; RULE-NEXT: $w0 = COPY %and2(s32)
+    ; RULE-NEXT: $w1 = COPY %other(s32)
+    ; RULE-NEXT: RET_ReallyLR implicit $w0, implicit $w1
+    ;
+    ; FULL-LABEL: name: multiuse_shift_inner_mask_survives
+    ; FULL: liveins: $w0
+    ; FULL-NEXT: {{  $}}
+    ; FULL-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; FULL-NEXT: %mask:_(s32) = G_CONSTANT i32 255
+    ; FULL-NEXT: %and:_(s32) = G_AND [[COPY]], %mask
+    ; FULL-NEXT: %amt:_(s32) = G_CONSTANT i32 4
+    ; FULL-NEXT: %shl:_(s32) = G_SHL %and, %amt(s32)
+    ; FULL-NEXT: %other:_(s32) = G_ADD %and, %and
+    ; FULL-NEXT: $w0 = COPY %shl(s32)
+    ; FULL-NEXT: $w1 = COPY %other(s32)
+    ; FULL-NEXT: RET_ReallyLR implicit $w0, implicit $w1
+    %0:_(s32) = COPY $w0
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %0, %mask
+    %amt:_(s32) = G_CONSTANT i32 4
+    %shl:_(s32) = G_SHL %and, %amt
+    %lowmask:_(s32) = G_CONSTANT i32 4095
+    %and2:_(s32) = G_AND %shl, %lowmask
+    %other:_(s32) = G_ADD %and, %and
+    $w0 = COPY %and2(s32)
+    $w1 = COPY %other(s32)
+    RET_ReallyLR implicit $w0, implicit $w1
+...
+---
+# 0x00FF00FFFFFFFFFF = 71777218572845055. Bits set: [0,40) and [48,56).
+# G_LSHR-by-40 demands src bits [40,64). Within that window, C has bits [48,56)
+# set and [40,48)u[56,64) clear, so neither "demand isSubsetOf C" nor
+# "demand isSubsetOf ~C" holds. Additionally %and is multi-use (TRUNC + LSHR),
+# so partial-demand recursion is blocked. Nothing changes in either run (shared
+# CHECK prefix): strict negative gate for the multi-use guard.
+name:            keep_and_multiuse_straddling_mask
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: keep_and_multiuse_straddling_mask
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x1
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT i64 71777218572845055
+    ; CHECK-NEXT: %and:_(s64) = G_AND [[COPY]], %c
+    ; CHECK-NEXT: %lo:_(s32) = G_TRUNC %and(s64)
+    ; CHECK-NEXT: %amt:_(s64) = G_CONSTANT i64 40
+    ; CHECK-NEXT: %hi:_(s64) = G_LSHR %and, %amt(s64)
+    ; CHECK-NEXT: $w0 = COPY %lo(s32)
+    ; CHECK-NEXT: $x1 = COPY %hi(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0, implicit $x1
+    %0:_(s64) = COPY $x0
+    %ptr:_(p0) = COPY $x1
+    %c:_(s64) = G_CONSTANT i64 71777218572845055
+    %and:_(s64) = G_AND %0, %c
+    %lo:_(s32) = G_TRUNC %and(s64)
+    %amt:_(s64) = G_CONSTANT i64 40
+    %hi:_(s64) = G_LSHR %and, %amt(s64)
+    $w0 = COPY %lo(s32)
+    $x1 = COPY %hi(s64)
+    RET_ReallyLR implicit $w0, implicit $x1
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc-shift-demanded-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc-shift-demanded-and.mir
new file mode 100644
index 0000000000000..fbbcbbe055efa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc-shift-demanded-and.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -aarch64prelegalizercombiner-only-enable-rule=trunc_shift -verify-machineinstrs %s | FileCheck %s
+#
+# Isolates trunc_shift: only this rule is enabled, so narrow_binop / redundant-and
+# folds cannot drop the inner G_AND. applyCombineTruncOfShift's demanded-bits
+# hook is the ONLY thing that can eliminate it. With the hook, the inner AND
+# (mask = low 32 bits, a superset of the bits demanded after the trunc) is dropped
+# before the trunc is built; without it the AND would survive as (and (trunc X), -1).
+---
+name:            hook_drops_redundant_and_under_trunc_shl
+legalized: false
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: hook_drops_redundant_and_under_trunc_shl
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[SHL]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_CONSTANT i64 4294967295
+    %2:_(s64) = G_AND %0, %1
+    %3:_(s64) = G_CONSTANT i64 5
+    %4:_(s64) = G_SHL %2, %3
+    %5:_(s32) = G_TRUNC %4(s64)
+    $w0 = COPY %5(s32)
+    RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir
index c8c5b157f4244..3525a30512f72 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir
@@ -82,10 +82,8 @@ body:             |
   bb.0.entry:
     ; CHECK-LABEL: name: no_mask_extract_asr
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1073741824
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[AND]], [[C1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32)
     ; CHECK-NEXT: $w0 = COPY [[ASHR]](s32)
     %0:_(s32) = COPY $w0
     %1:_(s32) = G_CONSTANT i32 3221225472
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index d669c49cb019b..e8af59f24c6da 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -5464,7 +5464,7 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) {
 ; GISEL-NEXT:    extr x9, x10, x9, #1
 ; GISEL-NEXT:    extr x10, x11, x10, #1
 ; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    lsr x8, x13, #63
 ; GISEL-NEXT:    extr x11, x14, x11, #1
 ; GISEL-NEXT:    extr x9, x15, x14, #1
 ; GISEL-NEXT:    lsl x8, x8, #63
diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll
index ecdcbfe226e40..ef66a2827942b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-csel.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll
@@ -387,8 +387,7 @@ define i64 @foo18_overflow3(i1 %cmp) nounwind readnone optsize ssp {
 ; CHECK-GI-LABEL: foo18_overflow3:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    and x8, x0, #0x1
-; CHECK-GI-NEXT:    lsl x0, x8, #63
+; CHECK-GI-NEXT:    lsl x0, x0, #63
 ; CHECK-GI-NEXT:    ret
 entry:
   %. = select i1 %cmp, i64 -9223372036854775808, i64 0
@@ -407,7 +406,6 @@ define i64 @foo18_overflow4(i1 %cmp) nounwind readnone optsize ssp {
 ; CHECK-GI-LABEL: foo18_overflow4:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    eor w8, w0, #0x1
-; CHECK-GI-NEXT:    and x8, x8, #0x1
 ; CHECK-GI-NEXT:    lsl x0, x8, #63
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
index 330f27bd6c0cd..cbcb8bd303a96 100644
--- a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
@@ -28,10 +28,9 @@ define i32 @srl_and()  {
 ; CHECK-GI-NEXT:    eor w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-GI-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-NEXT:    and w9, w8, #0xffff
-; CHECK-GI-NEXT:    cmp w8, w9
-; CHECK-GI-NEXT:    cset w8, ne
-; CHECK-GI-NEXT:    and w0, w9, w8
+; CHECK-GI-NEXT:    cmp w8, w8, uxth
+; CHECK-GI-NEXT:    cset w9, ne
+; CHECK-GI-NEXT:    and w0, w8, w9
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load i16, ptr @g, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index 9034a39b0ac51..2913bde375dc3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1751,13 +1751,13 @@ define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) {
 ; CHECK-GI-NEXT:    movi.8h v4, #7
 ; CHECK-GI-NEXT:    movi.8h v5, #3
 ; CHECK-GI-NEXT:    and.16b v0, v0, v4
-; CHECK-GI-NEXT:    and.16b v2, v2, v5
 ; CHECK-GI-NEXT:    and.16b v1, v1, v4
+; CHECK-GI-NEXT:    and.16b v2, v2, v5
 ; CHECK-GI-NEXT:    and.16b v3, v3, v5
 ; CHECK-GI-NEXT:    add.8h v0, v0, v2
 ; CHECK-GI-NEXT:    add.8h v1, v1, v3
-; CHECK-GI-NEXT:    shrn.8b v0, v0, #1
-; CHECK-GI-NEXT:    shrn2.16b v0, v1, #1
+; CHECK-GI-NEXT:    uzp1.16b v0, v0, v1
+; CHECK-GI-NEXT:    ushr.16b v0, v0, #1
 ; CHECK-GI-NEXT:    ret
   %zextsrc1 = and <16 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = and <16 x i16> %src2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 8b297b77fe2f4..8674042500fe3 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -34,7 +34,6 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    rev w8, w0
 ; CHECK-GI-NEXT:    lsr w8, w8, #16
-; CHECK-GI-NEXT:    and x8, x8, #0xffff
 ; CHECK-GI-NEXT:    lsl x0, x8, #48
 ; CHECK-GI-NEXT:    ret
     %3 = call i16 @llvm.bswap.i16(i16 %a)
@@ -45,24 +44,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
 
 ; The zext here is optimised to an any_extend during isel..
 define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, w0
-; CHECK-SD-NEXT:    mov x0, xzr
-; CHECK-SD-NEXT:    rev w8, w8
-; CHECK-SD-NEXT:    lsr w8, w8, #16
-; CHECK-SD-NEXT:    lsl x1, x8, #48
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, w0
-; CHECK-GI-NEXT:    mov x0, xzr
-; CHECK-GI-NEXT:    rev w8, w8
-; CHECK-GI-NEXT:    lsr w8, w8, #16
-; CHECK-GI-NEXT:    and x8, x8, #0xffff
-; CHECK-GI-NEXT:    lsl x1, x8, #48
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: bswap_i16_to_i128_anyext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w8, w8, #16
+; CHECK-NEXT:    lsl x1, x8, #48
+; CHECK-NEXT:    ret
     %3 = call i16 @llvm.bswap.i16(i16 %a)
     %4 = zext i16 %3 to i128
     %5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index fbb33db3fb7a9..55696fb08b6ef 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1331,9 +1331,9 @@ define i8 @combine_i8_sdiv_pow2(i8 %x) {
 ;
 ; CHECK-GI-LABEL: combine_i8_sdiv_pow2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sbfx w8, w0, #7, #1
-; CHECK-GI-NEXT:    ubfx w8, w8, #4, #4
-; CHECK-GI-NEXT:    add w8, w0, w8
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    ubfx w8, w8, #7, #8
+; CHECK-GI-NEXT:    add w8, w0, w8, lsr #4
 ; CHECK-GI-NEXT:    sbfx w0, w8, #4, #4
 ; CHECK-GI-NEXT:    ret
   %1 = sdiv i8 %x, 16
@@ -1352,9 +1352,9 @@ define i8 @combine_i8_sdiv_negpow2(i8 %x) {
 ;
 ; CHECK-GI-LABEL: combine_i8_sdiv_negpow2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sbfx w8, w0, #7, #1
-; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
-; CHECK-GI-NEXT:    add w8, w0, w8
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    ubfx w8, w8, #7, #8
+; CHECK-GI-NEXT:    add w8, w0, w8, lsr #2
 ; CHECK-GI-NEXT:    sxtb w8, w8
 ; CHECK-GI-NEXT:    neg w0, w8, asr #6
 ; CHECK-GI-NEXT:    ret
@@ -1373,9 +1373,9 @@ define i16 @combine_i16_sdiv_pow2(i16 %x) {
 ;
 ; CHECK-GI-LABEL: combine_i16_sdiv_pow2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sbfx w8, w0, #15, #1
-; CHECK-GI-NEXT:    ubfx w8, w8, #12, #4
-; CHECK-GI-NEXT:    add w8, w0, w8
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    ubfx w8, w8, #15, #16
+; CHECK-GI-NEXT:    add w8, w0, w8, lsr #12
 ; CHECK-GI-NEXT:    sbfx w0, w8, #4, #12
 ; CHECK-GI-NEXT:    ret
   %1 = sdiv i16 %x, 16
@@ -1394,9 +1394,9 @@ define i16 @combine_i16_sdiv_negpow2(i16 %x) {
 ;
 ; CHECK-GI-LABEL: combine_i16_sdiv_negpow2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sbfx w8, w0, #15, #1
-; CHECK-GI-NEXT:    ubfx w8, w8, #8, #8
-; CHECK-GI-NEXT:    add w8, w0, w8
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    ubfx w8, w8, #15, #16
+; CHECK-GI-NEXT:    add w8, w0, w8, lsr #8
 ; CHECK-GI-NEXT:    sxth w8, w8
 ; CHECK-GI-NEXT:    neg w0, w8, asr #8
 ; CHECK-GI-NEXT:    ret
@@ -1523,9 +1523,8 @@ define i5 @combine_i5_sdiv_const100(i5 %x) {
 ; CHECK-GI-LABEL: combine_i5_sdiv_const100:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    sbfx w8, w0, #0, #5
-; CHECK-GI-NEXT:    asr w8, w8, #4
-; CHECK-GI-NEXT:    ubfx w8, w8, #3, #2
-; CHECK-GI-NEXT:    add w8, w0, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #4, #5
+; CHECK-GI-NEXT:    add w8, w0, w8, lsr #3
 ; CHECK-GI-NEXT:    sbfx w8, w8, #0, #5
 ; CHECK-GI-NEXT:    asr w0, w8, #2
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine-scalar.ll b/llvm/test/CodeGen/AArch64/hadd-combine-scalar.ll
index 2d54bb737ce9a..e8e2bdb5b42b9 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine-scalar.ll
@@ -57,12 +57,17 @@ define i32 @haddu_const_lhs(i32 %src1) {
 }
 
 define i32 @haddu_const_zero(i32 %src1) {
-; CHECK-LABEL: haddu_const_zero:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    lsr x0, x8, #1
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: haddu_const_zero:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, w0
+; CHECK-SD-NEXT:    lsr x0, x8, #1
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: haddu_const_zero:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    lsr w0, w0, #1
+; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext i32 %src1 to i64
   %add = add i64 0, %zextsrc1
   %resulti32 = lshr i64 %add, 1
@@ -179,9 +184,7 @@ define i32 @hadds_const_zero(i32 %src1) {
 ;
 ; CHECK-GI-LABEL: hadds_const_zero:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    sbfx x0, x0, #1, #31
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    asr w0, w0, #1
 ; CHECK-GI-NEXT:    ret
   %zextsrc1 = sext i32 %src1 to i64
   %add = add i64 0, %zextsrc1
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 450069cd27428..b99437b149651 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -72,9 +72,9 @@ define <8 x i16> @haddu_const_zero(<8 x i16> %src1) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-GI-NEXT:    uaddw v2.4s, v1.4s, v0.4h
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    shrn v0.4h, v2.4s, #1
-; CHECK-GI-NEXT:    shrn2 v0.8h, v1.4s, #1
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-GI-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-GI-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %add = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index fc01c6b2c5471..26adcbc38d82f 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -31,8 +31,8 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind {
 ; CHECK-GI-LABEL: shifts_eqcmp_i16_i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    lsl w8, w0, #8
-; CHECK-GI-NEXT:    sbfx w8, w8, #8, #8
-; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    sxth w8, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #8, #16
 ; CHECK-GI-NEXT:    cmp w8, w0, uxth
 ; CHECK-GI-NEXT:    cset w0, eq
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index f9ecae443d399..16d1a3944e67d 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -963,13 +963,12 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 ; CHECK-GI-NEXT:    bfi w11, w8, #1, #31
 ; CHECK-GI-NEXT:    and w8, w9, #0x1
 ; CHECK-GI-NEXT:    and w9, w10, #0x1
-; CHECK-GI-NEXT:    mov.s w10, v0[1]
+; CHECK-GI-NEXT:    fmov w10, s0
 ; CHECK-GI-NEXT:    orr w8, w11, w8, lsl #2
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    and w9, w9, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #4
-; CHECK-GI-NEXT:    and w9, w10, #0x1
+; CHECK-GI-NEXT:    mov.s w9, v0[1]
+; CHECK-GI-NEXT:    and w10, w10, #0x1
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #4
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #5
 ; CHECK-GI-NEXT:    and w8, w8, #0x3f
 ; CHECK-GI-NEXT:    strb w8, [sp, #15]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index ab7e11a78ed57..659193288ff6f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1329,11 +1329,10 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 10
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 10, v2
-; GFX6-NEXT:    v_ashr_i64 v[2:3], v[2:3], 22
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 10, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 22, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_i65_22:
@@ -1343,11 +1342,10 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX8-NEXT:    v_bfe_u32 v1, v1, 0, 10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 10, v2
-; GFX8-NEXT:    v_ashrrev_i64 v[2:3], 22, v[2:3]
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 10, v2
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 22, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_i65_22:
@@ -1357,10 +1355,9 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX9-NEXT:    v_bfe_u32 v1, v1, 0, 10
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 10, v1
-; GFX9-NEXT:    v_ashrrev_i64 v[2:3], 22, v[2:3]
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 22, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
@@ -1371,9 +1368,8 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX10PLUS-NEXT:    v_bfe_u32 v1, v1, 0, 10
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX10PLUS-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX10PLUS-NEXT:    v_lshl_or_b32 v1, v2, 10, v1
-; GFX10PLUS-NEXT:    v_ashrrev_i64 v[2:3], 22, v[2:3]
+; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v2, 22, v2
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i65 %value, 22
   %ashr = ashr i65 %shl, 22
@@ -1444,7 +1440,7 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
 ; GCN-NEXT:    s_lshl_b32 s5, s2, 14
 ; GCN-NEXT:    s_mov_b32 s4, 0
 ; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 18
+; GCN-NEXT:    s_ashr_i32 s2, s2, 18
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
@@ -1456,7 +1452,7 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
 ; GFX10PLUS-NEXT:    s_mov_b32 s4, 0
 ; GFX10PLUS-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
 ; GFX10PLUS-NEXT:    s_lshl_b32 s5, s2, 14
-; GFX10PLUS-NEXT:    s_ashr_i64 s[2:3], s[2:3], 18
+; GFX10PLUS-NEXT:    s_ashr_i32 s2, s2, 18
 ; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i65 %value, 18
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index 8563d7f1f15c9..b288f333318c1 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "GISelMITest.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 
@@ -1023,20 +1024,23 @@ TEST_F(AArch64GISelMITest, TestNumSignBitsCmp) {
 }
 
 TEST_F(AMDGPUGISelMITest, TestNumSignBitsTrunc) {
-  StringRef MIRString =
-    "  %3:_(<4 x s32>) = G_IMPLICIT_DEF\n"
-    "  %4:_(s32) = G_IMPLICIT_DEF\n"
-    "  %5:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE %3, %4, %4, %4, 0, 0, 0 :: (load (s8))\n"
-    "  %6:_(s32) = COPY %5\n"
+  StringRef MIRString = "  %3:_(<4 x s32>) = G_IMPLICIT_DEF\n"
+                        "  %4:_(s32) = G_IMPLICIT_DEF\n"
+                        "  %5:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE %3, %4, %4, "
+                        "%4, 0, 0, 0 :: (load (s8))\n"
+                        "  %6:_(s32) = COPY %5\n"
 
-    "  %7:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE %3, %4, %4, %4, 0, 0, 0 :: (load (s8))\n"
-    "  %8:_(s32) = COPY %7\n"
+                        "  %7:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE %3, %4, %4, "
+                        "%4, 0, 0, 0 :: (load (s8))\n"
+                        "  %8:_(s32) = COPY %7\n"
 
-    "  %9:_(s32) = G_AMDGPU_BUFFER_LOAD_USHORT %3, %4, %4, %4, 0, 0, 0 :: (load (s16))\n"
-    "  %10:_(s32) = COPY %9\n"
+                        "  %9:_(s32) = G_AMDGPU_BUFFER_LOAD_USHORT %3, %4, %4, "
+                        "%4, 0, 0, 0 :: (load (s16))\n"
+                        "  %10:_(s32) = COPY %9\n"
 
-    "  %11:_(s32) = G_AMDGPU_BUFFER_LOAD_SSHORT %3, %4, %4, %4, 0, 0, 0 :: (load (s16))\n"
-    "  %12:_(s32) = COPY %11\n";
+                        "  %11:_(s32) = G_AMDGPU_BUFFER_LOAD_SSHORT %3, %4, "
+                        "%4, %4, 0, 0, 0 :: (load (s16))\n"
+                        "  %12:_(s32) = COPY %11\n";
 
   setUp(MIRString);
   if (!TM)
@@ -1057,16 +1061,16 @@ TEST_F(AMDGPUGISelMITest, TestNumSignBitsTrunc) {
 
 TEST_F(AMDGPUGISelMITest, TestTargetKnownAlign) {
   StringRef MIRString =
-    "  %5:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.dispatch.ptr)\n"
-    "  %6:_(p4) = COPY %5\n"
-    "  %7:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.queue.ptr)\n"
-    "  %8:_(p4) = COPY %7\n"
-    "  %9:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)\n"
-    "  %10:_(p4) = COPY %9\n"
-    "  %11:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.implicitarg.ptr)\n"
-    "  %12:_(p4) = COPY %11\n"
-    "  %13:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.implicit.buffer.ptr)\n"
-    "  %14:_(p4) = COPY %13\n";
+      "  %5:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.dispatch.ptr)\n"
+      "  %6:_(p4) = COPY %5\n"
+      "  %7:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.queue.ptr)\n"
+      "  %8:_(p4) = COPY %7\n"
+      "  %9:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)\n"
+      "  %10:_(p4) = COPY %9\n"
+      "  %11:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.implicitarg.ptr)\n"
+      "  %12:_(p4) = COPY %11\n"
+      "  %13:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.implicit.buffer.ptr)\n"
+      "  %14:_(p4) = COPY %13\n";
 
   setUp(MIRString);
   if (!TM)
@@ -1517,7 +1521,8 @@ TEST_F(AArch64GISelMITest, TestKnownBitsUnmergeValues) {
 
     uint16_t PartTestVal = static_cast<uint16_t>(TestVal >> BitOffset);
     EXPECT_EQ(PartTestVal, PartKnown.One.getZExtValue());
-    EXPECT_EQ(static_cast<uint16_t>(~PartTestVal), PartKnown.Zero.getZExtValue());
+    EXPECT_EQ(static_cast<uint16_t>(~PartTestVal),
+              PartKnown.Zero.getZExtValue());
   }
 }
 
@@ -1763,7 +1768,6 @@ TEST_F(AArch64GISelMITest, TestInvalidQueries) {
   KnownBits EqSizeRes = Info.getKnownBits(EqSizedShl);
   KnownBits BiggerSizeRes = Info.getKnownBits(BiggerSizedShl);
 
-
   // Result can be anything, but we should not crash.
   EXPECT_TRUE(EqSizeRes.One.isZero());
   EXPECT_TRUE(EqSizeRes.Zero.isAllOnes());
@@ -2119,7 +2123,8 @@ TEST_F(AMDGPUGISelMITest, TestKnownBitsAssertAlign) {
     EXPECT_EQ(64u, Res.getBitWidth());
     EXPECT_EQ(NumBits - 1, Res.Zero.countr_one());
     EXPECT_EQ(64u, Res.One.countr_zero());
-    EXPECT_EQ(Align(1ull << (NumBits - 1)), Info.computeKnownAlignment(Copies[Idx]));
+    EXPECT_EQ(Align(1ull << (NumBits - 1)),
+              Info.computeKnownAlignment(Copies[Idx]));
   };
 
   const unsigned NumSetupCopies = 5;
@@ -2150,3 +2155,278 @@ TEST_F(AArch64GISelMITest, TestKnownBitsUADDO) {
   EXPECT_EQ(0u, Res.One.getZExtValue());
   EXPECT_EQ(31u, Res.Zero.countl_one());
 }
+
+namespace {
+
+MachineInstr *findOpcode(MachineFunction &MF, unsigned Opcode,
+                         unsigned Index = 0) {
+  // Return the Index-th (0-based) instruction with \p Opcode, or nullptr.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != Opcode)
+        continue;
+      if (Index-- == 0)
+        return &MI;
+    }
+  }
+  return nullptr;
+}
+
+KnownBits simplifyDemandedBitsOperand(MachineFunction &MF,
+                                      MachineRegisterInfo &MRI,
+                                      MachineIRBuilder &B, MachineInstr &Use,
+                                      const APInt &Demand) {
+  GISelValueTracking VT(MF);
+  CombinerHelper Helper(VT, B, /*IsPreLegalize=*/false, &VT);
+  KnownBits Known(Demand.getBitWidth());
+  EXPECT_TRUE(Helper.simplifyDemandedBits(Use, /*OpNo=*/1, Demand, Known));
+  return Known;
+}
+
+} // namespace
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsAndSingleUse) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %x, %mask
+    %lowmask:_(s32) = G_CONSTANT i32 15
+    %use:_(s32) = G_AND %and, %lowmask
+    %out:_(s32) = COPY %use
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  MachineInstr *Producer = findOpcode(*MF, TargetOpcode::G_AND);
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
+  ASSERT_NE(Producer, nullptr);
+  ASSERT_NE(Use, nullptr);
+
+  Register ProducerReg = Producer->getOperand(0).getReg();
+  Register XReg = Producer->getOperand(1).getReg();
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0x0F));
+  EXPECT_EQ(Use->getOperand(1).getReg(), XReg);
+  EXPECT_TRUE(MRI->use_nodbg_empty(ProducerReg));
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsAndMultiUse) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %x, %mask
+    %lowmask:_(s32) = G_CONSTANT i32 15
+    %use:_(s32) = G_AND %and, %lowmask
+    %amt:_(s32) = G_CONSTANT i32 8
+    %side:_(s32) = G_LSHR %and, %amt
+    %out:_(s32) = COPY %use
+    %side_out:_(s32) = COPY %side
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  MachineInstr *Producer = findOpcode(*MF, TargetOpcode::G_AND);
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
+  MachineInstr *Side = findOpcode(*MF, TargetOpcode::G_LSHR);
+  ASSERT_NE(Producer, nullptr);
+  ASSERT_NE(Use, nullptr);
+  ASSERT_NE(Side, nullptr);
+
+  Register ProducerReg = Producer->getOperand(0).getReg();
+  Register XReg = Producer->getOperand(1).getReg();
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0x0F));
+  EXPECT_EQ(Use->getOperand(1).getReg(), XReg);
+  EXPECT_EQ(Side->getOperand(1).getReg(), ProducerReg);
+  EXPECT_EQ(MRI->getVRegDef(ProducerReg)->getOpcode(), TargetOpcode::G_AND);
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsOrSingleUse) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %high:_(s32) = G_CONSTANT i32 65280
+    %or:_(s32) = G_OR %x, %high
+    %lowmask:_(s32) = G_CONSTANT i32 255
+    %use:_(s32) = G_AND %or, %lowmask
+    %out:_(s32) = COPY %use
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  MachineInstr *Producer = findOpcode(*MF, TargetOpcode::G_OR);
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND);
+  ASSERT_NE(Producer, nullptr);
+  ASSERT_NE(Use, nullptr);
+
+  Register ProducerReg = Producer->getOperand(0).getReg();
+  Register XReg = Producer->getOperand(1).getReg();
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0xFF));
+  EXPECT_EQ(Use->getOperand(1).getReg(), XReg);
+  EXPECT_TRUE(MRI->use_nodbg_empty(ProducerReg));
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsOrMultiUse) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %high:_(s32) = G_CONSTANT i32 65280
+    %or:_(s32) = G_OR %x, %high
+    %lowmask:_(s32) = G_CONSTANT i32 255
+    %use:_(s32) = G_AND %or, %lowmask
+    %amt:_(s32) = G_CONSTANT i32 8
+    %side:_(s32) = G_LSHR %or, %amt
+    %out:_(s32) = COPY %use
+    %side_out:_(s32) = COPY %side
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  MachineInstr *Producer = findOpcode(*MF, TargetOpcode::G_OR);
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND);
+  MachineInstr *Side = findOpcode(*MF, TargetOpcode::G_LSHR);
+  ASSERT_NE(Producer, nullptr);
+  ASSERT_NE(Use, nullptr);
+  ASSERT_NE(Side, nullptr);
+
+  Register ProducerReg = Producer->getOperand(0).getReg();
+  Register XReg = Producer->getOperand(1).getReg();
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0xFF));
+  EXPECT_EQ(Use->getOperand(1).getReg(), XReg);
+  EXPECT_EQ(Side->getOperand(1).getReg(), ProducerReg);
+  EXPECT_EQ(MRI->getVRegDef(ProducerReg)->getOpcode(), TargetOpcode::G_OR);
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsOrConstantExplainsDemand) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %low:_(s32) = G_CONSTANT i32 255
+    %or:_(s32) = G_OR %x, %low
+    %usemask:_(s32) = G_CONSTANT i32 15
+    %use:_(s32) = G_AND %or, %usemask
+    %out:_(s32) = COPY %use
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  MachineInstr *Producer = findOpcode(*MF, TargetOpcode::G_OR);
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND);
+  ASSERT_NE(Producer, nullptr);
+  ASSERT_NE(Use, nullptr);
+
+  Register ProducerReg = Producer->getOperand(0).getReg();
+  Register LowCstReg = Producer->getOperand(2).getReg();
+  KnownBits Known =
+      simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0x0F));
+  EXPECT_EQ(Use->getOperand(1).getReg(), LowCstReg);
+  EXPECT_TRUE(MRI->use_nodbg_empty(ProducerReg));
+  EXPECT_TRUE(APInt(32, 0x0F).isSubsetOf(Known.One));
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsThroughShl) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %x, %mask
+    %amt:_(s32) = G_CONSTANT i32 4
+    %shl:_(s32) = G_SHL %and, %amt
+    %lowmask:_(s32) = G_CONSTANT i32 4095
+    %use:_(s32) = G_AND %shl, %lowmask
+    %out:_(s32) = COPY %use
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  // Demand on %shl is bits [0,12); through shl-by-4 the source demand is
+  // bits [0,8), fully covered by %mask=255 -> the inner G_AND must die.
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
+  MachineInstr *Inner = findOpcode(*MF, TargetOpcode::G_AND);
+  ASSERT_NE(Use, nullptr);
+  ASSERT_NE(Inner, nullptr);
+
+  Register InnerDst = Inner->getOperand(0).getReg();
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0xFFF));
+  EXPECT_TRUE(MRI->use_nodbg_empty(InnerDst));
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsAshrToLshr) {
+  StringRef MIRString = R"(
+    %x:_(s32) = G_TRUNC %0
+    %amt:_(s32) = G_CONSTANT i32 8
+    %ashr:_(s32) = G_ASHR %x, %amt
+    %lowmask:_(s32) = G_CONSTANT i32 65535
+    %use:_(s32) = G_AND %ashr, %lowmask
+    %out:_(s32) = COPY %use
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  // Demand bits [0,16); ashr-by-8 sign-fill occupies result bits [24,32);
+  // none demanded -> convert to G_LSHR.
+  MachineInstr *Use = findOpcode(*MF, TargetOpcode::G_AND);
+  ASSERT_NE(Use, nullptr);
+
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Use, APInt(32, 0xFFFF));
+  EXPECT_EQ(findOpcode(*MF, TargetOpcode::G_ASHR), nullptr);
+  EXPECT_NE(findOpcode(*MF, TargetOpcode::G_LSHR), nullptr);
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsMultiUseDefNoDescend) {
+  StringRef MIRString = R"(
+   %z:_(s32) = G_TRUNC %0
+   %mask:_(s32) = G_CONSTANT i32 16776960
+   %y:_(s32) = G_AND %z, %mask
+   %amt:_(s32) = G_CONSTANT i32 8
+   %s:_(s32) = G_LSHR %y, %amt
+   %lowmask:_(s32) = G_CONSTANT i32 255
+   %root:_(s32) = G_AND %s, %lowmask
+   %side:_(s32) = COPY %s
+   %out:_(s32) = COPY %root
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  // %s has two users (%root's G_AND and %side's COPY, which demands all
+  // bits). Walking from %root with partial demand 0xFF must NOT descend
+  // through the multi-use %s and erase %y's mask (0xFFFF00) -- the COPY
+  // observes bits the root does not demand.
+  MachineInstr *Root = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
+  MachineInstr *Inner = findOpcode(*MF, TargetOpcode::G_AND);
+  ASSERT_NE(Root, nullptr);
+  ASSERT_NE(Inner, nullptr);
+
+  Register InnerDst = Inner->getOperand(0).getReg();
+  GISelValueTracking VT(*MF);
+  CombinerHelper Helper(VT, B, /*IsPreLegalize=*/false, &VT);
+  KnownBits Known(32);
+  Helper.simplifyDemandedBits(*Root, /*OpNo=*/1, APInt(32, 0xFF), Known);
+  // The inner mask must survive: %s is multi-use and the demand is partial.
+  EXPECT_FALSE(MRI->use_nodbg_empty(InnerDst));
+}
+
+TEST(GISelShiftDemand, DemandedSrcBitsForShiftConst) {
+  // SHL by 4: result bits [4,8) come from src bits [0,4).
+  EXPECT_EQ(CombinerHelper::getDemandedSrcBitsForShiftConst(TargetOpcode::G_SHL,
+                                                            APInt(8, 0xF0), 4),
+            APInt(8, 0x0F));
+  // LSHR by 4: result bits [0,4) come from src bits [4,8).
+  EXPECT_EQ(CombinerHelper::getDemandedSrcBitsForShiftConst(
+                TargetOpcode::G_LSHR, APInt(8, 0x0F), 4),
+            APInt(8, 0xF0));
+  // ASHR by 4, only low result bits demanded: like LSHR, no sign-bit demand.
+  EXPECT_EQ(CombinerHelper::getDemandedSrcBitsForShiftConst(
+                TargetOpcode::G_ASHR, APInt(8, 0x0F), 4),
+            APInt(8, 0xF0));
+  // ASHR by 4, result bit 6 demanded (sign-fill territory): src sign bit only.
+  EXPECT_EQ(CombinerHelper::getDemandedSrcBitsForShiftConst(
+                TargetOpcode::G_ASHR, APInt(8, 0x40), 4),
+            APInt(8, 0x80));
+  // Shift by 0 is the identity transfer; only G_ASHR is exercised here.
+  EXPECT_EQ(CombinerHelper::getDemandedSrcBitsForShiftConst(
+                TargetOpcode::G_ASHR, APInt(8, 0xA5), 0),
+            APInt(8, 0xA5));
+}

>From feffd98c73af518c8ccb609a97ff3957b0f5a975 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 12 Jun 2026 04:54:25 -0700
Subject: [PATCH 4/4] [GlobalISel] Match SDAG's multi-use handling in the
 demanded-bits walk

Split the multi-use handling into SDAG's exact shape:

* simplifyMultipleUseDemandedBits is a pure look-through (GlobalISel
  counterpart of SDAG's SimplifyMultipleUseDemandedBits): it returns an
  existing register that agrees with the def on every demanded bit.
  Because it never rewrites anything, it may walk through defs with
  multiple uses; the caller reroutes only its own operand.
* When the look-through finds nothing, the walk relaxes the demand to
  all bits at a multi-use def and keeps going, as SDAG does, instead of
  giving up: every rewrite below an all-bits demand is value-preserving
  for all users of the shared def.

This recovers simplifications the previous one-use gate forfeited, e.g.
rerouting through chains of shared masks and erasing redundant masks
below shared shifts.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   9 ++
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 103 +++++++++++++-----
 .../combine-simplify-demanded-bits.mir        |  69 ++++++++++++
 .../CodeGen/GlobalISel/KnownBitsTest.cpp      |  85 ++++++++++++++-
 4 files changed, 236 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index f3ac3b3bc7a35..ff1f2745a3dad 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -1189,6 +1189,15 @@ class CombinerHelper {
                                      KnownBits &Known,
                                      unsigned Depth = 0) const;
 
+  /// Look through \p R's def for an existing register that computes the same
+  /// \p DemandedBits more cheaply, without modifying any instruction
+  /// (GlobalISel counterpart of SDAG's SimplifyMultipleUseDemandedBits).
+  /// Because nothing is rewritten, this may walk through defs with multiple
+  /// uses. Returns the empty Register when no simpler value exists.
+  LLVM_ABI Register simplifyMultipleUseDemandedBits(Register R,
+                                                    const APInt &DemandedBits,
+                                                    unsigned Depth = 0) const;
+
   /// Demand transfer for a shift by constant \p ShAmt: which source bits can
   /// influence the demanded result bits. For G_ASHR the source sign bit is
   /// demanded whenever any of the top ShAmt result bits (copies of it) are.
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 863be388e9361..a567f731b7bae 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -8858,6 +8858,59 @@ APInt CombinerHelper::getDemandedSrcBitsForShiftConst(unsigned Opcode,
   }
 }
 
+Register CombinerHelper::simplifyMultipleUseDemandedBits(
+    Register R, const APInt &DemandedBits, unsigned Depth) const {
+  // Bail out on physical registers, empty demand, or exhausted depth.
+  if (!R.isVirtual() || DemandedBits.isZero() ||
+      Depth >= MaxAnalysisRecursionDepth)
+    return Register();
+
+  const LLT Ty = MRI.getType(R);
+  if (!Ty.isValid())
+    return Register();
+  assert(DemandedBits.getBitWidth() == Ty.getScalarSizeInBits() &&
+         "DemandedBits width must match the register scalar type");
+
+  // Only a single-def G_AND / G_OR of two same-typed registers is handled.
+  MachineInstr *DefMI = MRI.getVRegDef(R);
+  if (!DefMI || DefMI->getNumExplicitDefs() != 1 ||
+      !isRegUseOperand(*DefMI, 1) || !isRegUseOperand(*DefMI, 2))
+    return Register();
+  const unsigned Opcode = DefMI->getOpcode();
+  const bool IsAnd = Opcode == TargetOpcode::G_AND;
+  if (!IsAnd && Opcode != TargetOpcode::G_OR)
+    return Register();
+
+  const Register LHS = DefMI->getOperand(1).getReg();
+  const Register RHS = DefMI->getOperand(2).getReg();
+  if (MRI.getType(LHS) != Ty || MRI.getType(RHS) != Ty)
+    return Register();
+
+  // Try to bypass the def, treating CReg as the constant (or splat) operand.
+  auto LookThrough = [&](Register X, Register CReg) -> Register {
+    std::optional<APInt> C = getConstantOrConstantSplatVector(CReg);
+    if (!C || C->getBitWidth() != DemandedBits.getBitWidth())
+      return Register();
+    // Bits on which the def equals X: C's ones for G_AND, C's zeros for G_OR.
+    // On all remaining bits the def equals the constant itself.
+    const APInt FromX = IsAnd ? *C : ~*C;
+    if (DemandedBits.isSubsetOf(FromX)) {
+      // X agrees with the def on every demanded bit; keep looking through.
+      Register Deeper =
+          simplifyMultipleUseDemandedBits(X, DemandedBits, Depth + 1);
+      return Deeper.isValid() ? Deeper : X;
+    }
+    if (DemandedBits.isSubsetOf(~FromX))
+      return CReg;
+    return Register();
+  };
+
+  // Try the constant on the RHS first, then on the LHS.
+  if (Register Repl = LookThrough(LHS, RHS))
+    return Repl;
+  return LookThrough(RHS, LHS);
+}
+
 bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
                                               const APInt &DemandedBits,
                                               KnownBits &Known, unsigned Depth,
@@ -8905,12 +8958,21 @@ bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
     return true;
   };
 
-  // Descending into a def that has other users with a partial demand could
-  // rewrite it in ways those users observe (they demand bits we don't).
-  // Mirror SDAG: only recurse into single-use defs, unless every bit is
-  // demanded (then any derived deeper demand is intrinsic to the operators
-  // and value-preserving for all users).
-  bool CanRecurse = MRI.hasOneNonDBGUse(OpReg) || DemandedBits.isAllOnes();
+  // Multi-use defs cannot be rewritten under a partial demand (other users
+  // observe bits we do not). First try a pure look-through to an existing
+  // register that agrees on the demanded bits and reroute only this use;
+  // otherwise relax the demand to all bits, as SDAG does, so that any
+  // rewrite below remains value-preserving for every user of this def.
+  APInt Demanded = DemandedBits;
+  if (!MRI.hasOneNonDBGUse(OpReg) && !Demanded.isAllOnes()) {
+    if (Register Repl =
+            simplifyMultipleUseDemandedBits(OpReg, Demanded, Depth)) {
+      Known = VT->getKnownBits(Repl);
+      if (Rewrite(Repl))
+        return true;
+    }
+    Demanded = APInt::getAllOnes(BW);
+  }
 
   unsigned Opcode = DefMI->getOpcode();
   switch (Opcode) {
@@ -8933,15 +8995,15 @@ bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
       if (!C || C->getBitWidth() != BW)
         return std::nullopt;
       if (Opcode == TargetOpcode::G_AND) {
-        if (DemandedBits.isSubsetOf(*C))
+        if (Demanded.isSubsetOf(*C))
           return X;
-        if (DemandedBits.isSubsetOf(~*C))
+        if (Demanded.isSubsetOf(~*C))
           return CReg;
         return std::nullopt;
       }
-      if (DemandedBits.isSubsetOf(~*C))
+      if (Demanded.isSubsetOf(~*C))
         return X;
-      if (DemandedBits.isSubsetOf(*C))
+      if (Demanded.isSubsetOf(*C))
         return CReg;
       return std::nullopt;
     };
@@ -8960,16 +9022,10 @@ bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
         return true;
     }
 
-    // Node-local constant elimination above is multi-use safe (Rewrite
-    // reroutes only this operand for multi-use defs), but recursing into the
-    // def's operands with a partial demand is not.
-    if (!CanRecurse)
-      return GiveUp();
-
     KnownBits RHSKnown(BW);
-    bool Changed = simplifyDemandedBitsImpl(*DefMI, /*OpNo=*/2, DemandedBits,
+    bool Changed = simplifyDemandedBitsImpl(*DefMI, /*OpNo=*/2, Demanded,
                                             RHSKnown, Depth + 1, DoRewrite);
-    APInt LHSDemand = DemandedBits;
+    APInt LHSDemand = Demanded;
     if (Opcode == TargetOpcode::G_AND)
       LHSDemand &= ~RHSKnown.Zero;
     else
@@ -8997,14 +9053,8 @@ bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
     unsigned ShAmt = Amt->getZExtValue();
     if (ShAmt == 0)
       return GiveUp();
-    // Bail entirely for a multi-use def under partial demand (SDAG does the
-    // same): both the recursion and the ASHR->LSHR rewrite below derive from
-    // a demand the other users do not share.
-    if (!CanRecurse)
-      return GiveUp();
 
-    APInt SrcDemand =
-        getDemandedSrcBitsForShiftConst(Opcode, DemandedBits, ShAmt);
+    APInt SrcDemand = getDemandedSrcBitsForShiftConst(Opcode, Demanded, ShAmt);
     KnownBits SrcKnown(BW);
     bool Changed = simplifyDemandedBitsImpl(*DefMI, /*OpNo=*/1, SrcDemand,
                                             SrcKnown, Depth + 1, DoRewrite);
@@ -9020,8 +9070,7 @@ bool CombinerHelper::simplifyDemandedBitsImpl(MachineInstr &MI, unsigned OpNo,
       // If none of the sign-fill result bits [BW-ShAmt, BW) are demanded, or
       // the sign bit is known zero, an unsigned shift computes the same
       // demanded bits (SDAG's SRA-case rewrite).
-      if (DemandedBits.countLeadingZeros() >= ShAmt ||
-          SrcKnown.isNonNegative()) {
+      if (Demanded.countLeadingZeros() >= ShAmt || SrcKnown.isNonNegative()) {
         if (DoRewrite) {
           Builder.setInstrAndDebugLoc(*DefMI);
           auto Lshr = Builder.buildLShr(DstTy, DefMI->getOperand(1).getReg(),
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir
index aea943716da9f..0a0f78770a2f5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-simplify-demanded-bits.mir
@@ -332,3 +332,72 @@ body:             |
     $x1 = COPY %hi(s64)
     RET_ReallyLR implicit $w0, implicit $x1
 ...
+---
+# Chain of multi-use masks: %a and %b each have an extra user, but the pure
+# look-through may walk through both (bit 0 passes unchanged), so each user's
+# AND operand is rerouted straight to the copy while the shared ANDs survive.
+name:            multiuse_chain_look_through
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: multiuse_chain_look_through
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %c1:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: %a:_(s32) = G_AND [[COPY]], %c1
+    ; CHECK-NEXT: %c2:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: %b:_(s32) = G_AND [[COPY]], %c2
+    ; CHECK-NEXT: %one:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: %root:_(s32) = G_AND [[COPY]], %one
+    ; CHECK-NEXT: $w0 = COPY %root(s32)
+    ; CHECK-NEXT: $w1 = COPY %a(s32)
+    ; CHECK-NEXT: $w2 = COPY %b(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+    %0:_(s32) = COPY $w0
+    %c1:_(s32) = G_CONSTANT i32 255
+    %a:_(s32) = G_AND %0, %c1
+    %c2:_(s32) = G_CONSTANT i32 15
+    %b:_(s32) = G_AND %a, %c2
+    %one:_(s32) = G_CONSTANT i32 1
+    %root:_(s32) = G_AND %b, %one
+    $w0 = COPY %root(s32)
+    $w1 = COPY %a(s32)
+    $w2 = COPY %b(s32)
+    RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+...
+---
+# %s is multi-use (the masking G_AND plus a direct copy out). Instead of
+# giving up, the walk relaxes the demand to all bits at %s's frame; through
+# shl-by-4 that demands src bits [0,28), exactly what %mask keeps, so the
+# inner G_AND is redundant for every user of %s and is erased.
+name:            relax_through_multiuse_shl
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: relax_through_multiuse_shl
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: %s:_(s32) = G_SHL [[COPY]], %amt(s32)
+    ; CHECK-NEXT: %lowmask:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: %root:_(s32) = G_AND %s, %lowmask
+    ; CHECK-NEXT: $w0 = COPY %root(s32)
+    ; CHECK-NEXT: $w1 = COPY %s(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0, implicit $w1
+    %0:_(s32) = COPY $w0
+    %mask:_(s32) = G_CONSTANT i32 268435455
+    %y:_(s32) = G_AND %0, %mask
+    %amt:_(s32) = G_CONSTANT i32 4
+    %s:_(s32) = G_SHL %y, %amt
+    %lowmask:_(s32) = G_CONSTANT i32 255
+    %root:_(s32) = G_AND %s, %lowmask
+    $w0 = COPY %root(s32)
+    $w1 = COPY %s(s32)
+    RET_ReallyLR implicit $w0, implicit $w1
+...
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index b288f333318c1..80e8f3dc2df3f 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -2391,9 +2391,10 @@ TEST_F(AArch64GISelMITest, SimplifyDemandedBitsMultiUseDefNoDescend) {
     GTEST_SKIP();
 
   // %s has two users (%root's G_AND and %side's COPY, which demands all
-  // bits). Walking from %root with partial demand 0xFF must NOT descend
-  // through the multi-use %s and erase %y's mask (0xFFFF00) -- the COPY
-  // observes bits the root does not demand.
+  // bits). Walking from %root with partial demand 0xFF relaxes to all bits
+  // at the multi-use %s, so the derived source demand (0xFFFFFF00) straddles
+  // %y's mask (0xFFFF00) and the mask must survive -- the COPY observes bits
+  // the root does not demand.
   MachineInstr *Root = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
   MachineInstr *Inner = findOpcode(*MF, TargetOpcode::G_AND);
   ASSERT_NE(Root, nullptr);
@@ -2408,6 +2409,84 @@ TEST_F(AArch64GISelMITest, SimplifyDemandedBitsMultiUseDefNoDescend) {
   EXPECT_FALSE(MRI->use_nodbg_empty(InnerDst));
 }
 
+TEST_F(AArch64GISelMITest, SimplifyMultipleUseDemandedBitsChain) {
+  StringRef MIRString = R"(
+   %x:_(s32) = G_TRUNC %0
+   %c1:_(s32) = G_CONSTANT i32 255
+   %a:_(s32) = G_AND %x, %c1
+   %c2:_(s32) = G_CONSTANT i32 15
+   %b:_(s32) = G_AND %a, %c2
+   %one:_(s32) = G_CONSTANT i32 1
+   %root:_(s32) = G_AND %b, %one
+   %sidea:_(s32) = COPY %a
+   %sideb:_(s32) = COPY %b
+   %out:_(s32) = COPY %root
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  // %a and %b are both multi-use (each has a side COPY). A pure look-through
+  // may still walk the whole chain: bit 0 passes unchanged through both
+  // masks, so %root's operand can be rerouted all the way to %x while the
+  // shared ANDs survive for their other users.
+  MachineInstr *ADef = findOpcode(*MF, TargetOpcode::G_AND);
+  MachineInstr *BDef = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
+  MachineInstr *Root = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/2);
+  ASSERT_NE(ADef, nullptr);
+  ASSERT_NE(BDef, nullptr);
+  ASSERT_NE(Root, nullptr);
+
+  Register XReg = ADef->getOperand(1).getReg(); // source of the chain (%x)
+  Register AReg = ADef->getOperand(0).getReg(); // shared mask result (%a)
+  Register BReg = BDef->getOperand(0).getReg(); // shared mask result (%b)
+
+  GISelValueTracking VT(*MF);
+  CombinerHelper Helper(VT, B, /*IsPreLegalize=*/false, &VT);
+  EXPECT_EQ(Helper.simplifyMultipleUseDemandedBits(BReg, APInt(32, 1)), XReg);
+
+  KnownBits Known(32);
+  EXPECT_TRUE(
+      Helper.simplifyDemandedBits(*Root, /*OpNo=*/1, APInt(32, 1), Known));
+  EXPECT_EQ(Root->getOperand(1).getReg(), XReg); // %root now reads %x
+  EXPECT_FALSE(MRI->use_nodbg_empty(AReg)); // %a must stay live
+  EXPECT_FALSE(MRI->use_nodbg_empty(BReg)); // %b must stay live
+}
+
+TEST_F(AArch64GISelMITest, SimplifyDemandedBitsRelaxThroughMultiUseShift) {
+  StringRef MIRString = R"(
+   %x:_(s32) = G_TRUNC %0
+   %mask:_(s32) = G_CONSTANT i32 268435455
+   %y:_(s32) = G_AND %x, %mask
+   %amt:_(s32) = G_CONSTANT i32 4
+   %s:_(s32) = G_SHL %y, %amt
+   %lowmask:_(s32) = G_CONSTANT i32 255
+   %root:_(s32) = G_AND %s, %lowmask
+   %side:_(s32) = COPY %s
+   %out:_(s32) = COPY %root
+)";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  // %s is multi-use, so the partial demand 0xFF relaxes to all bits at %s's
+  // frame instead of giving up. Through shl-by-4 that demands src bits
+  // [0,28), exactly what %mask keeps, so the single-use inner G_AND is
+  // redundant for every user of %s and must be erased.
+  MachineInstr *Inner = findOpcode(*MF, TargetOpcode::G_AND);
+  MachineInstr *Root = findOpcode(*MF, TargetOpcode::G_AND, /*Index=*/1);
+  MachineInstr *Shl = findOpcode(*MF, TargetOpcode::G_SHL);
+  ASSERT_NE(Inner, nullptr);
+  ASSERT_NE(Root, nullptr);
+  ASSERT_NE(Shl, nullptr);
+
+  Register InnerDst = Inner->getOperand(0).getReg(); // inner mask result (%y)
+  Register XReg = Inner->getOperand(1).getReg(); // inner mask input (%x)
+  simplifyDemandedBitsOperand(*MF, *MRI, B, *Root, APInt(32, 0xFF));
+  EXPECT_TRUE(MRI->use_nodbg_empty(InnerDst)); // inner mask has no users left
+  EXPECT_EQ(Shl->getOperand(1).getReg(), XReg); // shl now reads %x directly
+}
+
 TEST(GISelShiftDemand, DemandedSrcBitsForShiftConst) {
   // SHL by 4: result bits [4,8) come from src bits [0,4).
   EXPECT_EQ(CombinerHelper::getDemandedSrcBitsForShiftConst(TargetOpcode::G_SHL,