[llvm] [X86] Add FastImm16 tuning flag to Intel Atom + AMD Bobcat/Ryzen Families (PR #90635)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 30 10:52:44 PDT 2024


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/90635

This patch limits the icmp_i16(x,c) -> icmp_i32(ext(x),ext(c)) fold to CPUs that aren't known to have fast handling of length-changing prefixes on imm16 operands.

We currently always assume that 66/67h length-changing prefixes cause severe stalls, so we always extend imm16 operands and use an i32 icmp instead; the only exception is Intel Bonnell CPUs.

Agner's microarchitecture.pdf makes it clear that there are no such stalls on any of the Intel Atom family (at least as far as Tremont; I'm not sure about Gracemont or later). The same is true for the AMD Bobcat/Jaguar and Ryzen families.

Recent performance-focused Intel CPUs are trickier: Core2/Nehalem and earlier could incur a 6-11cy stall, while from Sandy Bridge onwards this is reduced to 3cy or less. I'm not sure whether we should treat that as fast; since we only use this flag for the icmp_i16 case, it might be acceptable. If so, we should add the flag to the x86-64-v3/v4 tunings as well.
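For reference, the pattern this flag gates can be sketched with a small C sample (hypothetical helper name, modelled on the cmp16_reg_eq_imm16 case in cmp16.ll); the assembly in the comments is a sketch of the two codegen choices, not a claim about any specific CPU:

```c
#include <stdbool.h>

/* A minimal sketch of the compare this flag gates. For `a0 == 1024`
 * the backend can either:
 *   - emit the i16 form directly:  cmpw $1024, %di
 *     (66h operand-size prefix + imm16 -- the fast-imm16 case), or
 *   - promote to i32 to avoid the 16-bit immediate:
 *     movzwl %di, %eax ; cmpl $1024, %eax  (the default case).
 * Both are semantically identical; the tuning flag only changes
 * which instruction sequence is selected.
 */
bool cmp16_reg_eq_imm16(unsigned short a0) {
    return a0 == 1024;
}
```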

Part of #90355 + #62952

>From 685067dcdd9fec0600ff7d93848d412f3a0dba88 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 30 Apr 2024 18:50:02 +0100
Subject: [PATCH] [X86] Add FastImm16 tuning flag to Intel Atom + AMD CPU
 Families

This limits the icmp_i16(x,c) -> icmp_i32(ext(x),ext(c)) fold to CPUs that aren't known to have fast handling of length-changing prefixes on imm16 operands.

We currently always assume that 66/67h length-changing prefixes cause severe stalls, so we always extend imm16 operands and use an i32 icmp instead; the only exception is Intel Bonnell CPUs.

Agner's microarchitecture.pdf makes it clear that there are no such stalls on any of the Intel Atom family (at least as far as Tremont; I'm not sure about Gracemont or later). The same is true for the AMD Bobcat/Jaguar and Ryzen families.

Recent performance-focused Intel CPUs are trickier: Core2/Nehalem and earlier could incur a 6-11cy stall, while from Sandy Bridge onwards this is reduced to 3cy or less. I'm not sure whether we should treat that as fast; since we only use this flag for the icmp_i16 case, it might be acceptable. If so, we should add the flag to the x86-64-v3/v4 tunings as well.

Part of #90355 + #62952
---
 llvm/lib/Target/X86/X86.td              |  12 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp |   2 +-
 llvm/test/CodeGen/X86/cmp16.ll          | 239 ++++++++++++++++++++++++
 3 files changed, 252 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 78bc043911f2fc..6d4b2121ec9092 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -739,6 +739,10 @@ def TuningFastMOVBE
     : SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
     "Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
 
+def TuningFastImm16
+    : SubtargetFeature<"fast-imm16", "HasFastImm16", "true",
+    "Prefer a i16 instruction with i16 immediate over extension to i32">;
+
 def TuningUseSLMArithCosts
     : SubtargetFeature<"use-slm-arith-costs", "UseSLMArithCosts", "true",
         "Use Silvermont specific arithmetic costs">;
@@ -1145,6 +1149,7 @@ def ProcessorFeatures {
                                        TuningSlowDivide32,
                                        TuningSlowDivide64,
                                        TuningSlowTwoMemOps,
+                                       TuningFastImm16,
                                        TuningLEAUsesAG,
                                        TuningPadShortFunctions,
                                        TuningInsertVZEROUPPER,
@@ -1165,6 +1170,7 @@ def ProcessorFeatures {
                                       TuningSlowPMULLD,
                                       TuningFast7ByteNOP,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelay];
@@ -1186,6 +1192,7 @@ def ProcessorFeatures {
                                       TuningSlowLEA,
                                       TuningSlowIncDec,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelay];
@@ -1200,6 +1207,7 @@ def ProcessorFeatures {
                                       TuningSlowLEA,
                                       TuningSlowIncDec,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelay];
   list<SubtargetFeature> GLPFeatures =
@@ -1320,6 +1328,7 @@ def ProcessorFeatures {
                                       TuningPreferMaskRegisters,
                                       TuningFastGather,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningSlowPMADDWD];
   // TODO Add AVX5124FMAPS/AVX5124VNNIW features
   list<SubtargetFeature> KNMFeatures =
@@ -1363,6 +1372,7 @@ def ProcessorFeatures {
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
                                          TuningSlowSHLD,
+                                         TuningFastImm16,
                                          TuningSBBDepBreaking,
                                          TuningInsertVZEROUPPER];
 
@@ -1383,6 +1393,7 @@ def ProcessorFeatures {
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
                                          TuningFastMOVBE,
+                                         TuningFastImm16,
                                          TuningSBBDepBreaking,
                                          TuningSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
@@ -1487,6 +1498,7 @@ def ProcessorFeatures {
                                      TuningFastScalarShiftMasks,
                                      TuningFastVariablePerLaneShuffle,
                                      TuningFastMOVBE,
+                                     TuningFastImm16,
                                      TuningSlowSHLD,
                                      TuningSBBDepBreaking,
                                      TuningInsertVZEROUPPER,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6a5fc3c5314656..4b58a8a20c6bf6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22657,7 +22657,7 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
 
   // Only promote the compare up to I32 if it is a 16 bit operation
   // with an immediate.  16 bit immediates are to be avoided.
-  if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
+  if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
       !DAG.getMachineFunction().getFunction().hasMinSize()) {
     ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
     ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll
index 760c8e4044994c..699ea3e4dd4734 100644
--- a/llvm/test/CodeGen/X86/cmp16.ll
+++ b/llvm/test/CodeGen/X86/cmp16.ll
@@ -1,8 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86,X86-GENERIC
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64,X64-GENERIC
+; RUN: llc < %s -mtriple=i686-- -mattr=+fast-imm16 | FileCheck %s --check-prefixes=X86,X86-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+fast-imm16 | FileCheck %s --check-prefixes=X64,X64-FAST
 ; RUN: llc < %s -mtriple=i686-- -mcpu=atom | FileCheck %s --check-prefixes=X86,X86-ATOM
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=atom | FileCheck %s --check-prefixes=X64,X64-ATOM
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=X64,X64-FAST
 
 define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) {
 ; X86-GENERIC-LABEL: cmp16_reg_eq_reg:
@@ -18,6 +28,19 @@ define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_reg:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_reg:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw %si, %di
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_reg:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -52,6 +75,18 @@ define i1 @cmp16_reg_eq_imm8(i16 %a0) {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $15, {{[0-9]+}}(%esp)
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $15, %di
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $15, {{[0-9]+}}(%esp)
@@ -90,6 +125,18 @@ define i1 @cmp16_reg_eq_imm16(i16 %a0) {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
@@ -144,6 +191,18 @@ define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
@@ -172,6 +231,18 @@ define i1 @cmp16_reg_sgt_imm8(i16 %a0) {
 ; X64-GENERIC-NEXT:    setge %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_sgt_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $16, {{[0-9]+}}(%esp)
+; X86-FAST-NEXT:    setge %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_sgt_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $16, %di
+; X64-FAST-NEXT:    setge %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_sgt_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $16, {{[0-9]+}}(%esp)
@@ -210,6 +281,18 @@ define i1 @cmp16_reg_sgt_imm16(i16 %a0) {
 ; X64-GENERIC-NEXT:    setge %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_sgt_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-FAST-NEXT:    setge %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_sgt_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-FAST-NEXT:    setge %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_sgt_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
@@ -264,6 +347,18 @@ define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize {
 ; X64-GENERIC-NEXT:    setge %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-FAST-NEXT:    setge %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-FAST-NEXT:    setge %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
@@ -294,6 +389,18 @@ define i1 @cmp16_reg_uge_imm16(i16 %a0) {
 ; X64-GENERIC-NEXT:    setae %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_uge_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-FAST-NEXT:    setae %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_uge_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-FAST-NEXT:    setae %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_uge_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
@@ -348,6 +455,18 @@ define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize {
 ; X64-GENERIC-NEXT:    setae %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_uge_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-FAST-NEXT:    setae %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_uge_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-FAST-NEXT:    setae %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_uge_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
@@ -380,6 +499,22 @@ define i1 @cmp16_load_ne_load(ptr %p0, ptr %p1) {
 ; X64-GENERIC-NEXT:    setne %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ne_load:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movzwl (%ecx), %ecx
+; X86-FAST-NEXT:    cmpw (%eax), %cx
+; X86-FAST-NEXT:    setne %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ne_load:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movzwl (%rdi), %eax
+; X64-FAST-NEXT:    cmpw (%rsi), %ax
+; X64-FAST-NEXT:    setne %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ne_load:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -417,6 +552,19 @@ define i1 @cmp16_load_ne_imm8(ptr %p0) {
 ; X64-GENERIC-NEXT:    setne %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ne_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $15, (%eax)
+; X86-FAST-NEXT:    setne %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ne_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $15, (%rdi)
+; X64-FAST-NEXT:    setne %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ne_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -456,6 +604,19 @@ define i1 @cmp16_load_ne_imm16(ptr %p0) {
 ; X64-GENERIC-NEXT:    setne %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ne_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-FAST-NEXT:    setne %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ne_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-FAST-NEXT:    setne %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ne_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -493,6 +654,19 @@ define i1 @cmp16_load_slt_imm8(ptr %p0) {
 ; X64-GENERIC-NEXT:    setl %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_slt_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $15, (%eax)
+; X86-FAST-NEXT:    setl %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_slt_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $15, (%rdi)
+; X64-FAST-NEXT:    setl %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_slt_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -532,6 +706,19 @@ define i1 @cmp16_load_slt_imm16(ptr %p0) {
 ; X64-GENERIC-NEXT:    setl %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_slt_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-FAST-NEXT:    setl %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_slt_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-FAST-NEXT:    setl %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_slt_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -589,6 +776,19 @@ define i1 @cmp16_load_slt_imm16_optsize(ptr %p0) optsize {
 ; X64-GENERIC-NEXT:    setl %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_slt_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-FAST-NEXT:    setl %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_slt_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-FAST-NEXT:    setl %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_slt_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -620,6 +820,19 @@ define i1 @cmp16_load_ule_imm8(ptr %p0) {
 ; X64-GENERIC-NEXT:    setb %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ule_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $16, (%eax)
+; X86-FAST-NEXT:    setb %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ule_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $16, (%rdi)
+; X64-FAST-NEXT:    setb %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ule_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -659,6 +872,19 @@ define i1 @cmp16_load_ule_imm16(ptr %p0) {
 ; X64-GENERIC-NEXT:    setb %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ule_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-FAST-NEXT:    setb %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ule_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-FAST-NEXT:    setb %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ule_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -716,6 +942,19 @@ define i1 @cmp16_load_ule_imm16_optsize(ptr %p0) optsize {
 ; X64-GENERIC-NEXT:    setb %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ule_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-FAST-NEXT:    setb %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ule_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-FAST-NEXT:    setb %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ule_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax



More information about the llvm-commits mailing list