C++ support

This chapter adds support for some C++ compiler features.

Exception handling

Chapter11_2 can be built and run with the C++ polymorphism example code ch12_inherit.cpp, as follows,

lbdex/input/ch12_inherit.cpp

#ifdef COUT_TEST
#include <iostream>
using namespace std;
#endif

extern "C" int printf(const char *format, ...);
extern "C" int sprintf(char *out, const char *format, ...);

// Base class of the polymorphism example.
// _ZTVN10__cxxabiv117__class_type_infoE is used for this parent class.
class CPolygon {
protected:
  int width;
  int height;

public:
  // Stores the two dimensions that the area() overrides read.
  void set_values(int a, int b) {
    width = a;
    height = b;
  }

  // The pure virtual form is kept disabled; it refers to __cxa_pure_virtual:
  //   virtual int area (void) =0;
  // The base class implementation returns 0.
  virtual int area(void) { return 0; }

  // Prints the result of the virtual area() call followed by a newline.
  void printarea(void) {
#ifdef COUT_TEST
    // With cout, the invoke, landingpad, resume and unreachable IR
    // instructions are generated on iMac.
    cout << this->area() << endl;
#else
    printf("%d\n", this->area());
#endif
  }
};

// Rectangle shape: area() is width * height.
// _ZTVN10__cxxabiv120__si_class_type_infoE is used for this derived class.
class CRectangle : public CPolygon {
public:
  int area(void) {
    const int product = width * height;
    return product;
  }
};

// Triangle shape: area() is width * height / 2 (integer division).
class CTriangle : public CPolygon {
public:
  int area(void) {
    const int product = width * height;
    return product / 2;
  }
};

// Third derived shape: area() is width * height / 4 (integer division).
class CAngle : public CPolygon {
public:
  int area(void) {
    const int product = width * height;
    return product / 4;
  }
};
#if 0
// Disabled heap-allocating variant. It needs operator new (_Znwm) and
// operator delete (_ZdlPv).
// NOTE(review): CPolygon declares no virtual destructor, so deleting a derived
// object through a CPolygon pointer is undefined behavior; fix that before
// re-enabling this variant.
int test_cpp_polymorphism() {
  CPolygon * ppoly1 = new CRectangle;  // _Znwm
  CPolygon * ppoly2 = new CTriangle;
  ppoly1->set_values (4,5);
  ppoly2->set_values (4,5);
  ppoly1->printarea();
  ppoly2->printarea();
  delete ppoly1;  // _ZdlPv
  delete ppoly2;
  return 0;
}
#else
// Exercises virtual dispatch through base-class pointers to stack objects.
//
// Each shape gets the dimensions 4 x 5, its area is printed through
// CPolygon::printarea(), and the three areas are then checked.
//
// Returns 0 when the areas are 20 (rectangle), 10 (triangle) and 5 (angle);
// returns 1 otherwise, so a wrong dispatch is reported instead of being
// silently treated as success.
int test_cpp_polymorphism() {
  CRectangle poly1;
  CTriangle poly2;
  CAngle poly3;

  CPolygon * ppoly1 = &poly1;
  CPolygon * ppoly2 = &poly2;
  CPolygon * ppoly3 = &poly3;

  ppoly1->set_values (4,5);
  ppoly2->set_values (4,5);
  ppoly3->set_values (4,5);
  ppoly1->printarea();
  ppoly2->printarea();
  ppoly3->printarea();
  if (ppoly1->area() == 20 && ppoly2->area() == 10 && ppoly3->area() == 5)
    return 0;

  // At least one area() override returned an unexpected value.
  return 1;
}
#endif

If cout is used instead of printf in ch12_inherit.cpp, clang does not generate exception-handling IR on Linux, whereas on iMac it generates the invoke, landingpad, resume and unreachable exception-handling IR instructions. The example code ch12_eh.cpp below, which uses the try and catch exception-handling keywords, generates these exception-handling IR instructions on both iMac and Linux.

lbdex/input/ch12_eh.cpp

// Empty class; an object of this type is what throw_exception() throws.
class Ex1 {};
// Throws an Ex1 object when a is greater than b; otherwise returns normally.
void throw_exception(int a, int b) {
  Ex1 ex1;

  if (a > b) {
    throw ex1;
  }
}

// Calls throw_exception(2, 1) inside a try block.
// Returns 1 when the catch-all handler catches an exception, and 0 when the
// call returns without throwing.
int test_try_catch() {
  try {
    throw_exception(2, 1);
  }
  catch(...) {
    return 1;
  }
  return 0;
}

JonathantekiiMac:input Jonathan$ clang -c ch12_eh.cpp -emit-llvm
-o ch12_eh.bc
JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llvm-dis ch12_eh.bc -o -
; ModuleID = 'ch12_eh.bc'
source_filename = "ch12_eh.bc"
target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
target triple = "mips-unknown-linux-gnu"

%class.Ex1 = type { i8 }

$_ZTS3Ex1 = comdat any

$_ZTI3Ex1 = comdat any

@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
@_ZTS3Ex1 = linkonce_odr constant [5 x i8] c"3Ex1\00", comdat
@_ZTI3Ex1 = linkonce_odr constant { i8*, i8* } { i8* bitcast (i8** getelementptr
 inbounds (i8*, i8** @_ZTVN10__cxxabiv117__class_type_infoE, i32 2) to i8*), i8*
 getelementptr inbounds ([5 x i8], [5 x i8]* @_ZTS3Ex1, i32 0, i32 0) }, comdat

define void @_Z15throw_exceptionii(i32 signext %a, i32 signext %b) #0 {
  %1 = alloca i32, align 4
  %2 = alloca i32, align 4
  %ex1 = alloca %class.Ex1, align 1
  store i32 %a, i32* %1, align 4
  store i32 %b, i32* %2, align 4
  %3 = load i32, i32* %1, align 4
  %4 = load i32, i32* %2, align 4
  %5 = icmp sgt i32 %3, %4
  br i1 %5, label %6, label %9

; <label>:6:                                      ; preds = %0
  %7 = call i8* @__cxa_allocate_exception(i32 1) #1
  %8 = bitcast i8* %7 to %class.Ex1*
  call void @__cxa_throw(i8* %7, i8* bitcast ({ i8*, i8* }* @_ZTI3Ex1 to i8*), i
8* null) #2
  unreachable

; <label>:9:                                      ; preds = %0
  ret void
}

declare i8* @__cxa_allocate_exception(i32)

declare void @__cxa_throw(i8*, i8*, i8*)

define i32 @_Z14test_try_catchv() #0 personality i8* bitcast (i32 (...)* @__gxx_
personality_v0 to i8*) {
  %1 = alloca i32, align 4
  %2 = alloca i8*
  %3 = alloca i32
  %4 = alloca i32
  invoke void @_Z15throw_exceptionii(i32 signext 2, i32 signext 1)
          to label %5 unwind label %6

; <label>:5:                                      ; preds = %0
  br label %13

; <label>:6:                                      ; preds = %0
  %7 = landingpad { i8*, i32 }
          catch i8* null
  %8 = extractvalue { i8*, i32 } %7, 0
  store i8* %8, i8** %2
  %9 = extractvalue { i8*, i32 } %7, 1
  store i32 %9, i32* %3
  br label %10

; <label>:10:                                     ; preds = %6
  %11 = load i8*, i8** %2
  %12 = call i8* @__cxa_begin_catch(i8* %11) #1
  store i32 1, i32* %1
  store i32 1, i32* %4
  call void @__cxa_end_catch()
  br label %14

; <label>:13:                                     ; preds = %5
  store i32 0, i32* %1
  br label %14

; <label>:14:                                     ; preds = %13, %10
  %15 = load i32, i32* %1
  ret i32 %15
}

declare i32 @__gxx_personality_v0(...)

declare i8* @__cxa_begin_catch(i8*)

declare void @__cxa_end_catch()

attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
o-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="fals
e" "stack-protector-buffer-size"="8" "target-cpu"="mips32r2" "target-features"="
+mips32r2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { noreturn }

!llvm.ident = !{!0}

!0 = !{!"Apple LLVM version 7.0.0 (clang-700.1.76)"}
JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llc -march=cpu0 -relocation-model=static -filetype=asm ch12_eh.bc -o -
        .section .mdebug.abi32
        .previous
        .file "ch12_eh.bc"
llc: /Users/Jonathan/llvm/test/src/lib/CodeGen/LiveVariables.cpp:133: void llvm::
LiveVariables::HandleVirtRegUse(unsigned int, llvm::MachineBasicBlock *, llvm
::MachineInstr *): Assertion `MRI->getVRegDef(reg) && "Register use before
def!"' failed.

For the LLVM exception-handling IR, please refer to [1]. Chapter12_1 supports the LLVM IR that corresponds to the C++ try and catch exception keywords. It can compile ch12_eh.bc as follows,

lbdex/chapters/Chapter12_1/Cpu0ISelLowering.h

    /// If a physical register, this returns the register that receives the
    /// exception address on entry to an EH pad.
    /// For Cpu0 this is always Cpu0::A0; PersonalityFn is not consulted.
    unsigned
    getExceptionPointerRegister(const Constant *PersonalityFn) const override {
      return Cpu0::A0;
    }

    /// If a physical register, this returns the register that receives the
    /// exception typeid on entry to a landing pad.
    /// For Cpu0 this is always Cpu0::A1; PersonalityFn is not consulted.
    unsigned
    getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
      return Cpu0::A1;
    }
JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llc -march=cpu0 -relocation-model=static -filetype=asm ch12_eh.bc -o -
  .text
  .section .mdebug.abiO32
  .previous
  .file  "ch12_eh.bc"
  .globl  _Z15throw_exceptionii
  .p2align  2
  .type  _Z15throw_exceptionii,@function
  .ent  _Z15throw_exceptionii   # @_Z15throw_exceptionii
_Z15throw_exceptionii:
  .cfi_startproc
  .frame  $fp,40,$lr
  .mask   0x00005000,-4
  .set  noreorder
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -40
$tmp0:
  .cfi_def_cfa_offset 40
  st  $lr, 36($sp)            # 4-byte Folded Spill
  st  $fp, 32($sp)            # 4-byte Folded Spill
$tmp1:
  .cfi_offset 14, -4
$tmp2:
  .cfi_offset 12, -8
  move   $fp, $sp
$tmp3:
  .cfi_def_cfa_register 12
  st  $4, 28($fp)
  st  $5, 24($fp)
  ld  $2, 28($fp)
  cmp  $sw, $2, $5
  jle  $sw, $BB0_2
  nop
  jmp  $BB0_1
$BB0_2:
  move   $sp, $fp
  ld  $fp, 32($sp)            # 4-byte Folded Reload
  ld  $lr, 36($sp)            # 4-byte Folded Reload
  addiu  $sp, $sp, 40
  ret  $lr
  nop
$BB0_1:
  addiu  $4, $zero, 1
  jsub  __cxa_allocate_exception
  nop
  addiu  $3, $zero, 0
  st  $3, 8($sp)
  lui  $3, %hi(_ZTI3Ex1)
  ori  $5, $3, %lo(_ZTI3Ex1)
  addu  $4, $zero, $2
  jsub  __cxa_throw
  nop
  .set  macro
  .set  reorder
  .end  _Z15throw_exceptionii
$func_end0:
  .size  _Z15throw_exceptionii, ($func_end0)-_Z15throw_exceptionii
  .cfi_endproc

  .globl  _Z14test_try_catchv
  .p2align  2
  .type  _Z14test_try_catchv,@function
  .ent  _Z14test_try_catchv     # @_Z14test_try_catchv
_Z14test_try_catchv:
$tmp7:
$func_begin0 = ($tmp7)
  .cfi_startproc
  .cfi_personality 0, __gxx_personality_v0
  .cfi_lsda 0, $exception0
  .frame  $fp,40,$lr
  .mask   0x00005200,-4
  .set  noreorder
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -40
$tmp8:
  .cfi_def_cfa_offset 40
  st  $lr, 36($sp)            # 4-byte Folded Spill
  st  $fp, 32($sp)            # 4-byte Folded Spill
  st  $9, 28($sp)             # 4-byte Folded Spill
$tmp9:
  .cfi_offset 14, -4
$tmp10:
  .cfi_offset 12, -8
$tmp11:
  .cfi_offset 9, -12
  move   $fp, $sp
$tmp12:
  .cfi_def_cfa_register 12
$tmp4:
  addiu  $4, $zero, 2
  addiu  $9, $zero, 1
  addu  $5, $zero, $9
  jsub  _Z15throw_exceptionii
  nop
$tmp5:
# BB#2:
  addiu  $2, $zero, 0
  st  $2, 24($fp)
$BB1_3:
  ld  $2, 24($fp)
  move   $sp, $fp
  ld  $9, 28($sp)             # 4-byte Folded Reload
  ld  $fp, 32($sp)            # 4-byte Folded Reload
  ld  $lr, 36($sp)            # 4-byte Folded Reload
  addiu  $sp, $sp, 40
  ret  $lr
  nop
$BB1_1:
$tmp6:
  st  $4, 20($fp)
  st  $5, 16($fp)
  ld  $4, 20($fp)
  jsub  __cxa_begin_catch
  nop
  st  $9, 24($fp)
  st  $9, 12($fp)
  jsub  __cxa_end_catch
  nop
  jmp  $BB1_3
  .set  macro
  .set  reorder
  .end  _Z14test_try_catchv
$func_end1:
  .size  _Z14test_try_catchv, ($func_end1)-_Z14test_try_catchv
  .cfi_endproc
  .section  .gcc_except_table,"a",@progbits
  .p2align  2
GCC_except_table1:
$exception0:
  .byte  255                     # @LPStart Encoding = omit
  .byte  0                       # @TType Encoding = absptr
  .asciz  "\242\200\200"          # @TType base offset
  .byte  3                       # Call site Encoding = udata4
  .byte  26                      # Call site table length
  .4byte  ($tmp4)-($func_begin0)  # >> Call Site 1 <<
  .4byte  ($tmp5)-($tmp4)         #   Call between $tmp4 and $tmp5
  .4byte  ($tmp6)-($func_begin0)  #     jumps to $tmp6
  .byte  1                       #   On action: 1
  .4byte  ($tmp5)-($func_begin0)  # >> Call Site 2 <<
  .4byte  ($func_end1)-($tmp5)    #   Call between $tmp5 and $func_end1
  .4byte  0                       #     has no landing pad
  .byte  0                       #   On action: cleanup
  .byte  1                       # >> Action Record 1 <<
                                        #   Catch TypeInfo 1
  .byte  0                       #   No further actions
                                        # >> Catch TypeInfos <<
  .4byte  0                       # TypeInfo 1
  .p2align  2

  .type  _ZTS3Ex1,@object        # @_ZTS3Ex1
  .section  .rodata._ZTS3Ex1,"aG",@progbits,_ZTS3Ex1,comdat
  .weak  _ZTS3Ex1
  .p2align  2
_ZTS3Ex1:
  .asciz  "3Ex1"
  .size  _ZTS3Ex1, 5

  .type  _ZTI3Ex1,@object        # @_ZTI3Ex1
  .section  .rodata._ZTI3Ex1,"aG",@progbits,_ZTI3Ex1,comdat
  .weak  _ZTI3Ex1
  .p2align  3
_ZTI3Ex1:
  .4byte  _ZTVN10__cxxabiv117__class_type_infoE+8
  .4byte  _ZTS3Ex1
  .size  _ZTI3Ex1, 8


  .ident  "Apple LLVM version 7.0.0 (clang-700.1.76)"
  .section  ".note.GNU-stack","",@progbits

Thread variable

C++ supports thread variables, as shown in the following file ch12_thread_var.cpp.

lbdex/input/ch12_thread_var.cpp

// Thread variable declared with the __thread keyword.
__thread int a = 0;
// Thread variable declared with the C++11 thread_local keyword.
thread_local int b = 0; // need option -std=c++11
// Stores 2 into the thread variable a and returns the value read back from it.
int test_thread_var()
{
    a = 2;
    return a;
}

// Stores 3 into the thread variable b and returns the value read back from it.
int test_thread_var_2()
{
    b = 3;
    return b;
}

While a global variable is a single instance shared by all threads in a process, a thread variable has a separate instance for each thread in the process. Code running in the same thread shares that thread's instance, but different threads each have their own instance of the thread variable with the same name [2].

To support thread variables, tlsgd, tlsldm, dtp_hi, dtp_lo, gottp, tp_hi and tp_lo must be handled in both evaluateRelocExpr() of Cpu0AsmParser.cpp and printImpl() of Cpu0MCExpr.cpp, and the following code is required. Most of it handles and displays relocation records, since thread variables are created by the OS or by a language library that supports multi-threaded programming.

lbdex/chapters/Chapter12_1/MCTargetDesc/Cpu0AsmBackend.cpp

// Excerpt: only the thread-variable (TLS) entries of the target fixup info
// table are shown; the remaining entries and the lookup code are elided.
const MCFixupKindInfo &Cpu0AsmBackend::
getFixupKindInfo(MCFixupKind Kind) const {
  const static MCFixupKindInfo Infos[Cpu0::NumTargetFixupKinds] = {
    // This table *must* be in the same order as the fixup_* kinds in
    // Cpu0FixupKinds.h.
    //
    // name                        offset  bits  flags
    { "fixup_Cpu0_TLSGD",          0,     16,   0 },
    { "fixup_Cpu0_GOTTP",          0,     16,   0 },
    { "fixup_Cpu0_TP_HI",          0,     16,   0 },
    { "fixup_Cpu0_TP_LO",          0,     16,   0 },
    { "fixup_Cpu0_TLSLDM",         0,     16,   0 },
    { "fixup_Cpu0_DTP_HI",         0,     16,   0 },
    { "fixup_Cpu0_DTP_LO",         0,     16,   0 },
    ...
  };
  ...
}

lbdex/chapters/Chapter12_1/MCTargetDesc/Cpu0BaseInfo.h

namespace Cpu0II {
  /// Target Operand Flag enum.
  enum TOF {
    //===------------------------------------------------------------------===//
    // Cpu0 Specific MachineOperand flags.

    /// MO_TLSGD - Represents the offset into the global offset table at which
    /// the module ID and TLS block offset reside during execution (General
    /// Dynamic TLS).
    MO_TLSGD,

    /// MO_TLSLDM - Represents the offset into the global offset table at which
    /// the module ID and TLS block offset reside during execution (Local
    /// Dynamic TLS).
    MO_TLSLDM,
    /// MO_DTP_HI/LO - Flags attached to the operands of the Cpu0ISD::Hi and
    /// Cpu0ISD::Lo nodes in the Local Dynamic TLS lowering.
    MO_DTP_HI,
    MO_DTP_LO,

    /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
    /// Exec TLS).
    MO_GOTTPREL,

    /// MO_TP_HI/LO - Represents the hi and low part of the offset from
    /// the thread pointer (Local Exec TLS).
    MO_TP_HI,
    MO_TP_LO,
    ...
  };
  ...
}

lbdex/chapters/Chapter12_1/MCTargetDesc/Cpu0ELFObjectWriter.cpp

unsigned Cpu0ELFObjectWriter::getRelocType(MCContext &Ctx,
                                           const MCValue &Target,
                                           const MCFixup &Fixup,
                                           bool IsPCRel) const {
  // determine the type of the relocation
  unsigned Type = (unsigned)ELF::R_CPU0_NONE;
  unsigned Kind = (unsigned)Fixup.getKind();

  switch (Kind) {
  case Cpu0::fixup_Cpu0_TLSGD:
    Type = ELF::R_CPU0_TLS_GD;
    break;
  case Cpu0::fixup_Cpu0_GOTTPREL:
    Type = ELF::R_CPU0_TLS_GOTTPREL;
    break;
  ...
}

lbdex/chapters/Chapter12_1/MCTargetDesc/Cpu0FixupKinds.h

  enum Fixups {
    // resulting in - R_CPU0_TLS_GD.
    fixup_Cpu0_TLSGD,

    // resulting in - R_CPU0_TLS_GOTTPREL.
    fixup_Cpu0_GOTTPREL,

    // resulting in - R_CPU0_TLS_TPREL_HI16.
    fixup_Cpu0_TP_HI,

    // resulting in - R_CPU0_TLS_TPREL_LO16.
    fixup_Cpu0_TP_LO,

    // resulting in - R_CPU0_TLS_LDM.
    fixup_Cpu0_TLSLDM,

    // resulting in - R_CPU0_TLS_DTP_HI16.
    fixup_Cpu0_DTP_HI,

    // resulting in - R_CPU0_TLS_DTP_LO16.
    fixup_Cpu0_DTP_LO,
  ...
};

lbdex/chapters/Chapter12_1/MCTargetDesc/Cpu0MCCodeEmitter.cpp

unsigned Cpu0MCCodeEmitter::
getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups,
               const MCSubtargetInfo &STI) const {
    case Cpu0MCExpr::CEK_TLSGD:
      FixupKind = Cpu0::fixup_Cpu0_TLSGD;
      break;
    case Cpu0MCExpr::CEK_TLSLDM:
      FixupKind = Cpu0::fixup_Cpu0_TLSLDM;
      break;
    case Cpu0MCExpr::CEK_DTP_HI:
      FixupKind = Cpu0::fixup_Cpu0_DTP_HI;
      break;
    case Cpu0MCExpr::CEK_DTP_LO:
      FixupKind = Cpu0::fixup_Cpu0_DTP_LO;
      break;
    case Cpu0MCExpr::CEK_GOTTPREL:
      FixupKind = Cpu0::fixup_Cpu0_GOTTPREL;
      break;
    case Cpu0MCExpr::CEK_TP_HI:
      FixupKind = Cpu0::fixup_Cpu0_TP_HI;
      break;
    case Cpu0MCExpr::CEK_TP_LO:
      FixupKind = Cpu0::fixup_Cpu0_TP_LO;
      break;
  ...
}

lbdex/chapters/Chapter12_1/Cpu0InstrInfo.td

// TlsGd node is used to handle General Dynamic TLS
def Cpu0TlsGd : SDNode<"Cpu0ISD::TlsGd", SDTIntUnaryOp>;

// TpHi and TpLo nodes are used to handle Local Exec TLS
def Cpu0TpHi  : SDNode<"Cpu0ISD::TpHi", SDTIntUnaryOp>;
def Cpu0TpLo  : SDNode<"Cpu0ISD::TpLo", SDTIntUnaryOp>;
let Predicates = [Ch12_1] in {
def : Pat<(Cpu0Hi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
}
let Predicates = [Ch12_1] in {
def : Pat<(Cpu0Lo tglobaltlsaddr:$in), (ORi ZERO, tglobaltlsaddr:$in)>;
}
let Predicates = [Ch12_1] in {
def : Pat<(add CPURegs:$hi, (Cpu0Lo tglobaltlsaddr:$lo)),
              (ORi CPURegs:$hi, tglobaltlsaddr:$lo)>;
}
let Predicates = [Ch12_1] in {
def : WrapperPat<tglobaltlsaddr, ORi, CPURegs>;
}

lbdex/chapters/Chapter12_1/Cpu0ISelLowering.cpp

Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
                                       const Cpu0Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {

  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
  ...
}
SDValue Cpu0TargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
  switch (Op.getOpcode())
  {
  case ISD::GlobalTLSAddress:   return lowerGlobalTLSAddress(Op, DAG);
    ...
  }
  ...
}
/// Custom lowering for ISD::GlobalTLSAddress nodes.
///
/// The lowering depends on the TLS model chosen for the global:
///  - General/Local Dynamic: call __tls_get_addr; Local Dynamic additionally
///    adds the MO_DTP_HI / MO_DTP_LO parts to the call result.
///  - Initial Exec: load the value from the address formed by the global
///    register and the MO_GOTTPREL symbol.
///  - Local Exec: add the MO_TP_HI and MO_TP_LO parts.
/// When emulated TLS is enabled, LowerToTLSEmulatedModel() is used instead.
SDValue Cpu0TargetLowering::
lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
{
  // If the relocation model is PIC, use the General Dynamic TLS Model or
  // Local Dynamic TLS model, otherwise use the Initial Exec or
  // Local Exec TLS Model.

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  SDLoc DL(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  TLSModel::Model model = getTargetMachine().getTLSModel(GV);

  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
    // General Dynamic and Local Dynamic TLS Model.
    unsigned Flag = (model == TLSModel::LocalDynamic) ? Cpu0II::MO_TLSLDM
                                                      : Cpu0II::MO_TLSGD;

    // The single call argument: the flagged symbol wrapped with the global
    // register.
    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
    SDValue Argument = DAG.getNode(Cpu0ISD::Wrapper, DL, PtrVT,
                                   getGlobalReg(DAG, PtrVT), TGA);
    unsigned PtrSize = PtrVT.getSizeInBits();
    IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);

    SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);

    ArgListTy Args;
    ArgListEntry Entry;
    Entry.Node = Argument;
    Entry.Ty = PtrTy;
    Args.push_back(Entry);

    // Emit the call to __tls_get_addr(Argument) with the C calling convention.
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
      .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

    SDValue Ret = CallResult.first;

    // General Dynamic: the call result is returned as is.
    if (model != TLSModel::LocalDynamic)
      return Ret;

    // Local Dynamic: result = (Hi + call result) + Lo.
    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                               Cpu0II::MO_DTP_HI);
    SDValue Hi = DAG.getNode(Cpu0ISD::Hi, DL, PtrVT, TGAHi);
    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                               Cpu0II::MO_DTP_LO);
    SDValue Lo = DAG.getNode(Cpu0ISD::Lo, DL, PtrVT, TGALo);
    SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
    return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
  }

  SDValue Offset;
  if (model == TLSModel::InitialExec) {
    // Initial Exec TLS Model
    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                             Cpu0II::MO_GOTTPREL);
    TGA = DAG.getNode(Cpu0ISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
                      TGA);
    // Load the value stored at the wrapped address.
    Offset =
        DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), TGA, MachinePointerInfo());
  } else {
    // Local Exec TLS Model
    assert(model == TLSModel::LocalExec);
    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                               Cpu0II::MO_TP_HI);
    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                               Cpu0II::MO_TP_LO);
    SDValue Hi = DAG.getNode(Cpu0ISD::Hi, DL, PtrVT, TGAHi);
    SDValue Lo = DAG.getNode(Cpu0ISD::Lo, DL, PtrVT, TGALo);
    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
  }
  // The computed value is returned directly; nothing further is added here.
  return Offset;
}

lbdex/chapters/Chapter12_1/Cpu0ISelLowering.h

    SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;

lbdex/chapters/Chapter12_1/Cpu0MCInstLower.cpp

MCOperand Cpu0MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                              MachineOperandType MOTy,
                                              unsigned Offset) const {
  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
  Cpu0MCExpr::Cpu0ExprKind TargetKind = Cpu0MCExpr::CEK_None;
  const MCSymbol *Symbol;

  switch(MO.getTargetFlags()) {
  case Cpu0II::MO_TLSGD:
    TargetKind = Cpu0MCExpr::CEK_TLSGD;
    break;
  case Cpu0II::MO_TLSLDM:
    TargetKind = Cpu0MCExpr::CEK_TLSLDM;
    break;
  case Cpu0II::MO_DTP_HI:
    TargetKind = Cpu0MCExpr::CEK_DTP_HI;
    break;
  case Cpu0II::MO_DTP_LO:
    TargetKind = Cpu0MCExpr::CEK_DTP_LO;
    break;
  case Cpu0II::MO_GOTTPREL:
    TargetKind = Cpu0MCExpr::CEK_GOTTPREL;
    break;
  case Cpu0II::MO_TP_HI:
    TargetKind = Cpu0MCExpr::CEK_TP_HI;
    break;
  case Cpu0II::MO_TP_LO:
    TargetKind = Cpu0MCExpr::CEK_TP_LO;
    break;
  ...
  }
  ...
}
JonathantekiiMac:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch12_thread_var.cpp -emit-llvm -std=c++11 -o ch12_thread_var.bc
JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llvm-dis ch12_thread_var.bc -o -
; ModuleID = 'ch12_thread_var.bc'
source_filename = "ch12_thread_var.bc"
target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
target triple = "mips-unknown-linux-gnu"

@a = thread_local global i32 0, align 4
@b = thread_local global i32 0, align 4

; Function Attrs: nounwind
define i32 @_Z15test_thread_varv() #0 {
  store i32 2, i32* @a, align 4
  %1 = load i32, i32* @a, align 4
  ret i32 %1
}

define i32 @_Z17test_thread_var_2v() #1 {
  %1 = call i32* @_ZTW1b()
  store i32 3, i32* %1, align 4
  %2 = call i32* @_ZTW1b()
  %3 = load i32, i32* %2, align 4
  ret i32 %3
}

define weak_odr hidden i32* @_ZTW1b() {
  ret i32* @b
}

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
"true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-ma
th"="false" "stack-protector-buffer-size"="8" "target-cpu"="mips32r2" "target-fe
atures"="+mips32r2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
o-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="fals
e" "stack-protector-buffer-size"="8" "target-cpu"="mips32r2" "target-features"="
+mips32r2" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"Apple LLVM version 7.0.0 (clang-700.1.76)"}
JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch12_thread_var.bc
-o -
  .text
  .section .mdebug.abiO32
  .previous
  .file  "ch12_thread_var.bc"
  .globl  _Z15test_thread_varv
  .p2align  2
  .type  _Z15test_thread_varv,@function
  .ent  _Z15test_thread_varv    # @_Z15test_thread_varv
_Z15test_thread_varv:
  .frame  $fp,16,$lr
  .mask   0x00005000,-4
  .set  noreorder
  .cpload  $t9
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -16
  st  $lr, 12($sp)            # 4-byte Folded Spill
  st  $fp, 8($sp)             # 4-byte Folded Spill
  move   $fp, $sp
  .cprestore  8
  ld  $t9, %call16(__tls_get_addr)($gp)
  ori  $4, $gp, %tlsgd(a)
  jalr  $t9
  nop
  ld  $gp, 8($fp)
  addiu  $3, $zero, 2
  st  $3, 0($2)
  addu  $2, $zero, $3
  move   $sp, $fp
  ld  $fp, 8($sp)             # 4-byte Folded Reload
  ld  $lr, 12($sp)            # 4-byte Folded Reload
  addiu  $sp, $sp, 16
  ret  $lr
  nop
  .set  macro
  .set  reorder
  .end  _Z15test_thread_varv
$func_end0:
  .size  _Z15test_thread_varv, ($func_end0)-_Z15test_thread_varv

  .globl  _Z17test_thread_var_2v
  .p2align  2
  .type  _Z17test_thread_var_2v,@function
  .ent  _Z17test_thread_var_2v  # @_Z17test_thread_var_2v
_Z17test_thread_var_2v:
  .cfi_startproc
  .frame  $fp,16,$lr
  .mask   0x00005000,-4
  .set  noreorder
  .cpload  $t9
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -16
$tmp0:
  .cfi_def_cfa_offset 16
  st  $lr, 12($sp)            # 4-byte Folded Spill
  st  $fp, 8($sp)             # 4-byte Folded Spill
$tmp1:
  .cfi_offset 14, -4
$tmp2:
  .cfi_offset 12, -8
  move   $fp, $sp
$tmp3:
  .cfi_def_cfa_register 12
  .cprestore  8
  ld  $t9, %call16(_ZTW1b)($gp)
  jalr  $t9
  nop
  ld  $gp, 8($fp)
  addiu  $3, $zero, 3
  st  $3, 0($2)
  ld  $t9, %call16(_ZTW1b)($gp)
  jalr  $t9
  nop
  ld  $gp, 8($fp)
  ld  $2, 0($2)
  move   $sp, $fp
  ld  $fp, 8($sp)             # 4-byte Folded Reload
  ld  $lr, 12($sp)            # 4-byte Folded Reload
  addiu  $sp, $sp, 16
  ret  $lr
  nop
  .set  macro
  .set  reorder
  .end  _Z17test_thread_var_2v
$func_end1:
  .size  _Z17test_thread_var_2v, ($func_end1)-_Z17test_thread_var_2v
  .cfi_endproc

  .hidden  _ZTW1b
  .weak  _ZTW1b
  .p2align  2
  .type  _ZTW1b,@function
  .ent  _ZTW1b                  # @_ZTW1b
_ZTW1b:
  .cfi_startproc
  .frame  $sp,16,$lr
  .mask   0x00004000,-4
  .set  noreorder
  .cpload  $t9
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -16
$tmp4:
  .cfi_def_cfa_offset 16
  st  $lr, 12($sp)            # 4-byte Folded Spill
$tmp5:
  .cfi_offset 14, -4
  .cprestore  8
  ld  $t9, %call16(__tls_get_addr)($gp)
  ori  $4, $gp, %tlsgd(b)
  jalr  $t9
  nop
  ld  $gp, 8($sp)
  ld  $lr, 12($sp)            # 4-byte Folded Reload
  addiu  $sp, $sp, 16
  ret  $lr
  nop
  .set  macro
  .set  reorder
  .end  _ZTW1b
$func_end2:
  .size  _ZTW1b, ($func_end2)-_ZTW1b
  .cfi_endproc

  .type  a,@object               # @a
  .section  .tbss,"awT",@nobits
  .globl  a
  .p2align  2
a:
  .4byte  0                       # 0x0
  .size  a, 4

  .type  b,@object               # @b
  .globl  b
  .p2align  2
b:
  .4byte  0                       # 0x0
  .size  b, 4


  .ident  "Apple LLVM version 7.0.0 (clang-700.1.76)"
  .section  ".note.GNU-stack","",@progbits

In pic mode, the __thread variable is accessed by calling the function __tls_get_addr with the address of the thread variable. The C++11 standard thread_local variable is accessed by calling the function _ZTW1b, which in turn calls __tls_get_addr to get the address of the thread_local variable. In static mode, the thread variable is accessed by machine instructions as follows,

JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llc -march=cpu0 -relocation-model=static -filetype=asm
ch12_thread_var.bc -o -
  .text
  .section .mdebug.abiO32
  .previous
  .file  "ch12_thread_var.bc"
  .globl  _Z15test_thread_varv
  .p2align  2
  .type  _Z15test_thread_varv,@function
  .ent  _Z15test_thread_varv    # @_Z15test_thread_varv
_Z15test_thread_varv:
  .frame  $fp,8,$lr
  .mask   0x00001000,-4
  .set  noreorder
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -8
  st  $fp, 4($sp)             # 4-byte Folded Spill
  move   $fp, $sp
  lui  $2, %tp_hi(a)
  ori  $3, $2, %tp_lo(a)
  addiu  $2, $zero, 2
  st  $2, 0($3)
  move   $sp, $fp
  ld  $fp, 4($sp)             # 4-byte Folded Reload
  addiu  $sp, $sp, 8
  ret  $lr
  nop
  .set  macro
  .set  reorder
  .end  _Z15test_thread_varv
$func_end0:
  .size  _Z15test_thread_varv, ($func_end0)-_Z15test_thread_varv

  .globl  _Z17test_thread_var_2v
  .p2align  2
  .type  _Z17test_thread_var_2v,@function
  .ent  _Z17test_thread_var_2v  # @_Z17test_thread_var_2v
_Z17test_thread_var_2v:
  .cfi_startproc
  .frame  $fp,16,$lr
  .mask   0x00005000,-4
  .set  noreorder
  .set  nomacro
# BB#0:
  addiu  $sp, $sp, -16
$tmp0:
  .cfi_def_cfa_offset 16
  st  $lr, 12($sp)            # 4-byte Folded Spill
  st  $fp, 8($sp)             # 4-byte Folded Spill
$tmp1:
  .cfi_offset 14, -4
$tmp2:
  .cfi_offset 12, -8
  move   $fp, $sp
$tmp3:
  .cfi_def_cfa_register 12
  jsub  _ZTW1b
  nop
  addiu  $3, $zero, 3
  st  $3, 0($2)
  jsub  _ZTW1b
  nop
  ld  $2, 0($2)
  move   $sp, $fp
  ld  $fp, 8($sp)             # 4-byte Folded Reload
  ld  $lr, 12($sp)            # 4-byte Folded Reload
  addiu  $sp, $sp, 16
  ret  $lr
  nop
  .set  macro
  .set  reorder
  .end  _Z17test_thread_var_2v
$func_end1:
  .size  _Z17test_thread_var_2v, ($func_end1)-_Z17test_thread_var_2v
  .cfi_endproc

  .hidden  _ZTW1b
  .weak  _ZTW1b
  .p2align  2
  .type  _ZTW1b,@function
  .ent  _ZTW1b                  # @_ZTW1b
_ZTW1b:
  .cfi_startproc
  .frame  $sp,0,$lr
  .mask   0x00000000,0
  .set  noreorder
  .set  nomacro
# BB#0:
  lui  $2, %tp_hi(b)
  ori  $2, $2, %tp_lo(b)
  ret  $lr
  nop
  .set  macro
  .set  reorder
  .end  _ZTW1b
$func_end2:
  .size  _ZTW1b, ($func_end2)-_ZTW1b
  .cfi_endproc

  .type  a,@object               # @a
  .section  .tbss,"awT",@nobits
  .globl  a
  .p2align  2
a:
  .4byte  0                       # 0x0
  .size  a, 4

  .type  b,@object               # @b
  .globl  b
  .p2align  2
b:
  .4byte  0                       # 0x0
  .size  b, 4


  .ident  "Apple LLVM version 7.0.0 (clang-700.1.76)"
  .section  ".note.GNU-stack","",@progbits

While Mips uses the rdhwr instruction to access thread variables, as shown below, Cpu0 accesses thread variables without inventing any new instruction. The thread variables are kept in a thread-variable memory region that is accessed through %tp_hi and %tp_lo; furthermore, this region of memory is protected by the kernel-mode program. Thus, a user-mode program cannot access this area of memory, which leaves no room for a malicious program to exploit.

JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/cmake_debug_build/
Debug/bin/llc -march=mips -relocation-model=static -filetype=asm
ch12_thread_var.bc -o -
  ...
  lui $1, %tprel_hi(a)
  ori $1, $1, %tprel_lo(a)
  .set  push
  .set  mips32r2
  rdhwr $3, $29
  .set  pop
  addu  $1, $3, $1
  addiu $2, $zero, 2
  sw  $2, 0($1)
  addiu $2, $zero, 2
  ...

In static mode, a thread variable is similar to a global variable. In general, they are the same in IR, DAG and machine code translation. They are listed in the following tables. You can check them with the debug option enabled.

Table 32 The DAGs of thread variable of static mode
stage DAG
IR load i32* @a, align 4;
Legalized selection DAG (add Cpu0ISD::Hi Cpu0ISD::Lo);
Instruction Selection ori $2, $zero, %tp_lo(a);
lui $3, %tp_hi(a);
addu $3, $3, $2;
Table 33 The DAGs of thread_local variable of static mode
stage DAG
IR ret i32* @b;
Legalized selection DAG %0=(add Cpu0ISD::Hi Cpu0ISD::Lo);...
Instruction Selection ori $2, $zero, %tp_lo(b);
lui $3, %tp_hi(b);
addu $3, $3, $2;

Atomic

Traditionally, C uses different APIs provided by the OS or a library to support multi-threaded programming, for example the POSIX thread API on Unix/Linux, the MS Windows API, etc. To achieve the synchronization needed to solve race conditions between threads, each OS provides its own lock or semaphore functions to the programmer. But this solution is OS dependent. Since C++11, programmers can use atomics to write code that runs on every platform, because threads and atomics are part of the C++ standard. Besides portability, the other important benefit is that the compiler can generate high-performance code using the target's hardware instructions rather than counting on the lock() function only [3] [4] [5].

In order to support atomics in C++ and Java, LLVM provides the atomic IR instructions described in [6] [7].

To support the LLVM atomic IR instructions, the following code is added to Chapter12_1.

lbdex/chapters/Chapter12_1/Disassembler/Cpu0Disassembler.cpp

// Decoder for instructions whose DecoderMethod is "DecodeMem" (the FMem
// format).  Only the Chapter12_1 addition is shown; `...` elides the rest of
// the function.
static DecodeStatus DecodeMem(MCInst &Inst,
                              unsigned Insn,
                              uint64_t Address,
                              const void *Decoder) {
  // SC is defined through SCBase with a register result ($dst) in addition to
  // its register source ($ra), so it gets one more register operand than the
  // other memory instructions.
  // NOTE(review): `Reg` is computed in the elided part of this function --
  // confirm it is the decoded $ra register.
  if(Inst.getOpcode() == Cpu0::SC){
    Inst.addOperand(MCOperand::createReg(Reg));
  }
  ...
}

lbdex/chapters/Chapter12_1/Cpu0InstrInfo.td

// Type profile of the sync node: no results, one operand, and that operand
// (index 0) is an i32.
def SDT_Sync             : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
// Target DAG node Cpu0ISD::Sync; it carries a chain (SDNPHasChain).
def Cpu0Sync : SDNode<"Cpu0ISD::Sync", SDT_Sync, [SDNPHasChain]>;
// Pointer-typed register operand used as the address of the atomic pseudo
// instructions.  It is decoded with DecodeCPURegsRegisterClass.
def PtrRC : Operand<iPTR> {
  let MIOperandInfo = (ops ptr_rc);
  let DecoderMethod = "DecodeCPURegsRegisterClass";
}
// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
// Pseudo instruction: $dst receives the result of applying pattern fragment
// Op to the pointer $ptr and the value $incr.
class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;

// Atomic Compare & Swap.
// Pseudo instruction: $dst receives the result of applying pattern fragment
// Op to the pointer $ptr, the compare value $cmp and the swap value $swap.
class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
// Base class of the load-linked instruction: FMem format, result register
// $ra, memory operand $addr, no selection pattern (it is only created by the
// custom inserter), flagged as a load.
class LLBase<bits<8> Opc, string opstring, RegisterClass RC, Operand Mem> :
  FMem<Opc, (outs RC:$ra), (ins Mem:$addr),
       !strconcat(opstring, "\t$ra, $addr"), [], IILoad> {
  let mayLoad = 1;
}

// Base class of the store-conditional instruction: FMem format, flagged as a
// store.  The value to store ($ra) and the result ($dst) are tied to the same
// register by the constraint below, so only $ra appears in the asm string.
class SCBase<bits<8> Opc, string opstring, RegisterOperand RO, Operand Mem> :
  FMem<Opc, (outs RO:$dst), (ins RO:$ra, Mem:$addr),
       !strconcat(opstring, "\t$ra, $addr"), [], IIStore> {
  let mayStore = 1;
  let Constraints = "$ra = $dst";
}
// Atomic pseudo instructions, available under predicate Ch12_1.
// usesCustomInserter = 1 makes llvm call EmitInstrWithCustomInserter() for
// each of them; that function expands them into ll/sc sequences.
// One pseudo exists per operation and per access width (8, 16 and 32 bits).
let Predicates = [Ch12_1] in {
let usesCustomInserter = 1 in {
  // Read-modify-write operations.
  def ATOMIC_LOAD_ADD_I8   : Atomic2Ops<atomic_load_add_8, CPURegs>;
  def ATOMIC_LOAD_ADD_I16  : Atomic2Ops<atomic_load_add_16, CPURegs>;
  def ATOMIC_LOAD_ADD_I32  : Atomic2Ops<atomic_load_add_32, CPURegs>;
  def ATOMIC_LOAD_SUB_I8   : Atomic2Ops<atomic_load_sub_8, CPURegs>;
  def ATOMIC_LOAD_SUB_I16  : Atomic2Ops<atomic_load_sub_16, CPURegs>;
  def ATOMIC_LOAD_SUB_I32  : Atomic2Ops<atomic_load_sub_32, CPURegs>;
  def ATOMIC_LOAD_AND_I8   : Atomic2Ops<atomic_load_and_8, CPURegs>;
  def ATOMIC_LOAD_AND_I16  : Atomic2Ops<atomic_load_and_16, CPURegs>;
  def ATOMIC_LOAD_AND_I32  : Atomic2Ops<atomic_load_and_32, CPURegs>;
  def ATOMIC_LOAD_OR_I8    : Atomic2Ops<atomic_load_or_8, CPURegs>;
  def ATOMIC_LOAD_OR_I16   : Atomic2Ops<atomic_load_or_16, CPURegs>;
  def ATOMIC_LOAD_OR_I32   : Atomic2Ops<atomic_load_or_32, CPURegs>;
  def ATOMIC_LOAD_XOR_I8   : Atomic2Ops<atomic_load_xor_8, CPURegs>;
  def ATOMIC_LOAD_XOR_I16  : Atomic2Ops<atomic_load_xor_16, CPURegs>;
  def ATOMIC_LOAD_XOR_I32  : Atomic2Ops<atomic_load_xor_32, CPURegs>;
  def ATOMIC_LOAD_NAND_I8  : Atomic2Ops<atomic_load_nand_8, CPURegs>;
  def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, CPURegs>;
  def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, CPURegs>;

  // Unconditional exchange.
  def ATOMIC_SWAP_I8       : Atomic2Ops<atomic_swap_8, CPURegs>;
  def ATOMIC_SWAP_I16      : Atomic2Ops<atomic_swap_16, CPURegs>;
  def ATOMIC_SWAP_I32      : Atomic2Ops<atomic_swap_32, CPURegs>;

  // Compare and exchange.
  def ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap<atomic_cmp_swap_8, CPURegs>;
  def ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap<atomic_cmp_swap_16, CPURegs>;
  def ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap<atomic_cmp_swap_32, CPURegs>;
}
}
// The "sync" instruction, selected from the Cpu0Sync DAG node.  It has no
// register operands and is marked as having side effects.
let Predicates = [Ch12_1] in {
let hasSideEffects = 1 in
def SYNC : Cpu0Inst<(outs), (ins i32imm:$stype), "sync $stype",
                    [(Cpu0Sync imm:$stype)], NoItinerary, FrmOther>
{
  // Encoding: opcode 0x60, the 5-bit stype immediate in bits 10-6, and zero
  // in bits 25-11 and 5-0.
  bits<5> stype;
  let Opcode = 0x60;
  let Inst{25-11} = 0;
  let Inst{10-6} = stype;
  let Inst{5-0} = 0;
}
}
/// Load-linked, Store-conditional
/// ll uses opcode 0x61 and sc uses opcode 0x62; both take a CPURegs register
/// and a `mem` operand.  They are the building blocks emitted by the atomic
/// custom inserter.
def LL      : LLBase<0x61, "ll", CPURegs, mem>;
def SC      : SCBase<0x62, "sc", RegisterOperand<CPURegs>, mem>;
// A bare "sync" in assembly is accepted as "sync 0".
def : Cpu0InstAlias<"sync",
                    (SYNC 0), 1>;

lbdex/chapters/Chapter12_1/Cpu0ISelLowering.h

    /// Expands the pseudo instructions marked usesCustomInserter (the
    /// ATOMIC_* pseudos) and returns the block in which code emission
    /// continues.
    MachineBasicBlock *
    EmitInstrWithCustomInserter(MachineInstr &MI,
                                MachineBasicBlock *MBB) const override;
    /// Custom lowering of ISD::ATOMIC_FENCE into a Cpu0ISD::Sync node.
    SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
    /// Always answers true: Cpu0 asks for fences to be inserted for every
    /// atomic instruction.
    bool shouldInsertFencesForAtomic(const Instruction *I) const override {
      return true;
    }

    /// Emit a sign-extension using shl/sra appropriately.
    /// \p Size is the width in bytes of the value held in \p SrcReg; the
    /// sign-extended 32-bit result is written to \p DstReg.  The instructions
    /// are appended to \p BB, which is also the return value.
    MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr &MI,
                                                MachineBasicBlock *BB,
                                                unsigned Size, unsigned DstReg,
                                                unsigned SrcReg) const;
    /// Expand a 32-bit atomic read-modify-write pseudo into an ll/sc loop.
    /// BinOpcode == 0 means atomic swap; Nand selects the nand operation.
    MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
                    unsigned Size, unsigned BinOpcode, bool Nand = false) const;
    /// Same as emitAtomicBinary, for 1- and 2-byte accesses (Size in bytes).
    MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
                    MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
                    bool Nand = false) const;
    /// Expand a 32-bit atomic compare-and-swap pseudo into an ll/sc loop.
    MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
                                  MachineBasicBlock *BB, unsigned Size) const;
    /// Same as emitAtomicCmpSwap, for 1- and 2-byte accesses (Size in bytes).
    MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
                                  MachineBasicBlock *BB, unsigned Size) const;

lbdex/chapters/Chapter12_1/Cpu0ISelLowering.cpp

// Maps a Cpu0ISD opcode to its printable name.  Only the case added in
// Chapter12_1 is shown; `...` elides the rest of the function.
const char *Cpu0TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case Cpu0ISD::Sync:              return "Cpu0ISD::Sync";
  ...
}
// Constructor excerpt: only the operation actions added in Chapter12_1 are
// shown (the listing is cut before the end of the constructor).
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
                                       const Cpu0Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {

  // Atomic loads and stores of i32 and i64 are marked Expand, i.e. they are
  // left to the legalizer instead of being matched directly.
  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i32,    Expand);
  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i64,    Expand);
  setOperationAction(ISD::ATOMIC_STORE,      MVT::i32,    Expand);
  setOperationAction(ISD::ATOMIC_STORE,      MVT::i64,    Expand);
// Dispatches the operations registered as Custom to their lowering routine.
// Only the case added in Chapter12_1 is shown; `...` elides the other cases.
SDValue Cpu0TargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
  switch (Op.getOpcode())
  {
  // Fences are turned into Cpu0ISD::Sync by lowerATOMIC_FENCE().
  case ISD::ATOMIC_FENCE:       return lowerATOMIC_FENCE(Op, DAG);
  ...
}
// Entry point for every pseudo marked usesCustomInserter: picks the ll/sc
// expansion that matches the pseudo's operation and access width and returns
// the block in which emission continues.
MachineBasicBlock *
Cpu0TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  // Read-modify-write and swap pseudos: word accesses use the plain loop,
  // byte and half-word accesses use the masked part-word loop.
  auto expandRMW = [&](unsigned Bytes, unsigned Opc, bool IsNand = false) {
    return Bytes == 4 ? emitAtomicBinary(MI, BB, Bytes, Opc, IsNand)
                      : emitAtomicBinaryPartword(MI, BB, Bytes, Opc, IsNand);
  };
  // Compare-and-swap pseudos, same split by width.
  auto expandCAS = [&](unsigned Bytes) {
    return Bytes == 4 ? emitAtomicCmpSwap(MI, BB, Bytes)
                      : emitAtomicCmpSwapPartword(MI, BB, Bytes);
  };

  switch (MI.getOpcode()) {
  case Cpu0::ATOMIC_LOAD_ADD_I8:   return expandRMW(1, Cpu0::ADDu);
  case Cpu0::ATOMIC_LOAD_ADD_I16:  return expandRMW(2, Cpu0::ADDu);
  case Cpu0::ATOMIC_LOAD_ADD_I32:  return expandRMW(4, Cpu0::ADDu);

  case Cpu0::ATOMIC_LOAD_SUB_I8:   return expandRMW(1, Cpu0::SUBu);
  case Cpu0::ATOMIC_LOAD_SUB_I16:  return expandRMW(2, Cpu0::SUBu);
  case Cpu0::ATOMIC_LOAD_SUB_I32:  return expandRMW(4, Cpu0::SUBu);

  case Cpu0::ATOMIC_LOAD_AND_I8:   return expandRMW(1, Cpu0::AND);
  case Cpu0::ATOMIC_LOAD_AND_I16:  return expandRMW(2, Cpu0::AND);
  case Cpu0::ATOMIC_LOAD_AND_I32:  return expandRMW(4, Cpu0::AND);

  case Cpu0::ATOMIC_LOAD_OR_I8:    return expandRMW(1, Cpu0::OR);
  case Cpu0::ATOMIC_LOAD_OR_I16:   return expandRMW(2, Cpu0::OR);
  case Cpu0::ATOMIC_LOAD_OR_I32:   return expandRMW(4, Cpu0::OR);

  case Cpu0::ATOMIC_LOAD_XOR_I8:   return expandRMW(1, Cpu0::XOR);
  case Cpu0::ATOMIC_LOAD_XOR_I16:  return expandRMW(2, Cpu0::XOR);
  case Cpu0::ATOMIC_LOAD_XOR_I32:  return expandRMW(4, Cpu0::XOR);

  // Nand has no single opcode; it is requested through the flag.
  case Cpu0::ATOMIC_LOAD_NAND_I8:  return expandRMW(1, 0, true);
  case Cpu0::ATOMIC_LOAD_NAND_I16: return expandRMW(2, 0, true);
  case Cpu0::ATOMIC_LOAD_NAND_I32: return expandRMW(4, 0, true);

  // Opcode 0 without the nand flag means a plain exchange.
  case Cpu0::ATOMIC_SWAP_I8:       return expandRMW(1, 0);
  case Cpu0::ATOMIC_SWAP_I16:      return expandRMW(2, 0);
  case Cpu0::ATOMIC_SWAP_I32:      return expandRMW(4, 0);

  case Cpu0::ATOMIC_CMP_SWAP_I8:   return expandCAS(1);
  case Cpu0::ATOMIC_CMP_SWAP_I16:  return expandCAS(2);
  case Cpu0::ATOMIC_CMP_SWAP_I32:  return expandCAS(4);

  default:
    break;
  }
  llvm_unreachable("Unexpected instr type to insert");
}

// Expands a 32-bit atomic read-modify-write pseudo into an ll/sc retry loop.
//
// MI operands: 0 = result register (receives the value loaded by ll),
// 1 = pointer, 2 = right-hand value of the operation.
//
// This function also handles Cpu0::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
// Cpu0::ATOMIC_LOAD_NAND_I32 (when Nand == true).
//
// Cpu0 has no "nor" instruction, so the bitwise complement needed by nand is
// computed as "xor x, allones", where allones is materialized with
// "addiu allones, $0, -1".  (An xor against $0 would leave the value
// unchanged and store "oldval & incr" instead of its complement.)
//
// MI is erased; the returned block (exitMBB) holds the instructions that
// followed MI in BB.
MachineBasicBlock *Cpu0TargetLowering::emitAtomicBinary(
    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
    bool Nand) const {
  assert((Size == 4) && "Unsupported size for EmitAtomicBinary.");

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned OldVal = MI.getOperand(0).getReg();
  unsigned Ptr = MI.getOperand(1).getReg();
  unsigned Incr = MI.getOperand(2).getReg();

  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
  unsigned Success = RegInfo.createVirtualRegister(RC);

  // insert new blocks after the current block
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineFunction::iterator It = ++BB->getIterator();
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  //  thisMBB:
  //    ...
  //    addiu allones, $0, -1        (nand only)
  //    fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);
  loopMBB->addSuccessor(loopMBB);
  loopMBB->addSuccessor(exitMBB);

  unsigned AndRes = 0;
  unsigned AllOnes = 0;
  if (Nand) {
    AndRes = RegInfo.createVirtualRegister(RC);
    AllOnes = RegInfo.createVirtualRegister(RC);
    // Loop invariant, so it is emitted once in front of the loop.
    BuildMI(BB, DL, TII->get(Cpu0::ADDiu), AllOnes)
      .addReg(Cpu0::ZERO).addImm(-1);
  }

  //  loopMBB:
  //    ll oldval, 0(ptr)
  //    <binop> storeval, oldval, incr
  //    sc success, storeval, 0(ptr)
  //    beq success, $0, loopMBB
  BB = loopMBB;
  BuildMI(BB, DL, TII->get(Cpu0::LL), OldVal).addReg(Ptr).addImm(0);
  if (Nand) {
    //  and andres, oldval, incr
    //  xor storeval, andres, allones     # storeval = ~(oldval & incr)
    BuildMI(BB, DL, TII->get(Cpu0::AND), AndRes).addReg(OldVal).addReg(Incr);
    BuildMI(BB, DL, TII->get(Cpu0::XOR), StoreVal)
      .addReg(AndRes).addReg(AllOnes);
  } else if (BinOpcode) {
    //  <binop> storeval, oldval, incr
    BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
  } else {
    // atomic swap: the incoming value is stored as is.
    StoreVal = Incr;
  }
  // A zero result from sc sends control back to the top of the loop.
  BuildMI(BB, DL, TII->get(Cpu0::SC), Success)
    .addReg(StoreVal).addReg(Ptr).addImm(0);
  BuildMI(BB, DL, TII->get(Cpu0::BEQ))
    .addReg(Success).addReg(Cpu0::ZERO).addMBB(loopMBB);

  MI.eraseFromParent(); // The instruction is gone now.

  return exitMBB;
}

// Sign-extends the low Size bytes of SrcReg to 32 bits and writes the result
// to DstReg, using a left shift followed by an arithmetic right shift by the
// same amount.  The two instructions are appended to BB, which is returned.
// MI only provides the debug location.
MachineBasicBlock *Cpu0TargetLowering::emitSignExtendToI32InReg(
    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
    unsigned SrcReg) const {
  // Size counts bytes, not bits: only part-word sizes yield a shift amount
  // in the range 1..31.
  assert(Size > 0 && Size < 4 &&
         "Sign extension is only defined for part-word sizes.");

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
  unsigned ScrReg = RegInfo.createVirtualRegister(RC);

  // Number of bits above the value inside a 32-bit register.
  const int64_t ShiftImm = 32 - static_cast<int64_t>(Size) * 8;

  BuildMI(BB, DL, TII->get(Cpu0::SHL), ScrReg).addReg(SrcReg).addImm(ShiftImm);
  BuildMI(BB, DL, TII->get(Cpu0::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm);

  return BB;
}

// Expands a 1- or 2-byte atomic read-modify-write pseudo.  ll/sc are issued
// on the aligned word that contains the byte/half-word, so the operation is
// applied to a shifted copy of the operand and only the addressed field of
// the word is replaced.
//
// MI operands: 0 = result register (sign-extended old value of the field),
// 1 = pointer, 2 = right-hand value of the operation.
// BinOpcode == 0 means atomic swap; Nand == true selects nand.
//
// Cpu0 has no "nor" instruction, so bitwise complements are computed as
// "xor x, allones" with allones = "addiu allones, $0, -1".  (An xor against
// $0 would leave the value unchanged: the inverted mask would equal the mask
// and the store would wipe the neighbouring bytes of the word.)
//
// MI is erased; the returned block (exitMBB) holds the instructions that
// followed MI in BB.
MachineBasicBlock *Cpu0TargetLowering::emitAtomicBinaryPartword(
    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
    bool Nand) const {
  assert((Size == 1 || Size == 2) &&
         "Unsupported size for EmitAtomicBinaryPartial.");

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dest = MI.getOperand(0).getReg();
  unsigned Ptr = MI.getOperand(1).getReg();
  unsigned Incr = MI.getOperand(2).getReg();

  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
  unsigned Mask = RegInfo.createVirtualRegister(RC);
  unsigned AllOnes = RegInfo.createVirtualRegister(RC);
  unsigned InvMask = RegInfo.createVirtualRegister(RC);
  unsigned NewVal = RegInfo.createVirtualRegister(RC);
  unsigned OldVal = RegInfo.createVirtualRegister(RC);
  unsigned Incr2 = RegInfo.createVirtualRegister(RC);
  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
  unsigned AndRes = RegInfo.createVirtualRegister(RC);
  unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
  unsigned Success = RegInfo.createVirtualRegister(RC);

  // insert new blocks after the current block
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineFunction::iterator It = ++BB->getIterator();

  MF->insert(It, loopMBB);
  MF->insert(It, sinkMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  BB->addSuccessor(loopMBB);
  loopMBB->addSuccessor(loopMBB);
  loopMBB->addSuccessor(sinkMBB);
  sinkMBB->addSuccessor(exitMBB);

  //  thisMBB:
  //    addiu   masklsb2,$0,-4                # 0xfffffffc
  //    and     alignedaddr,ptr,masklsb2
  //    andi    ptrlsb2,ptr,3
  //    shl     shiftamt,ptrlsb2,3            # big endian: xori first
  //    ori     maskupper,$0,255              # 0xff (0xffff for 2 bytes)
  //    shlv    mask,maskupper,shiftamt
  //    addiu   allones,$0,-1
  //    xor     invmask,mask,allones          # invmask = ~mask
  //    shlv    incr2,incr,shiftamt

  int64_t MaskImm = (Size == 1) ? 255 : 65535;
  BuildMI(BB, DL, TII->get(Cpu0::ADDiu), MaskLSB2)
    .addReg(Cpu0::ZERO).addImm(-4);
  BuildMI(BB, DL, TII->get(Cpu0::AND), AlignedAddr)
    .addReg(Ptr).addReg(MaskLSB2);
  BuildMI(BB, DL, TII->get(Cpu0::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
  if (Subtarget.isLittle()) {
    BuildMI(BB, DL, TII->get(Cpu0::SHL), ShiftAmt).addReg(PtrLSB2).addImm(3);
  } else {
    // Big endian: the byte offset inside the word is mirrored before it is
    // turned into a bit shift.
    unsigned Off = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, DL, TII->get(Cpu0::XORi), Off)
      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
    BuildMI(BB, DL, TII->get(Cpu0::SHL), ShiftAmt).addReg(Off).addImm(3);
  }
  BuildMI(BB, DL, TII->get(Cpu0::ORi), MaskUpper)
    .addReg(Cpu0::ZERO).addImm(MaskImm);
  BuildMI(BB, DL, TII->get(Cpu0::SHLV), Mask)
    .addReg(MaskUpper).addReg(ShiftAmt);
  BuildMI(BB, DL, TII->get(Cpu0::ADDiu), AllOnes)
    .addReg(Cpu0::ZERO).addImm(-1);
  BuildMI(BB, DL, TII->get(Cpu0::XOR), InvMask).addReg(Mask).addReg(AllOnes);
  BuildMI(BB, DL, TII->get(Cpu0::SHLV), Incr2).addReg(Incr).addReg(ShiftAmt);

  // atomic.load.binop
  // loopMBB:
  //   ll      oldval,0(alignedaddr)
  //   binop   binopres,oldval,incr2
  //   and     newval,binopres,mask
  //   and     maskedoldval0,oldval,invmask
  //   or      storeval,maskedoldval0,newval
  //   sc      success,storeval,0(alignedaddr)
  //   beq     success,$0,loopMBB

  // atomic.swap
  // loopMBB:
  //   ll      oldval,0(alignedaddr)
  //   and     newval,incr2,mask
  //   and     maskedoldval0,oldval,invmask
  //   or      storeval,maskedoldval0,newval
  //   sc      success,storeval,0(alignedaddr)
  //   beq     success,$0,loopMBB

  BB = loopMBB;
  BuildMI(BB, DL, TII->get(Cpu0::LL), OldVal).addReg(AlignedAddr).addImm(0);
  if (Nand) {
    //  and andres, oldval, incr2
    //  xor binopres, andres, allones        # binopres = ~(oldval & incr2)
    //  and newval, binopres, mask
    BuildMI(BB, DL, TII->get(Cpu0::AND), AndRes).addReg(OldVal).addReg(Incr2);
    BuildMI(BB, DL, TII->get(Cpu0::XOR), BinOpRes)
      .addReg(AndRes).addReg(AllOnes);
    BuildMI(BB, DL, TII->get(Cpu0::AND), NewVal).addReg(BinOpRes).addReg(Mask);
  } else if (BinOpcode) {
    //  <binop> binopres, oldval, incr2
    //  and newval, binopres, mask
    BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
    BuildMI(BB, DL, TII->get(Cpu0::AND), NewVal).addReg(BinOpRes).addReg(Mask);
  } else { // atomic.swap
    //  and newval, incr2, mask
    BuildMI(BB, DL, TII->get(Cpu0::AND), NewVal).addReg(Incr2).addReg(Mask);
  }

  // Keep the bytes outside the field, then merge in the new field value.
  BuildMI(BB, DL, TII->get(Cpu0::AND), MaskedOldVal0)
    .addReg(OldVal).addReg(InvMask);
  BuildMI(BB, DL, TII->get(Cpu0::OR), StoreVal)
    .addReg(MaskedOldVal0).addReg(NewVal);
  BuildMI(BB, DL, TII->get(Cpu0::SC), Success)
    .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
  BuildMI(BB, DL, TII->get(Cpu0::BEQ))
    .addReg(Success).addReg(Cpu0::ZERO).addMBB(loopMBB);

  //  sinkMBB:
  //    and     maskedoldval1,oldval,mask
  //    shrv    srlres,maskedoldval1,shiftamt
  //    sign_extend dest,srlres
  BB = sinkMBB;

  BuildMI(BB, DL, TII->get(Cpu0::AND), MaskedOldVal1)
    .addReg(OldVal).addReg(Mask);
  BuildMI(BB, DL, TII->get(Cpu0::SHRV), SrlRes)
      .addReg(MaskedOldVal1).addReg(ShiftAmt);
  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);

  MI.eraseFromParent(); // The instruction is gone now.

  return exitMBB;
}

// Lowers the 32-bit compare-and-swap pseudo to two new blocks:
//   loop1MBB loads the current value with ll and leaves for exitMBB when it
//            differs from the expected value;
//   loop2MBB tries to store the new value with sc and goes back to loop1MBB
//            when sc reports zero.
// MI operands: 0 = result (loaded value), 1 = pointer, 2 = expected value,
// 3 = new value.  MI is erased; the returned block (exitMBB) holds the
// instructions that followed MI in BB.
MachineBasicBlock * Cpu0TargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
                                                          MachineBasicBlock *BB,
                                                          unsigned Size) const {
  assert((Size == 4) && "Unsupported size for EmitAtomicCmpSwap.");

  MachineFunction *Fn = BB->getParent();
  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
  DebugLoc Loc = MI.getDebugLoc();

  const unsigned Loaded   = MI.getOperand(0).getReg();
  const unsigned Addr     = MI.getOperand(1).getReg();
  const unsigned Expected = MI.getOperand(2).getReg();
  const unsigned Desired  = MI.getOperand(3).getReg();

  // Receives the outcome of the store-conditional.
  const unsigned ScStatus = Fn->getRegInfo().createVirtualRegister(
      getRegClassFor(MVT::getIntegerVT(Size * 8)));

  // The three new blocks go right behind the current one, in this order.
  const BasicBlock *IRBlock = BB->getBasicBlock();
  MachineBasicBlock *CompareMBB = Fn->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *StoreMBB = Fn->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *DoneMBB = Fn->CreateMachineBasicBlock(IRBlock);
  MachineFunction::iterator InsertPos = std::next(BB->getIterator());
  Fn->insert(InsertPos, CompareMBB);
  Fn->insert(InsertPos, StoreMBB);
  Fn->insert(InsertPos, DoneMBB);

  // Everything behind MI, and BB's outgoing edges, now belong to DoneMBB.
  DoneMBB->splice(DoneMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  DoneMBB->transferSuccessorsAndUpdatePHIs(BB);

  // CFG: BB -> compare; compare -> done | store; store -> compare | done.
  BB->addSuccessor(CompareMBB);
  CompareMBB->addSuccessor(DoneMBB);
  CompareMBB->addSuccessor(StoreMBB);
  StoreMBB->addSuccessor(CompareMBB);
  StoreMBB->addSuccessor(DoneMBB);

  // CompareMBB:
  //   ll  loaded, 0(addr)
  //   bne loaded, expected, DoneMBB
  BuildMI(CompareMBB, Loc, InstrInfo->get(Cpu0::LL), Loaded)
    .addReg(Addr).addImm(0);
  BuildMI(CompareMBB, Loc, InstrInfo->get(Cpu0::BNE))
    .addReg(Loaded).addReg(Expected).addMBB(DoneMBB);

  // StoreMBB:
  //   sc  status, desired, 0(addr)
  //   beq status, $0, CompareMBB
  BuildMI(StoreMBB, Loc, InstrInfo->get(Cpu0::SC), ScStatus)
    .addReg(Desired).addReg(Addr).addImm(0);
  BuildMI(StoreMBB, Loc, InstrInfo->get(Cpu0::BEQ))
    .addReg(ScStatus).addReg(Cpu0::ZERO).addMBB(CompareMBB);

  // The pseudo has been fully replaced.
  MI.eraseFromParent();

  return DoneMBB;
}

// Expands a 1- or 2-byte atomic compare-and-swap pseudo.  ll/sc are issued on
// the aligned word that contains the byte/half-word; the compare and new
// values are masked and shifted to the field's position, and only that field
// of the word is replaced.
//
// MI operands: 0 = result register (sign-extended old value of the field),
// 1 = pointer, 2 = compare value, 3 = new value.
//
// Cpu0 has no "nor" instruction, so the inverted mask is computed as
// "xor mask, allones" with allones = "addiu allones, $0, -1".  (An xor
// against $0 would leave the mask unchanged, so the store would wipe the
// neighbouring bytes of the word and OR the new value onto the old field.)
//
// MI is erased; the returned block (exitMBB) holds the instructions that
// followed MI in BB.
MachineBasicBlock *
Cpu0TargetLowering::emitAtomicCmpSwapPartword(MachineInstr &MI,
                                              MachineBasicBlock *BB,
                                              unsigned Size) const {
  assert((Size == 1 || Size == 2) &&
      "Unsupported size for EmitAtomicCmpSwapPartial.");

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dest    = MI.getOperand(0).getReg();
  unsigned Ptr     = MI.getOperand(1).getReg();
  unsigned CmpVal  = MI.getOperand(2).getReg();
  unsigned NewVal  = MI.getOperand(3).getReg();

  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
  unsigned Mask = RegInfo.createVirtualRegister(RC);
  unsigned AllOnes = RegInfo.createVirtualRegister(RC);
  unsigned InvMask = RegInfo.createVirtualRegister(RC);
  unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
  unsigned OldVal = RegInfo.createVirtualRegister(RC);
  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
  unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
  unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
  unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
  unsigned Success = RegInfo.createVirtualRegister(RC);

  // insert new blocks after the current block
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineFunction::iterator It = ++BB->getIterator();

  MF->insert(It, loop1MBB);
  MF->insert(It, loop2MBB);
  MF->insert(It, sinkMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  BB->addSuccessor(loop1MBB);
  loop1MBB->addSuccessor(sinkMBB);
  loop1MBB->addSuccessor(loop2MBB);
  loop2MBB->addSuccessor(loop1MBB);
  loop2MBB->addSuccessor(sinkMBB);
  sinkMBB->addSuccessor(exitMBB);

  // FIXME: computation of shiftednewval can be moved to loop2MBB.
  //  thisMBB:
  //    addiu   masklsb2,$0,-4                # 0xfffffffc
  //    and     alignedaddr,ptr,masklsb2
  //    andi    ptrlsb2,ptr,3
  //    shl     shiftamt,ptrlsb2,3            # big endian: xori first
  //    ori     maskupper,$0,255              # 0xff (0xffff for 2 bytes)
  //    shlv    mask,maskupper,shiftamt
  //    addiu   allones,$0,-1
  //    xor     invmask,mask,allones          # invmask = ~mask
  //    andi    maskedcmpval,cmpval,255       # (65535 for 2 bytes)
  //    shlv    shiftedcmpval,maskedcmpval,shiftamt
  //    andi    maskednewval,newval,255       # (65535 for 2 bytes)
  //    shlv    shiftednewval,maskednewval,shiftamt
  int64_t MaskImm = (Size == 1) ? 255 : 65535;
  BuildMI(BB, DL, TII->get(Cpu0::ADDiu), MaskLSB2)
    .addReg(Cpu0::ZERO).addImm(-4);
  BuildMI(BB, DL, TII->get(Cpu0::AND), AlignedAddr)
    .addReg(Ptr).addReg(MaskLSB2);
  BuildMI(BB, DL, TII->get(Cpu0::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
  if (Subtarget.isLittle()) {
    BuildMI(BB, DL, TII->get(Cpu0::SHL), ShiftAmt).addReg(PtrLSB2).addImm(3);
  } else {
    // Big endian: the byte offset inside the word is mirrored before it is
    // turned into a bit shift.
    unsigned Off = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, DL, TII->get(Cpu0::XORi), Off)
      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
    BuildMI(BB, DL, TII->get(Cpu0::SHL), ShiftAmt).addReg(Off).addImm(3);
  }
  BuildMI(BB, DL, TII->get(Cpu0::ORi), MaskUpper)
    .addReg(Cpu0::ZERO).addImm(MaskImm);
  BuildMI(BB, DL, TII->get(Cpu0::SHLV), Mask)
    .addReg(MaskUpper).addReg(ShiftAmt);
  BuildMI(BB, DL, TII->get(Cpu0::ADDiu), AllOnes)
    .addReg(Cpu0::ZERO).addImm(-1);
  BuildMI(BB, DL, TII->get(Cpu0::XOR), InvMask).addReg(Mask).addReg(AllOnes);
  BuildMI(BB, DL, TII->get(Cpu0::ANDi), MaskedCmpVal)
    .addReg(CmpVal).addImm(MaskImm);
  BuildMI(BB, DL, TII->get(Cpu0::SHLV), ShiftedCmpVal)
    .addReg(MaskedCmpVal).addReg(ShiftAmt);
  BuildMI(BB, DL, TII->get(Cpu0::ANDi), MaskedNewVal)
    .addReg(NewVal).addImm(MaskImm);
  BuildMI(BB, DL, TII->get(Cpu0::SHLV), ShiftedNewVal)
    .addReg(MaskedNewVal).addReg(ShiftAmt);

  //  loop1MBB:
  //    ll      oldval,0(alignedaddr)
  //    and     maskedoldval0,oldval,mask
  //    bne     maskedoldval0,shiftedcmpval,sinkMBB
  BB = loop1MBB;
  BuildMI(BB, DL, TII->get(Cpu0::LL), OldVal).addReg(AlignedAddr).addImm(0);
  BuildMI(BB, DL, TII->get(Cpu0::AND), MaskedOldVal0)
    .addReg(OldVal).addReg(Mask);
  BuildMI(BB, DL, TII->get(Cpu0::BNE))
    .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);

  //  loop2MBB:
  //    and     maskedoldval1,oldval,invmask
  //    or      storeval,maskedoldval1,shiftednewval
  //    sc      success,storeval,0(alignedaddr)
  //    beq     success,$0,loop1MBB
  BB = loop2MBB;
  // Keep the bytes outside the field, then merge in the new field value.
  BuildMI(BB, DL, TII->get(Cpu0::AND), MaskedOldVal1)
    .addReg(OldVal).addReg(InvMask);
  BuildMI(BB, DL, TII->get(Cpu0::OR), StoreVal)
    .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
  BuildMI(BB, DL, TII->get(Cpu0::SC), Success)
      .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
  BuildMI(BB, DL, TII->get(Cpu0::BEQ))
      .addReg(Success).addReg(Cpu0::ZERO).addMBB(loop1MBB);

  //  sinkMBB:
  //    shrv    srlres,maskedoldval0,shiftamt
  //    sign_extend dest,srlres
  BB = sinkMBB;

  BuildMI(BB, DL, TII->get(Cpu0::SHRV), SrlRes)
      .addReg(MaskedOldVal0).addReg(ShiftAmt);
  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);

  MI.eraseFromParent();   // The instruction is gone now.

  return exitMBB;
}
// Replaces an ISD::ATOMIC_FENCE node by a Cpu0ISD::Sync node that takes the
// fence's first operand and an i32 constant "stype" (currently always 0).
SDValue Cpu0TargetLowering::lowerATOMIC_FENCE(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FIXME: Need pseudo-fence for 'singlethread' fences
  // FIXME: Set SType for weaker fences where supported/appropriate.
  SDLoc Loc(Op);
  const unsigned SyncType = 0;
  SDValue FirstOperand = Op.getOperand(0);
  SDValue SyncTypeNode = DAG.getConstant(SyncType, Loc, MVT::i32);
  return DAG.getNode(Cpu0ISD::Sync, Loc, MVT::Other, FirstOperand,
                     SyncTypeNode);
}

lbdex/chapters/Chapter12_1/Cpu0RegisterInfo.h

  /// Code Generation virtual methods...
  /// Returns the register class used to hold pointer values.  It is needed
  /// because the atomic pseudo instructions take a `ptr_rc` operand (PtrRC in
  /// Cpu0InstrInfo.td).
  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
                                                unsigned Kind) const override;

lbdex/chapters/Chapter12_1/Cpu0RegisterInfo.cpp

// Pointers are always held in the CPURegs register class, whatever the
// function or the requested pointer kind.
const TargetRegisterClass *
Cpu0RegisterInfo::getPointerRegClass(const MachineFunction &MF,
                                     unsigned Kind) const {
  const TargetRegisterClass *PointerRC = &Cpu0::CPURegsRegClass;
  return PointerRC;
}

lbdex/chapters/Chapter12_1/Cpu0SEISelLowering.cpp

// Constructor excerpt: only the action added in Chapter12_1 is shown; `...`
// elides the rest.
Cpu0SETargetLowering::Cpu0SETargetLowering(const Cpu0TargetMachine &TM,
                                           const Cpu0Subtarget &STI)
    : Cpu0TargetLowering(TM, STI) {
  // Custom: ATOMIC_FENCE is handed to LowerOperation(), which forwards it to
  // lowerATOMIC_FENCE().
  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
  ...
}

lbdex/chapters/Chapter12_1/Cpu0TargetMachine.cpp

/// Cpu0 Code Generator Pass Configuration Options.
/// Excerpt: only the override added in Chapter12_1 is shown; `...` elides the
/// other members.
class Cpu0PassConfig : public TargetPassConfig {
  /// Adds the Cpu0 specific IR-level passes (the atomic expansion pass).
  void addIRPasses() override;
  ...
};
// Runs the generic IR passes first, then appends the atomic expansion pass
// created for the Cpu0 target machine.
void Cpu0PassConfig::addIRPasses() {
  TargetPassConfig::addIRPasses();

  auto &TM = getCpu0TargetMachine();
  addPass(createAtomicExpandPass(&TM));
}

Since the SC instruction uses the RegisterOperand type in Cpu0InstrInfo.td and SC uses the FMem node whose DecoderMethod is “DecodeMem”, the DecodeMem() of Cpu0Disassembler.cpp needs to be changed as above.

The atomic nodes defined inside “let usesCustomInserter = 1 in” of Cpu0InstrInfo.td tell llvm to call EmitInstrWithCustomInserter() of Cpu0ISelLowering.cpp. For example, “def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_8, CPURegs>;” makes llvm call EmitInstrWithCustomInserter() with Machine Instruction Opcode “ATOMIC_LOAD_ADD_I8” when it meets the IR “atomicrmw add i8*” (see Table 34).

The shouldInsertFencesForAtomic() override in Cpu0ISelLowering.h returns true (older llvm versions used “setInsertFencesForAtomic(true);” in Cpu0ISelLowering.cpp for the same purpose). With it, the pass created by createAtomicExpandPass() in addIRPasses() of Cpu0TargetMachine.cpp will create the llvm IR ATOMIC_FENCE. Next, the lowerATOMIC_FENCE() of Cpu0ISelLowering.cpp will create Cpu0ISD::Sync when it meets the IR ATOMIC_FENCE, because of “setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);” in Cpu0SEISelLowering.cpp. Finally, the pattern defined in Cpu0InstrInfo.td translates it into the instruction “sync” through “def SYNC” and the alias “SYNC 0”.

This part of the Cpu0 backend code is the same as Mips, except that Cpu0 has no “nor” instruction.

List the atomic IRs, corresponding DAGs and Opcode as the following table.

Table 34 The atomic related IRs, their corresponding DAGs and Opcode of Cpu0ISelLowering.cpp
IR DAG Opcode
load atomic AtomicLoad ATOMIC_CMP_SWAP_XXX
store atomic AtomicStore ATOMIC_SWAP_XXX
atomicrmw add AtomicLoadAdd ATOMIC_LOAD_ADD_XXX
atomicrmw sub AtomicLoadSub ATOMIC_LOAD_SUB_XXX
atomicrmw xor AtomicLoadXor ATOMIC_LOAD_XOR_XXX
atomicrmw and AtomicLoadAnd ATOMIC_LOAD_AND_XXX
atomicrmw nand AtomicLoadNand ATOMIC_LOAD_NAND_XXX
atomicrmw or AtomicLoadOr ATOMIC_LOAD_OR_XXX
cmpxchg AtomicCmpSwapWithSuccess ATOMIC_CMP_SWAP_XXX
atomicrmw xchg AtomicLoadSwap ATOMIC_SWAP_XXX

Input files atomics.ll and atomics-fences.ll include the llvm atomic IRs test. Input files ch12_atomics.cpp and ch12_atomics-fences.cpp are the C++ files for generating llvm atomic IRs. The C++ files need to run with clang options “clang++ -pthread -std=c++11”.

[1]http://llvm.org/docs/ExceptionHandling.html
[2]http://en.wikipedia.org/wiki/Thread-local_storage
[3]https://en.wikipedia.org/wiki/Memory_model_%28programming%29
[4]http://stackoverflow.com/questions/6319146/c11-introduced-a-standardized-memory-model-what-does-it-mean-and-how-is-it-g
[5]http://herbsutter.com/2013/02/11/atomic-weapons-the-c-memory-model-and-modern-hardware/
[6]http://llvm.org/docs/Atomics.html
[7]http://llvm.org/docs/LangRef.html#ordering