Other data type¶

Until now, we have only handled the int and long types, both of which are 32 bits in size. This chapter introduces other types, such as pointers and the types that are not 32 bits in size, which include bool, char, short int and long long.

Local variable pointer ¶

To support pointer to local variable, add this code fragment in Cpu0InstrInfo.td and Cpu0InstPrinter.cpp as follows,

lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td

// Pointer-typed operand made of two sub-operands (GPROut, simm16). It is
// printed by printMemOperandEA and encoded by getMemEncoding.
def mem_ea : Operand<iPTR> {
  let PrintMethod = "printMemOperandEA";
  let MIOperandInfo = (ops GPROut, simm16);
  let EncoderMethod = "getMemEncoding";
}

// Instruction class (format FMem, opcode 0x09) whose pattern sets $ra to the
// address matched by the `addr` pattern on the $addr operand.
class EffectiveAddress<string instr_asm, RegisterClass RC, Operand Mem> :
  FMem<0x09, (outs RC:$ra), (ins Mem:$addr),
     instr_asm, [(set RC:$ra, addr:$addr)], IIAlu>;
// NOTE(review): the brace below has no visible opener in this excerpt;
// presumably it closes an elided `let Predicates = [Ch7_1] in {` -- confirm.
}

// FrameIndexes are legalized when they are operands of load/store
// instructions. That does not happen for stack address copies, so an
// add op with the mem ComplexPattern is used, which lets the stack address
// copy be matched. It's similar to Sparc LEA_ADDRi
def LEA_ADDiu : EffectiveAddress<"addiu\t$ra, $addr", CPURegs, mem_ea> {
  // Marked codegen-only; it is printed with the addiu mnemonic.
  let isCodeGenOnly = 1;
}

lbdex/chapters/Chapter3_2/InstPrinter/Cpu0InstPrinter.h

  /// Prints the two consecutive operands opNum and opNum+1 of MI to O,
  /// separated by ", " (PrintMethod of the mem_ea operand).
  void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);

lbdex/chapters/Chapter3_2/InstPrinter/Cpu0InstPrinter.cpp

// mem_ea is a DAG data node of Cpu0InstrInfo.td and, unlike an opcode node,
// cannot be disabled by ch7_1; that is why this printer exists this early.
void Cpu0InstPrinter::printMemOperandEA(const MCInst *MI, int opNum,
                                        raw_ostream &O) {
  // A stack location used by a non-load/store instruction is printed the
  // same way as the operands of a normal 3 operand instruction:
  // "<first>, <second>".
  const int FirstIdx = opNum;
  const int SecondIdx = opNum + 1;

  printOperand(MI, FirstIdx, O);
  O << ", ";
  printOperand(MI, SecondIdx, O);
}

As the comment in Cpu0InstPrinter.cpp says, printMemOperandEA was already added in the early Chapter 3_2, since the DAG data node mem_ea of Cpu0InstrInfo.td cannot be disabled by ch7_1_localpointer; only opcode nodes can be disabled. Running ch7_1_localpointer.cpp with the code of Chapter7_1/, which supports pointers to local variables, gives the following result,

lbdex/input/ch7_1_localpointer.cpp

// Takes the address of a local variable and reads the value back through
// the pointer; returns 3.
int test_local_pointer()
{
  int b = 3;
  
  // Address of a local (stack) variable: the case this section adds.
  int* p = &b;

  return *p;
}

118-165-66-82:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_localpointer.cpp -emit-llvm -o ch7_1_localpointer.bc
118-165-66-82:input Jonathan$ llvm-dis ch7_1_localpointer.bc -o -
...
; Function Attrs: nounwind
define i32 @_Z18test_local_pointerv() #0 {
  %b = alloca i32, align 4
  %p = alloca i32*, align 4
  store i32 3, i32* %b, align 4
  store i32* %b, i32** %p, align 4
  %1 = load i32** %p, align 4
  %2 = load i32* %1, align 4
  ret i32 %2
}
...

118-165-66-82:input Jonathan$ /Users/Jonathan/llvm/test/build/bin/llc
-march=cpu0 -relocation-model=pic -filetype=asm
ch7_1_localpointer.bc -o -
  ...
        addiu $sp, $sp, -8
        addiu $2, $zero, 3
        st    $2, 4($fp)
        addiu $2, $fp, 4     // b address is 4($sp)
        st    $2, 0($fp)
        ld    $2, 4($fp)
        addiu $sp, $sp, 8
        ret   $lr
  ...

char, short int and bool ¶

To support signed/unsigned type of char and short int, adding the following code to Chapter7_1/.

lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td

// 16-bit load/store pattern fragments wrapped in AlignedLoad/AlignedStore;
// the sext/zext/truncstore variants are used by the halfword instructions
// below.
def sextloadi16_a   : AlignedLoad<sextloadi16>;
def zextloadi16_a   : AlignedLoad<zextloadi16>;
def extloadi16_a    : AlignedLoad<extloadi16>;

def truncstorei16_a : AlignedStore<truncstorei16>;

// Byte (lb/lbu/sb) and halfword (lh/lhu/sh) memory instructions. Each one is
// selected from the load/store fragment given as its last argument.
let Predicates = [Ch7_1] in {
def LB     : LoadM32<0x03, "lb",  sextloadi8>;
def LBu    : LoadM32<0x04, "lbu", zextloadi8>;
def SB     : StoreM32<0x05, "sb", truncstorei8>;
def LH     : LoadM32<0x06, "lh",  sextloadi16_a>;
def LHu    : LoadM32<0x07, "lhu", zextloadi16_a>;
def SH     : StoreM32<0x08, "sh", truncstorei16_a>;
}

Run Chapter7_1/ with ch7_1_char_in_struct.cpp will get the following result.

lbdex/input/ch7_1_char_in_struct.cpp

// Test aggregate mixing one short with five char fields, so that field
// accesses need halfword and byte loads/stores.
struct Date
{
  short year;
  char month;
  char day;
  char hour;
  char minute;
  char second;
};

// Global byte array read by test_char().
unsigned char b[4] = {'a', 'b', 'c', '\0'};

// Exercises byte loads/stores: reads b[1] twice (once kept unsigned, once
// cast to char) and reads two char fields of a locally initialized Date.
// Always returns 0; only the generated loads/stores are of interest.
int test_char()
{
  unsigned char a = b[1];
  char c = (char)b[1];
  Date date1 = {2012, (char)11, (char)25, (char)9, (char)40, (char)15};
  char m = date1.month;
  char s = date1.second;

  return 0;
}

118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llvm-dis ch7_1_char_in_struct.bc -o -
define i32 @_Z9test_charv() #0 {
  %a = alloca i8, align 1
  %c = alloca i8, align 1
  %date1 = alloca %struct.Date, align 2
  %m = alloca i8, align 1
  %s = alloca i8, align 1
  %1 = load i8* getelementptr inbounds ([4 x i8]* @b, i32 0, i32 1), align 1
  store i8 %1, i8* %a, align 1
  %2 = load i8* getelementptr inbounds ([4 x i8]* @b, i32 0, i32 1), align 1
  store i8 %2, i8* %c, align 1
  %3 = bitcast %struct.Date* %date1 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %3, i8* bitcast ({ i16, i8, i8, i8,
  i8, i8, i8 }* @_ZZ9test_charvE5date1 to i8*), i32 8, i32 2, i1 false)
  %4 = getelementptr inbounds %struct.Date* %date1, i32 0, i32 1
  %5 = load i8* %4, align 1
  store i8 %5, i8* %m, align 1
  %6 = getelementptr inbounds %struct.Date* %date1, i32 0, i32 5
  %7 = load i8* %6, align 1
  store i8 %7, i8* %s, align 1
  ret i32 0
}

118-165-64-245:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_char_in_struct.cpp -emit-llvm -o ch7_1_char_in_struct.bc
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm
ch7_1_char_in_struct.bc -o -
  ...
# BB#0:                                 # %entry
  addiu $sp, $sp, -24
  lui $2, %got_hi(b)
  addu  $2, $2, $gp
  ld  $2, %got_lo(b)($2)
  lbu $3, 1($2)
  sb  $3, 20($fp)
  lbu $2, 1($2)
  sb  $2, 16($fp)
  ld  $2, %got($_ZZ9test_charvE5date1)($gp)
  addiu $2, $2, %lo($_ZZ9test_charvE5date1)
  lhu $3, 4($2)
  shl $3, $3, 16
  lhu $4, 6($2)
  or  $3, $3, $4
  st  $3, 12($fp) // store hour, minute and second on 12($sp)
  lhu $3, 2($2)
  lhu $2, 0($2)
  shl $2, $2, 16
  or  $2, $2, $3
  st  $2, 8($fp)    // store year, month and day on 8($sp)
  lbu $2, 10($fp)   // m = date1.month;
  sb  $2, 4($fp)
  lbu $2, 14($fp)   // s = date1.second;
  sb  $2, 0($fp)
  addiu $sp, $sp, 24
  ret $lr
  .set  macro
  .set  reorder
  .end  _Z9test_charv
$tmp1:
  .size _Z9test_charv, ($tmp1)-_Z9test_charv

  .type b,@object               # @b
  .data
  .globl  b
b:
  .asciz   "abc"
  .size b, 4

  .type $_ZZ9test_charvE5date1,@object # @_ZZ9test_charvE5date1
  .section  .rodata.cst8,"aM",@progbits,8
  .align  1
$_ZZ9test_charvE5date1:
  .2byte  2012                    # 0x7dc
  .byte 11                      # 0xb
  .byte 25                      # 0x19
  .byte 9                       # 0x9
  .byte 40                      # 0x28
  .byte 15                      # 0xf
  .space  1
  .size $_ZZ9test_charvE5date1, 8

Run Chapter7_1/ with ch7_1_char_short.cpp will get the following result.

lbdex/input/ch7_1_char_short.cpp

// Widens a char holding 0x80 to int. With char treated as signed (the IR
// shown for this file uses sext) the value is -128, so the result is -126.
int test_signed_char()
{
  char a = 0x80;
  int i = (signed int)a;
  i = i + 2; // i = (-128+2) = -126

  return i;
}

// Widens an unsigned char holding 0x80 to unsigned int (zero extension);
// returns 130.
int test_unsigned_char()
{
  unsigned char c = 0x80;
  unsigned int ui = (unsigned int)c;
  ui = ui + 2; // ui = (128+2) = 130

  return (int)ui;
}

// Widens a short holding 0x8000 to int (sign extension); returns -32766.
int test_signed_short()
{
  short a = 0x8000;
  int i = (signed int)a;
  i = i + 2; // i = (-32768+2) = -32766

  return i;
}

// Widens an unsigned short holding 0x8000 to unsigned int (zero extension),
// then truncates the sum back into the short to exercise a 16-bit store;
// returns 32770.
int test_unsigned_short()
{
  unsigned short c = 0x8000;
  unsigned int ui = (unsigned int)c;
  ui = ui + 2; // ui = (32768+2) = 32770
  c = (unsigned short)ui;

  return (int)ui;
}

1-160-136-236:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llvm-dis ch7_1_char_short.bc -o -
  ...
define i32 @_Z16test_signed_charv() #0 {
  ...
  %1 = load i8* %a, align 1
  %2 = sext i8 %1 to i32
  ...
}

; Function Attrs: nounwind
define i32 @_Z18test_unsigned_charv() #0 {
  ...
  %1 = load i8* %c, align 1
  %2 = zext i8 %1 to i32
  ...
}

; Function Attrs: nounwind
define i32 @_Z17test_signed_shortv() #0 {
  ...
  %1 = load i16* %a, align 2
  %2 = sext i16 %1 to i32
  ...
}

; Function Attrs: nounwind
define i32 @_Z19test_unsigned_shortv() #0 {
  ...
  %1 = load i16* %c, align 2
  %2 = zext i16 %1 to i32
  ...
}

attributes #0 = { nounwind }

1-160-136-236:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=static -filetype=asm ch7_1_char_short.bc -o -
  ...
  .globl  _Z16test_signed_charv
  ...
  lb  $2, 4($sp)
  ...
  .end  _Z16test_signed_charv

  .globl  _Z18test_unsigned_charv
  ...
  lbu $2, 4($sp)
  ...
  .end  _Z18test_unsigned_charv

  .globl  _Z17test_signed_shortv
  ...
  lh  $2, 4($sp)
  ...
  .end  _Z17test_signed_shortv

  .globl  _Z19test_unsigned_shortv
  ...
  lhu $2, 4($sp)
  ...
  .end  _Z19test_unsigned_shortv
  ...

As you can see, lb/lh are used for the signed byte/short types while lbu/lhu are used for the unsigned byte/short types. To support the C type-cast (type-conversion) feature efficiently, Cpu0 provides the instruction “lb” to convert type char to int with a single instruction. The other instructions lbu, lh, lhu, sb and sh are likewise applied in the signed or unsigned conversions of the byte and short types. Their differences have been explained in Chapter 2.

To support load bool type, the following code added.

lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp

// Constructor excerpt: the settings that make bool (i1) values loadable.
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
                                       const Cpu0Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {

  // Cpu0 does not have i1 type, so use i32 for
  // setcc operations results (slt, sgt, ...).
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Extending loads from i1 (any-, zero- and sign-extending) must be
  // promoted, for every integer result type.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1,  Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1,  Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1,  Promote);
  }

  ...
}

The purpose of setBooleanContents() is described below, although I do not know it well. Without it, ch7_1_bool2.ll still works as shown below. The IR input file ch7_1_bool2.ll is used for testing here since the C++ version needs flow control, which is not supported at this point. File ch_run_backend.cpp includes the test fragment for bool as below.

include/llvm/Target/TargetLowering.h

  // The three choices differ only in what the bits above bit 0 hold.
  enum BooleanContent { // How the target represents true/false values.
    UndefinedBooleanContent,    // Only bit 0 counts, the rest can hold garbage.
    ZeroOrOneBooleanContent,        // All bits zero except for bit 0.
    ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
  };
...
protected:
  /// setBooleanContents - Specify how the target extends the result of a
  /// boolean value from i1 to a wider type.  See getBooleanContents.
  void setBooleanContents(BooleanContent Ty) { BooleanContents = Ty; }
  /// setBooleanVectorContents - Specify how the target extends the result
  /// of a vector boolean value from a vector of i1 to a wider type.  See
  /// getBooleanContents.
  void setBooleanVectorContents(BooleanContent Ty) {
    BooleanVectorContents = Ty;
  }

lbdex/input/ch7_1_bool2.ll

define zeroext i1 @verify_load_bool() #0 {
entry:
  %retval = alloca i1, align 1
  store i1 1, i1* %retval, align 1
  %0 = load i1, i1* %retval
  ret i1 %0
}

  118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
  bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch7_1_bool2.ll -o -

  .section .mdebug.abi32
  .previous
  .file "ch7_1_bool2.ll"
  .text
  .globl  verify_load_bool
  .align  2
  .type verify_load_bool,@function
  .ent  verify_load_bool        # @verify_load_bool
verify_load_bool:
  .cfi_startproc
  .frame  $sp,8,$lr
  .mask   0x00000000,0
  .set  noreorder
  .set  nomacro
# BB#0:                                 # %entry
  addiu $sp, $sp, -8
$tmp1:
  .cfi_def_cfa_offset 8
  addiu $2, $zero, 1
  sb  $2, 7($sp)
  addiu $sp, $sp, 8
  ret $lr
  .set  macro
  .set  reorder
  .end  verify_load_bool
$tmp2:
  .size verify_load_bool, ($tmp2)-verify_load_bool
  .cfi_endproc

The ch7_1_bool.cpp is the bool test version for C language. You can run with it at Chapter8_1 to get the similar result with ch7_1_bool2.ll.

lbdex/input/ch7_1_bool.cpp

// bool test written in C/C++: returns false only when a < 0. Since a is 1 it
// returns true. The `if` is why this version needs flow-control support.
bool test_load_bool()
{
  int a = 1;

  if (a < 0)
    return false;

  return true;
}

Summary as the following table.

Table 31 The C, IR, and DAG translation for char, short and bool translation (ch7_1_char_short.cpp and ch7_1_bool2.ll).¶
C	.bc	Optimized legalized selection DAG
char a =0x80;	%1 = load i8* %a, align 1
int i = (signed int)a;	%2 = sext i8 %1 to i32	load …, <…, sext from i8>
unsigned char c = 0x80;	%1 = load i8* %c, align 1
unsigned int ui = (unsigned int)c;	%2 = zext i8 %1 to i32	load …, <…, zext from i8>
short a =0x8000;	%1 = load i16* %a, align 2
int i = (signed int)a;	%2 = sext i16 %1 to i32	load …, <…, sext from i16>
unsigned short c = 0x8000;	%1 = load i16* %c, align 2
unsigned int ui = (unsigned int)c;	%2 = zext i16 %1 to i32	load …, <…, zext from i16>
c = (unsigned short)ui;	%6 = trunc i32 %5 to i16
	store i16 %6, i16* %c, align 2	store …,<…, trunc to i16>
return true;	store i1 1, i1* %retval, align 1	store …,<…, trunc to i8>

Table 32 The backend translation for char, short and bool translation (ch7_1_char_short.cpp and ch7_1_bool2.ll).¶
Optimized legalized selection DAG	Cpu0	pattern in Cpu0InstrInfo.td
load …, <…, sext from i8>	lb	LB : LoadM32<0x03, “lb”, sextloadi8>;
load …, <…, zext from i8>	lbu	LBu : LoadM32<0x04, “lbu”, zextloadi8>;
load …, <…, sext from i16>	lh	LH : LoadM32<0x06, “lh”, sextloadi16_a>;
load …, <…, zext from i16>	lhu	LHu : LoadM32<0x07, “lhu”, zextloadi16_a>;
store …,<…, trunc to i16>	sh	SH : StoreM32<0x08, “sh”, truncstorei16_a>;
store …,<…, trunc to i8>	sb	SB : StoreM32<0x05, “sb”, truncstorei8>;

long long ¶

Like Mips, the type long of Cpu0 is 32-bit and type long long is 64-bit for C language. To support type long long, we add the following code to Chapter7_1/.

lbdex/chapters/Chapter7_1/Cpu0SEISelDAGToDAG.cpp

void Cpu0SEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
                                           SDValue CmpLHS, const SDLoc &DL,
                                           SDNode *Node) const {
  unsigned Opc = InFlag.getOpcode(); (void)Opc;
  assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
          (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
         "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");

  SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
  SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
  EVT VT = LHS.getValueType();

  SDNode *Carry;
  if (Subtarget->hasCpu032II())
    Carry = CurDAG->getMachineNode(Cpu0::SLTu, DL, VT, Ops);
  else {
    SDNode *StatusWord = CurDAG->getMachineNode(Cpu0::CMP, DL, VT, Ops);
    SDValue Constant1 = CurDAG->getTargetConstant(1, DL, VT);
    Carry = CurDAG->getMachineNode(Cpu0::ANDi, DL, VT, 
                                           SDValue(StatusWord,0), Constant1);
  }
  SDNode *AddCarry = CurDAG->getMachineNode(Cpu0::ADDu, DL, VT,
                                            SDValue(Carry,0), RHS);

  CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry,0));
}

/// Hand-written selection for nodes that the tablegen-generated matcher does
/// not handle. Every case visible in this excerpt returns true after
/// selecting Node; the remainder of the function is elided.
bool Cpu0SEDAGToDAGISel::trySelect(SDNode *Node) {
  unsigned Opcode = Node->getOpcode();
  SDLoc DL(Node);

  ///
  // Instruction Selection not handled by the auto-generated
  // tablegen selection should be handled here.
  ///
  EVT NodeTy = Node->getValueType(0);
  unsigned MultOpc;

  switch(Opcode) {
  default: break;

  // SUBE/ADDE: operand 2 is the incoming flag. SUBE compares against
  // operand 0 of the flag-producing node, ADDE against that node's result
  // value 0.
  case ISD::SUBE: {
    SDValue InFlag = Node->getOperand(2);
    selectAddESubE(Cpu0::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
    return true;
  }

  case ISD::ADDE: {
    SDValue InFlag = Node->getOperand(2);
    selectAddESubE(Cpu0::ADDu, InFlag, InFlag.getValue(0), DL, Node);
    return true;
  }

  /// Mul with two results
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    MultOpc = (Opcode == ISD::UMUL_LOHI ? Cpu0::MULTu : Cpu0::MULT);

    std::pair<SDNode*, SDNode*> LoHi =
        selectMULT(Node, MultOpc, DL, NodeTy, true, true);

    // Rewire only the results that are actually used: result 0 to
    // LoHi.first, result 1 to LoHi.second.
    if (!SDValue(Node, 0).use_empty())
      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));

    if (!SDValue(Node, 1).use_empty())
      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));

    CurDAG->RemoveDeadNode(Node);
    return true;
  }

  ...
}

lbdex/chapters/Chapter7_1/Cpu0ISelLowering.h

  // Class excerpt: only the override added in this chapter is shown.
  class Cpu0TargetLowering : public TargetLowering  {

    // Overrides the TargetLowering policy on folding a constant offset into
    // a GlobalAddress node.
    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;

  ...
}

lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp

// Constructor excerpt: the settings added for 64-bit (long long) shifts.
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
                                       const Cpu0Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {

  // Handle i64 shifts: the SHL/SRA/SRL_PARTS nodes on i32 are marked Expand.
  setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Expand);
  setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Expand);
  setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Expand);

  ...
}

The code added to Cpu0ISelLowering.cpp is for the shift operations that support the 64-bit type long long. Applying the operators << and >> to 64-bit variables creates the DAG nodes SHL_PARTS, SRA_PARTS and SRL_PARTS, which operate on the 32-bit halves of the operands during the llvm DAG translation. File ch9_7.cpp, which tests the 64-bit shift operations, cannot be run at this point. It will be verified in the later chapter “Function call”.

Run Chapter7_1 with ch7_1_longlong.cpp to get the result as follows,

lbdex/input/ch7_1_longlong.cpp

// Exercises 64-bit add, subtract and multiply (including a 32x32->64
// multiply of widened ints) plus a constant arithmetic shift. The comments
// give each 64-bit value as (high word, low word).
long long test_longlong()
{
  long long a = 0x300000002;
  long long b = 0x100000001;
  int a1 = 0x3001000;
  int b1 = 0x2001000;
  
  long long c = a + b;   // c = 0x00000004,00000003
  long long d = a - b;   // d = 0x00000002,00000001
  long long e = a * b;   // e = 0x00000005,00000002
  long long f = (long long)a1 * (long long)b1; // f = 0x00060050,01000000

  long long g = ((-7 * 8) + 1) >> 4; // g = -55/16=-3.4375=-4

  return (c+d+e+f+g); // (0x0006005b,01000002) = (393307,16777218)
}

1-160-134-62:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_longlong.cpp -emit-llvm -o ch7_1_longlong.bc
1-160-134-62:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -mcpu=cpu032I -relocation-model=pic -filetype=asm
ch7_1_longlong.bc -o -
  ...
# BB#0:
        addiu $sp, $sp, -72
        st    $8, 68($fp)             # 4-byte Folded Spill
        addiu $2, $zero, 2
        st    $2, 60($fp)
        addiu $2, $zero, 3
        st    $2, 56($fp)
        addiu $2, $zero, 1
        st    $2, 52($fp)
        st    $2, 48($fp)
        lui   $2, 768
        ori   $2, $2, 4096
        st    $2, 44($fp)
        lui   $2, 512
        ori   $2, $2, 4096
        st    $2, 40($fp)
        ld    $2, 52($fp)
        ld    $3, 60($fp)
        addu  $3, $3, $2
        ld    $4, 56($fp)
        ld    $5, 48($fp)
        st    $3, 36($fp)
        cmp   $sw, $3, $2
        andi  $2, $sw, 1
        addu  $2, $2, $5
        addu  $2, $4, $2
        st    $2, 32($fp)
        ld    $2, 52($fp)
        ld    $3, 60($fp)
        subu  $4, $3, $2
        ld    $5, 56($fp)
        ld    $t9, 48($fp)
        st    $4, 28($fp)
        cmp   $sw, $3, $2
        andi  $2, $sw, 1
        addu  $2, $2, $t9
        subu  $2, $5, $2
        st    $2, 24($fp)
        ld    $2, 52($fp)
        ld    $3, 60($fp)
        multu $3, $2
        mflo  $4
        mfhi  $5
        ld    $t9, 56($fp)
        ld    $7, 48($fp)
        st    $4, 20($fp)
        mul   $3, $3, $7
        addu  $3, $5, $3
        mul   $2, $t9, $2
        addu  $2, $3, $2
        st    $2, 16($fp)
        ld    $2, 40($fp)
        ld    $3, 44($fp)
        mult  $3, $2
        mflo  $2
        mfhi  $4
        st    $2, 12($fp)
        st    $4, 8($fp)
        ld    $5, 28($fp)
        ld    $3, 36($fp)
        addu  $t9, $3, $5
        ld    $7, 20($fp)
        addu  $8, $t9, $7
        addu  $3, $8, $2
        cmp   $sw, $3, $2
        andi  $2, $sw, 1
        addu  $2, $2, $4
        cmp   $sw, $t9, $5
        st    $sw, 4($fp)             # 4-byte Folded Spill
        cmp   $sw, $8, $7
        andi  $4, $sw, 1
        ld    $5, 16($fp)
        addu  $4, $4, $5
        ld    $sw, 4($fp)             # 4-byte Folded Reload
        andi  $5, $sw, 1
        ld    $t9, 24($fp)
        addu  $5, $5, $t9
        ld    $t9, 32($fp)
        addu  $5, $t9, $5
        addu  $4, $5, $4
        addu  $2, $4, $2
        ld    $8, 68($fp)             # 4-byte Folded Reload
        addiu $sp, $sp, 72
        ret   $lr
  ...

float and double ¶

Cpu0 only has integer instructions at this point. For float operations, Cpu0 backend will call the library function to translate integer to float as follows,

lbdex/input/ch7_1_fmul.c

/* 
~/llvm/debug/build/bin/clang -target mips-unknown-linux-gnu -emit-llvm -S ch7_1_fmul.c
        ...
        %mul = fmul float %0, %1

~/llvm/debug/build/bin/llc -march=mips ch7_1_fmul.ll -relocation-model=static -o -
        ...
	mul.s	$f0, $f1, $f0

~/llvm/test/build/bin/llc -march=cpu0 ch7_1_fmul.ll -relocation-model=static -o -
         ...
        jsub __mulsf3
*/

/* Multiplies two floats. Mips selects a hardware float multiply for the fmul,
   while Cpu0 emits a call to the library routine __mulsf3. */
float ch7_1_fmul(float a, float b) {
  float c = a * b;
  return c;
}

This float (or double) function call for Cpu0 will be supported after the chapter on function calls. For hardware cost reasons, many CPUs have no hardware float instructions. They call library functions to carry out float operations. Mips separates the float operations into a separate co-processor for those applications that need intensive float computation.

In order to support the floating-point library (part of compiler-rt) [2], the following code is added to support the instructions clz and clo. Although clz and clo are implemented in compiler-rt, these two instructions are integer operations, and providing them in hardware gives a better speed-up in floating-point applications.

lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td

// NOTE(review): the closing braces of the two `let Predicates` groups are
// not visible in this excerpt -- confirm against the full file.
let Predicates = [Ch7_1] in {
// Count Leading Ones/Zeros in Word
// CountLeading0 matches ctlz of $rb directly.
class CountLeading0<bits<8> op, string instr_asm, RegisterClass RC>:
  FA<op, (outs GPROut:$ra), (ins RC:$rb),
     !strconcat(instr_asm, "\t$ra, $rb"),
     [(set GPROut:$ra, (ctlz RC:$rb))], II_CLZ> {
  let rc = 0;
  let shamt = 0;
}

// CountLeading1 matches ctlz of the complemented $rb, i.e. the number of
// leading one bits of $rb.
class CountLeading1<bits<8> op, string instr_asm, RegisterClass RC>:
  FA<op, (outs GPROut:$ra), (ins RC:$rb),
     !strconcat(instr_asm, "\t$ra, $rb"),
     [(set GPROut:$ra, (ctlz (not RC:$rb)))], II_CLO> {
  let rc = 0;
  let shamt = 0;
}

let Predicates = [Ch7_1] in {
/// Count Leading
def CLZ : CountLeading0<0x15, "clz", CPURegs>;
def CLO : CountLeading1<0x16, "clo", CPURegs>;

Array and struct support ¶

LLVM uses getelementptr to represent the array and struct type in C. Please reference here [1]. For ch7_1_globalstructoffset.cpp, the llvm IR as follows,

lbdex/input/ch7_1_globalstructoffset.cpp

// Three int fields; with 4-byte ints, `day` lies at byte offset 8.
struct Date
{
  int year;
  int month;
  int day;
};

// Globals read by test_struct() at a non-zero offset from their symbol.
Date date = {2012, 10, 12};
int a[3] = {2012, 10, 12};

// Reads a struct field and an array element of two globals, each at a
// constant offset from the global's address, and returns their sum.
int test_struct()
{
  int day = date.day;
  int i = a[1];

  return (i+day); // 10+12=22
}

// ch7_1_globalstructoffset.ll
; ModuleID = 'ch7_1_globalstructoffset.bc'
...
%struct.Date = type { i32, i32, i32 }

@date = global %struct.Date { i32 2012, i32 10, i32 12 }, align 4
@a = global [3 x i32] [i32 2012, i32 10, i32 12], align 4

; Function Attrs: nounwind
define i32 @_Z11test_structv() #0 {
  %day = alloca i32, align 4
  %i = alloca i32, align 4
  %1 = load i32* getelementptr inbounds (%struct.Date* @date, i32 0, i32 2), align 4
  store i32 %1, i32* %day, align 4
  %2 = load i32* getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1), align 4
  store i32 %2, i32* %i, align 4
  %3 = load i32* %i, align 4
  %4 = load i32* %day, align 4
  %5 = add nsw i32 %3, %4
  ret i32 %5
}

Run Chapter6_1/ with ch7_1_globalstructoffset.bc on static mode will get the incorrect asm file as follows,

1-160-134-62:input Jonathan$ /Users/Jonathan/llvm/test/build/bin/
llc -march=cpu0 -relocation-model=static -filetype=asm
ch7_1_globalstructoffset.bc -o -
  ...
  lui $2, %hi(date)
  ori $2, $2, %lo(date)
  ld  $2, 0($2)   // the correct one is   ld  $2, 8($2)
  ...

For “day = date.day”, the correct one is “ld $2, 8($2)”, not “ld $2, 0($2)”, since date.day is offset 8(date) ( Type int is 4 bytes in Cpu0, and the date.day has fields year and month before it). Let’s use debug option in llc to see what’s wrong,

jonathantekiimac:input Jonathan$ /Users/Jonathan/llvm/test/
build/bin/llc -march=cpu0 -debug -relocation-model=static
-filetype=asm ch6_2.bc -o ch6_2.cpu0.static.s
...
=== main
Initial selection DAG: BB#0 'main:entry'
SelectionDAG has 20 nodes:
  0x7f7f5b02d210: i32 = undef [ORD=1]

      0x7f7f5ac10590: ch = EntryToken [ORD=1]

      0x7f7f5b02d010: i32 = Constant<0> [ORD=1]

      0x7f7f5b02d110: i32 = FrameIndex<0> [ORD=1]

      0x7f7f5b02d210: <multiple use>
    0x7f7f5b02d310: ch = store 0x7f7f5ac10590, 0x7f7f5b02d010, 0x7f7f5b02d110,
    0x7f7f5b02d210<ST4[%retval]> [ORD=1]

      0x7f7f5b02d410: i32 = GlobalAddress<%struct.Date* @date> 0 [ORD=2]

      0x7f7f5b02d510: i32 = Constant<8> [ORD=2]

    0x7f7f5b02d610: i32 = add 0x7f7f5b02d410, 0x7f7f5b02d510 [ORD=2]

    0x7f7f5b02d210: <multiple use>
  0x7f7f5b02d710: i32,ch = load 0x7f7f5b02d310, 0x7f7f5b02d610, 0x7f7f5b02d210
  <LD4[getelementptr inbounds (%struct.Date* @date, i32 0, i32 2)]> [ORD=3]

  0x7f7f5b02db10: i64 = Constant<4>

      0x7f7f5b02d710: <multiple use>
      0x7f7f5b02d710: <multiple use>
      0x7f7f5b02d810: i32 = FrameIndex<1> [ORD=4]

      0x7f7f5b02d210: <multiple use>
    0x7f7f5b02d910: ch = store 0x7f7f5b02d710:1, 0x7f7f5b02d710, 0x7f7f5b02d810,
     0x7f7f5b02d210<ST4[%day]> [ORD=4]

      0x7f7f5b02da10: i32 = GlobalAddress<[3 x i32]* @a> 0 [ORD=5]

      0x7f7f5b02dc10: i32 = Constant<4> [ORD=5]

    0x7f7f5b02dd10: i32 = add 0x7f7f5b02da10, 0x7f7f5b02dc10 [ORD=5]

    0x7f7f5b02d210: <multiple use>
  0x7f7f5b02de10: i32,ch = load 0x7f7f5b02d910, 0x7f7f5b02dd10, 0x7f7f5b02d210
  <LD4[getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1)]> [ORD=6]

...


Replacing.3 0x7f7f5b02dd10: i32 = add 0x7f7f5b02da10, 0x7f7f5b02dc10 [ORD=5]

With: 0x7f7f5b030010: i32 = GlobalAddress<[3 x i32]* @a> + 4


Replacing.3 0x7f7f5b02d610: i32 = add 0x7f7f5b02d410, 0x7f7f5b02d510 [ORD=2]

With: 0x7f7f5b02db10: i32 = GlobalAddress<%struct.Date* @date> + 8

Optimized lowered selection DAG: BB#0 'main:entry'
SelectionDAG has 15 nodes:
  0x7f7f5b02d210: i32 = undef [ORD=1]

      0x7f7f5ac10590: ch = EntryToken [ORD=1]

      0x7f7f5b02d010: i32 = Constant<0> [ORD=1]

      0x7f7f5b02d110: i32 = FrameIndex<0> [ORD=1]

      0x7f7f5b02d210: <multiple use>
    0x7f7f5b02d310: ch = store 0x7f7f5ac10590, 0x7f7f5b02d010, 0x7f7f5b02d110,
    0x7f7f5b02d210<ST4[%retval]> [ORD=1]

    0x7f7f5b02db10: i32 = GlobalAddress<%struct.Date* @date> + 8

    0x7f7f5b02d210: <multiple use>
  0x7f7f5b02d710: i32,ch = load 0x7f7f5b02d310, 0x7f7f5b02db10, 0x7f7f5b02d210
  <LD4[getelementptr inbounds (%struct.Date* @date, i32 0, i32 2)]> [ORD=3]

      0x7f7f5b02d710: <multiple use>
      0x7f7f5b02d710: <multiple use>
      0x7f7f5b02d810: i32 = FrameIndex<1> [ORD=4]

      0x7f7f5b02d210: <multiple use>
    0x7f7f5b02d910: ch = store 0x7f7f5b02d710:1, 0x7f7f5b02d710, 0x7f7f5b02d810,
     0x7f7f5b02d210<ST4[%day]> [ORD=4]

    0x7f7f5b030010: i32 = GlobalAddress<[3 x i32]* @a> + 4

    0x7f7f5b02d210: <multiple use>
  0x7f7f5b02de10: i32,ch = load 0x7f7f5b02d910, 0x7f7f5b030010, 0x7f7f5b02d210
  <LD4[getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1)]> [ORD=6]

...

Through llc -debug, you can see the DAG translation process. As above, the DAG list for date.day (add GlobalAddress<%struct.Date* @date> 0, Constant<8>) with 3 nodes is replaced by the single node GlobalAddress<%struct.Date* @date> + 8. The DAG list for a[1] is handled in the same way. The replacement occurs because TargetLowering.cpp::isOffsetFoldingLegal(…) returns true in the llc static addressing mode (-relocation-model=static), as shown below. The Cpu0 target is not yet aware of an offset folded into the GlobalAddress node, so the folded +8 is lost and “ld $2, 0($2)” is generated. In Cpu0 the ld instruction format is “ld $r1, offset($r2)”, which means loading the content of the memory at address ($r2 + offset) into $r1. So, we just replace the isOffsetFoldingLegal(…) function through the override mechanism as below.

lib/CodeGen/SelectionDAG/TargetLowering.cpp

/// Base-class policy: reports whether a constant offset may be folded into
/// the GlobalAddress node GA. Returns true unconditionally for the static
/// relocation model.
bool
TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // Assume that everything is safe in static mode.
  if (getTargetMachine().getRelocationModel() == Reloc::Static)
    return true;

  // In dynamic-no-pic mode, assume that known defined values are safe.
  if (getTargetMachine().getRelocationModel() == Reloc::DynamicNoPIC &&
     GA &&
     !GA->getGlobal()->isDeclaration() &&
     !GA->getGlobal()->isWeakForLinker())
  // (this return is the body of the if above)
  return true;

  // Otherwise assume nothing is safe.
  return false;
}

lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp

/// Cpu0 override: never allow a constant offset to be folded into a
/// GlobalAddress node, for any relocation model. The offset then stays a
/// separate add operand, which SelectAddr can place in the offset field of
/// the load/store.
bool
Cpu0TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The Cpu0 target isn't yet aware of offsets.
  return false;
}

Beyond that, we need to add the following code fragment to Cpu0ISelDAGToDAG.cpp,

lbdex/chapters/Chapter7_1/Cpu0ISelDAGToDAG.cpp

/// ComplexPattern used on Cpu0InstrInfo
/// Used on Cpu0 Load/Store instructions
///
/// Excerpt: for an address of the form base+constant whose constant fits in
/// 16 signed bits, sets Base and Offset and returns true. The remaining cases
/// are elided here.
bool Cpu0DAGToDAGISel::
SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {

  // Addresses of the form FI+const or FI|const
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // NOTE(review): CN is dereferenced without a null check; presumably
    // isBaseWithConstantOffset() guarantees operand 1 is a constant, in
    // which case cast<> would state that intent -- confirm.
    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    // Only constants that fit a signed 16-bit value are split off as Offset.
    if (isInt<16>(CN->getSExtValue())) {

      // If the first operand is a FI, get the TargetFI Node
      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
                                  (Addr.getOperand(0)))
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
      else
        Base = Addr.getOperand(0);

      // NOTE(review): the range check above uses getSExtValue() but the
      // constant is built from getZExtValue(); verify that negative offsets
      // are handled as intended.
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
      return true;
    }
  }

  ...
}

Recall that we have translated the DAG list for date.day (add GlobalAddress<%struct.Date* @date> 0, Constant<8>) into (add (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)), Constant<8>) by the following code in Cpu0ISelLowering.h.

lbdex/chapters/Chapter6_1/Cpu0ISelLowering.h

    // Builds the nodes needed to compute a symbol's address in non-PIC
    // mode:
    //
    // (add %hi(sym), %lo(sym))
    template<class NodeTy>
    SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
      SDLoc DL(N);

      // Target nodes for the symbol, tagged as absolute high/low parts.
      SDValue HiSym = getTargetNode(N, Ty, DAG, Cpu0II::MO_ABS_HI);
      SDValue LoSym = getTargetNode(N, Ty, DAG, Cpu0II::MO_ABS_LO);

      // Wrap each part in its Cpu0ISD node and add the two together.
      SDValue HiPart = DAG.getNode(Cpu0ISD::Hi, DL, Ty, HiSym);
      SDValue LoPart = DAG.getNode(Cpu0ISD::Lo, DL, Ty, LoSym);
      return DAG.getNode(ISD::ADD, DL, Ty, HiPart, LoPart);
    }

So, when SelectAddr(…) of Cpu0ISelDAGToDAG.cpp is called, the Addr SDValue in SelectAddr(…, Addr, …) is the DAG list for date.day (add (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)), Constant<8>). Since Addr.getOpcode() = ISD::ADD, Addr.getOperand(0) = (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)) and Addr.getOperand(1).getOpcode() = ISD::Constant, the Base = SDValue (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)) and Offset = Constant<8>. After Base and Offset are set, the load DAG translates the global address date.day into the machine instruction “ld $r1, 8($r2)” in the Instruction Selection stage.

Chapter7_1/ includes the changes above; you can run it with ch7_1_globalstructoffset.cpp to get the correct generated instruction “ld $r1, 8($r2)” for the date.day access, as follows.

...
      lui   $2, %hi(date)
      ori   $2, $2, %lo(date)
      ld    $2, 8($2)   // correct
...

The ch7_1_localarrayinit.cpp is for local variable initialization test. The result as follows,

lbdex/input/ch7_1_localarrayinit.cpp

// Test input for local array initialization: main() only declares a local
// array with a constant initializer list and returns 0.
int main()
{
  // In the IR listed for this file, this initializer is copied into the
  // stack slot of `a` from a constant object with a 12-byte llvm.memcpy.
  int a[3]={0, 1, 2};
    
  return 0;
}

118-165-79-206:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_localarrayinit.cpp -emit-llvm -o ch7_1_localarrayinit.bc
118-165-79-206:input Jonathan$ llvm-dis ch7_1_localarrayinit.bc -o -
...

define i32 @main() nounwind ssp {
entry:
  %retval = alloca i32, align 4
  %a = alloca [3 x i32], align 4
  store i32 0, i32* %retval
  %0 = bitcast [3 x i32]* %a to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* bitcast ([3 x i32]*
    @_ZZ4mainE1a to i8*), i32 12, i32 4, i1 false)
  ret i32 0
}
; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) #1

118-165-79-206:input Jonathan$ ~/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch7_1_localarrayinit.bc -o -
        ...
# BB#0:                                 # %entry
        addiu $sp, $sp, -16
        addiu $2, $zero, 0
        st    $2, 12($fp)
        ld    $2, %got($_ZZ4mainE1a)($gp)
        ori   $2, $2, %lo($_ZZ4mainE1a)
        ld    $3, 8($2)
        st    $3, 8($fp)
        ld    $3, 4($2)
        st    $3, 4($fp)
        ld    $2, 0($2)
        st    $2, 0($fp)
        addiu $sp, $sp, 16
        ret   $lr
        ...
        .type $_ZZ4mainE1a,@object    # @_ZZ4mainE1a
        .section      .rodata,"a",@progbits
        .align        2
$_ZZ4mainE1a:
        .4byte        0                       # 0x0
        .4byte        1                       # 0x1
        .4byte        2                       # 0x2
        .size $_ZZ4mainE1a, 12

Vector type (SIMD) support ¶

Vector types are used when multiple primitive data items are operated on in parallel by a single instruction (SIMD) [3]. Mips supports the LLVM IR instructions “icmp slt” and “sext” for vector types, and Cpu0 supports them as well.

lbdex/input/ch7_1_vector.cpp

// Vector types declared with the __vector_size__ attribute (the argument is
// the total vector size in bytes).
// NOTE(review): both typedefs use `long` as the element type. In the IR
// listed for this file the vector8short variables are <4 x i32>, i.e. four
// 32-bit lanes, despite the "8short" in the name.
typedef long   vector8long   __attribute__((__vector_size__(32)));
typedef long   vector8short   __attribute__((__vector_size__(16)));


// Element-wise "less than" on two 4-lane vectors. Each lane of the result is
// -1 where a0 < b0 holds and 0 where it does not, so the returned sum is the
// negated number of lanes for which the comparison is true.
// NOTE(review): the IR listing shown for this function stores <2, 2, 2, 2>
// into b0, not {2, 2, 2, 4}; with that input the sum would be -2. Confirm
// which initializer is current.
int test_cmplt_short() {
  volatile vector8short a0 = {0, 1, 2, 3};
  volatile vector8short b0 = {2, 2, 2, 4};
  volatile vector8short c0;
  c0 = a0 < b0; // c0[0] = -1 (since 0 < 2 is true), c0[1] = -1, c0[2] = 0 (since 2 < 2 is false), c0[3] = -1
  
  // Sum the four lanes and narrow the result to int.
  return (int)(c0[0]+c0[1]+c0[2]+c0[3]); // -3
}


// Element-wise "less than" on two 8-lane vectors. The first four lanes
// compare 2 < 1 (false, 0) and the last four compare 1 < 2 (true, -1), so
// the sum of all lanes is -4.
int test_cmplt_long() {
  volatile vector8long a0 = {2, 2, 2, 2, 1, 1, 1, 1};
  volatile vector8long b0 = {1, 1, 1, 1, 2, 2, 2, 2};
  volatile vector8long c0;
  c0 = a0 < b0; // c0[0..3] = {0, 0, ...}, c0[4..7] = {-1, ...}
  
  // Sum all eight lanes of the comparison result.
  return (c0[0]+c0[1]+c0[2]+c0[3]+c0[4]+c0[5]+c0[6]+c0[7]); // -4
}

118-165-79-206:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_vector.cpp -emit-llvm -o ch7_1_vector.bc
118-165-79-206:input Jonathan$ ~/llvm/test/build/bin/
llvm-dis ch7_1_vector.bc -o -
...

; Function Attrs: nounwind
define i32 @_Z16test_cmplt_shortv() #0 {
  %a0 = alloca <4 x i32>, align 16
  %b0 = alloca <4 x i32>, align 16
  %c0 = alloca <4 x i32>, align 16
  store volatile <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a0, align 16
  store volatile <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i32>* %b0, align 16
  %1 = load volatile <4 x i32>, <4 x i32>* %a0, align 16
  %2 = load volatile <4 x i32>, <4 x i32>* %b0, align 16
  %3 = icmp slt <4 x i32> %1, %2
  %4 = sext <4 x i1> %3 to <4 x i32>
  store volatile <4 x i32> %4, <4 x i32>* %c0, align 16
  %5 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
  %6 = extractelement <4 x i32> %5, i32 0
  %7 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
  %8 = extractelement <4 x i32> %7, i32 1
  %9 = add nsw i32 %6, %8
  %10 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
  %11 = extractelement <4 x i32> %10, i32 2
  %12 = add nsw i32 %9, %11
  %13 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
  %14 = extractelement <4 x i32> %13, i32 3
  %15 = add nsw i32 %12, %14
  ret i32 %15
}

118-165-79-206:input Jonathan$ ~/llvm/test/build/bin/llc
  -march=cpu0 -mcpu=cpu032II -relocation-model=pic -filetype=asm ch7_1_vector.bc
  -o -
  .text
  .section .mdebug.abiO32
  .previous
  .file "ch7_1_vector.bc"
  .globl  _Z16test_cmplt_shortv
  .p2align  2
  .type _Z16test_cmplt_shortv,@function
  .ent  _Z16test_cmplt_shortv   # @_Z16test_cmplt_shortv
_Z16test_cmplt_shortv:
  .frame  $fp,48,$lr
  .mask   0x00000000,0
  .set  noreorder
  .set  nomacro
# BB#0:
  addiu $sp, $sp, -48
  addiu $2, $zero, 3
  st  $2, 44($sp)
  addiu $2, $zero, 1
  st  $2, 36($sp)
  addiu $2, $zero, 0
  st  $2, 32($sp)
  addiu $2, $zero, 2
  st  $2, 40($sp)
  st  $2, 28($sp)
  st  $2, 24($sp)
  st  $2, 20($sp)
  st  $2, 16($sp)
  ld  $2, 32($sp)
  ld  $3, 44($sp)
  ld  $4, 40($sp)
  ld  $5, 36($sp)
  ld  $t9, 20($sp)
  slt $5, $5, $t9
  ld  $t9, 24($sp)
  slt $4, $4, $t9
  ld  $t9, 28($sp)
  slt $3, $3, $t9
  shl $3, $3, 31
  sra $3, $3, 31
  ld  $t9, 16($sp)
  st  $3, 12($sp)
  shl $3, $4, 31
  sra $3, $3, 31
  st  $3, 8($sp)
  shl $3, $5, 31
  sra $3, $3, 31
  st  $3, 4($sp)
  slt $2, $2, $t9
  shl $2, $2, 31
  sra $2, $2, 31
  st  $2, 0($sp)
  ld  $2, 12($sp)
  ld  $2, 8($sp)
  ld  $2, 4($sp)
  ld  $2, 0($sp)
  ld  $3, 4($sp)
  addu  $2, $2, $3
  ld  $3, 12($sp)
  ld  $3, 8($sp)
  ld  $3, 0($sp)
  ld  $3, 8($sp)
  addu  $2, $2, $3
  ld  $3, 12($sp)
  ld  $3, 4($sp)
  ld  $3, 0($sp)
  ld  $3, 12($sp)
  addu  $2, $2, $3
  ld  $3, 8($sp)
  ld  $3, 4($sp)
  ld  $3, 0($sp)
  addiu $sp, $sp, 48
  ret $lr
  .set  macro
  .set  reorder
  .end  _Z16test_cmplt_shortv
$func_end0:
  .size _Z16test_cmplt_shortv, ($func_end0)-_Z16test_cmplt_shortv


  .ident  "Apple LLVM version 7.0.0 (clang-700.1.76)"
  .section  ".note.GNU-stack","",@progbits

Since test_longlong_shift2() of ch7_1_vector.cpp needs the implementation of storeRegToStack() in Cpu0SEInstrInfo.cpp, it cannot be verified at this point.

lbdex/chapters/Chapter7_1/Cpu0ISelLowering.h

    /// getSetCCResultType - get the ISD::SETCC result ValueType
    ///
    /// \param DL      data layout (unnamed and unused in the Chapter7_1
    ///                definition).
    /// \param Context LLVM context (unnamed and unused in the Chapter7_1
    ///                definition).
    /// \param VT      value type being compared.
    /// \returns MVT::i32 when VT is not a vector; otherwise VT with its
    ///          element type changed to integer.
    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                           EVT VT) const override;

lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp

/// Value type of an ISD::SETCC result for operands of type \p VT.
///
/// A vector compare yields \p VT with its element type changed to integer;
/// every other compare yields MVT::i32. The DataLayout and LLVMContext
/// arguments are not used.
EVT Cpu0TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                           EVT VT) const {
  // Vector compare: keep the vector type, switch its elements to integer.
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  // Scalar compare: the result type is i32.
  return MVT::i32;
}

Other data type¶

Local variable pointer¶

char, short int and bool¶

long long¶

float and double¶

Array and struct support¶

Vector type (SIMD) support¶

Local variable pointer ¶

char, short int and bool ¶

long long ¶

float and double ¶

Array and struct support ¶

Vector type (SIMD) support ¶