Other data type
Until now, we have only handled the int and long types, both of which are 32 bits wide. This chapter introduces other types: pointers, and the types that are not 32 bits wide, which include bool, char, short int and long long.
Local variable pointer
To support pointers to local variables, add the following code fragments to Cpu0InstrInfo.td and Cpu0InstPrinter.cpp,
lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td
// Memory operand of pointer type made of two sub-operands, a GPROut register
// and a simm16 immediate. It is printed by printMemOperandEA and encoded by
// getMemEncoding.
def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops GPROut, simm16);
let EncoderMethod = "getMemEncoding";
}
let Predicates = [Ch7_1] in {
// Instruction class that computes an effective address: it matches the
// `addr` pattern on the memory operand $addr and writes the result to $ra.
class EffectiveAddress<string instr_asm, RegisterClass RC, Operand Mem> :
  FMem<0x09, (outs RC:$ra), (ins Mem:$addr),
       instr_asm, [(set RC:$ra, addr:$addr)], IIAlu>;
}
// FrameIndexes are legalized when they are operands of load/store
// instructions. This does not happen for stack address copies, so an
// add op with the mem ComplexPattern is used so that the stack address copy
// can be matched. It's similar to Sparc LEA_ADDRi
def LEA_ADDiu : EffectiveAddress<"addiu\t$ra, $addr", CPURegs, mem_ea> {
// Used by the code generator only.
let isCodeGenOnly = 1;
}
lbdex/chapters/Chapter3_2/InstPrinter/Cpu0InstPrinter.h
void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
lbdex/chapters/Chapter3_2/InstPrinter/Cpu0InstPrinter.cpp
// mem_ea is a DAG data node of Cpu0InstrInfo.td. Only opcode nodes can be
// disabled by ch7_1, data nodes cannot, so this printer is provided here.
void Cpu0InstPrinter::printMemOperandEA(const MCInst *MI, int opNum,
                                        raw_ostream &O) {
  // A stack location used by a non-load/store instruction is printed the
  // same way as the operands of a normal 3-operand instruction: the operand
  // at opNum, a ", " separator, then the operand that follows it.
  const int FirstIdx = opNum;
  const int SecondIdx = opNum + 1;

  printOperand(MI, FirstIdx, O);
  O << ", ";
  printOperand(MI, SecondIdx, O);
}
As the comment in Cpu0InstPrinter.cpp says, printMemOperandEA is added early, in Chapter 3_2, because the DAG data node mem_ea of Cpu0InstrInfo.td cannot be disabled by ch7_1_localpointer; only opcode nodes can be disabled. Running ch7_1_localpointer.cpp with the code of Chapter7_1/, which supports pointers to local variables, gives the following result,
lbdex/input/ch7_1_localpointer.cpp
// Test input for pointers to local variables: stores 3 in a local, takes its
// address, and returns the value read back through the pointer.
int test_local_pointer()
{
int b = 3;
int* p = &b;
return *p;
}
118-165-66-82:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_localpointer.cpp -emit-llvm -o ch7_1_localpointer.bc
118-165-66-82:input Jonathan$ llvm-dis ch7_1_localpointer.bc -o -
...
; Function Attrs: nounwind
define i32 @_Z18test_local_pointerv() #0 {
%b = alloca i32, align 4
%p = alloca i32*, align 4
store i32 3, i32* %b, align 4
store i32* %b, i32** %p, align 4
%1 = load i32** %p, align 4
%2 = load i32* %1, align 4
ret i32 %2
}
...
118-165-66-82:input Jonathan$ /Users/Jonathan/llvm/test/build/bin/llc
-march=cpu0 -relocation-model=pic -filetype=asm
ch7_1_localpointer.bc -o -
...
addiu $sp, $sp, -8
addiu $2, $zero, 3
st $2, 4($fp)
addiu $2, $fp, 4 // b address is 4($sp)
st $2, 0($fp)
ld $2, 4($fp)
addiu $sp, $sp, 8
ret $lr
...
char, short int and bool
To support the signed and unsigned variants of char and short int, add the following code to Chapter7_1/.
lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td
// 16-bit extending-load and truncating-store fragments wrapped in
// AlignedLoad/AlignedStore; they are used by the half-word instructions below.
def sextloadi16_a : AlignedLoad<sextloadi16>;
def zextloadi16_a : AlignedLoad<zextloadi16>;
def extloadi16_a : AlignedLoad<extloadi16>;
def truncstorei16_a : AlignedStore<truncstorei16>;
// Byte and half-word load/store instructions, guarded by the Ch7_1 predicate.
let Predicates = [Ch7_1] in {
def LB : LoadM32<0x03, "lb", sextloadi8>;       // sign-extending byte load
def LBu : LoadM32<0x04, "lbu", zextloadi8>;     // zero-extending byte load
def SB : StoreM32<0x05, "sb", truncstorei8>;    // truncating byte store
def LH : LoadM32<0x06, "lh", sextloadi16_a>;    // sign-extending half-word load
def LHu : LoadM32<0x07, "lhu", zextloadi16_a>;  // zero-extending half-word load
def SH : StoreM32<0x08, "sh", truncstorei16_a>; // truncating half-word store
}
Run Chapter7_1/ with ch7_1_char_in_struct.cpp will get the following result.
lbdex/input/ch7_1_char_in_struct.cpp
// Test input for byte and half-word loads/stores: a struct made of one short
// followed by five chars, and a global unsigned char array.
struct Date
{
short year;
char month;
char day;
char hour;
char minute;
char second;
};
unsigned char b[4] = {'a', 'b', 'c', '\0'};
// Reads b[1] once as unsigned char and once as char, initializes a local
// Date, and copies its month and second fields into locals. Always returns 0;
// the locals exist only so that the loads and stores are generated.
int test_char()
{
unsigned char a = b[1];
char c = (char)b[1];
Date date1 = {2012, (char)11, (char)25, (char)9, (char)40, (char)15};
char m = date1.month;
char s = date1.second;
return 0;
}
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llvm-dis ch7_1_char_in_struct.bc -o -
define i32 @_Z9test_charv() #0 {
%a = alloca i8, align 1
%c = alloca i8, align 1
%date1 = alloca %struct.Date, align 2
%m = alloca i8, align 1
%s = alloca i8, align 1
%1 = load i8* getelementptr inbounds ([4 x i8]* @b, i32 0, i32 1), align 1
store i8 %1, i8* %a, align 1
%2 = load i8* getelementptr inbounds ([4 x i8]* @b, i32 0, i32 1), align 1
store i8 %2, i8* %c, align 1
%3 = bitcast %struct.Date* %date1 to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %3, i8* bitcast ({ i16, i8, i8, i8,
i8, i8, i8 }* @_ZZ9test_charvE5date1 to i8*), i32 8, i32 2, i1 false)
%4 = getelementptr inbounds %struct.Date* %date1, i32 0, i32 1
%5 = load i8* %4, align 1
store i8 %5, i8* %m, align 1
%6 = getelementptr inbounds %struct.Date* %date1, i32 0, i32 5
%7 = load i8* %6, align 1
store i8 %7, i8* %s, align 1
ret i32 0
}
118-165-64-245:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_char_in_struct.cpp -emit-llvm -o ch7_1_char_in_struct.bc
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm
ch7_1_char_in_struct.bc -o -
...
# BB#0: # %entry
addiu $sp, $sp, -24
lui $2, %got_hi(b)
addu $2, $2, $gp
ld $2, %got_lo(b)($2)
lbu $3, 1($2)
sb $3, 20($fp)
lbu $2, 1($2)
sb $2, 16($fp)
ld $2, %got($_ZZ9test_charvE5date1)($gp)
addiu $2, $2, %lo($_ZZ9test_charvE5date1)
lhu $3, 4($2)
shl $3, $3, 16
lhu $4, 6($2)
or $3, $3, $4
st $3, 12($fp) // store hour, minute and second on 12($sp)
lhu $3, 2($2)
lhu $2, 0($2)
shl $2, $2, 16
or $2, $2, $3
st $2, 8($fp) // store year, month and day on 8($sp)
lbu $2, 10($fp) // m = date1.month;
sb $2, 4($fp)
lbu $2, 14($fp) // s = date1.second;
sb $2, 0($fp)
addiu $sp, $sp, 24
ret $lr
.set macro
.set reorder
.end _Z9test_charv
$tmp1:
.size _Z9test_charv, ($tmp1)-_Z9test_charv
.type b,@object # @b
.data
.globl b
b:
.asciz "abc"
.size b, 4
.type $_ZZ9test_charvE5date1,@object # @_ZZ9test_charvE5date1
.section .rodata.cst8,"aM",@progbits,8
.align 1
$_ZZ9test_charvE5date1:
.2byte 2012 # 0x7dc
.byte 11 # 0xb
.byte 25 # 0x19
.byte 9 # 0x9
.byte 40 # 0x28
.byte 15 # 0xf
.space 1
.size $_ZZ9test_charvE5date1, 8
Run Chapter7_1/ with ch7_1_char_short.cpp will get the following result.
lbdex/input/ch7_1_char_short.cpp
// Widens a char holding 0x80 to int; the expected value below assumes char is
// signed, so the widened value is -128.
int test_signed_char()
{
char a = 0x80;
int i = (signed int)a;
i = i + 2; // i = (-128+2) = -126
return i;
}
// Widens an unsigned char holding 0x80 to unsigned int (value 128).
int test_unsigned_char()
{
unsigned char c = 0x80;
unsigned int ui = (unsigned int)c;
ui = ui + 2; // ui = (128+2) = 130
return (int)ui;
}
// Widens a short holding 0x8000 to int (value -32768 after sign extension).
int test_signed_short()
{
short a = 0x8000;
int i = (signed int)a;
i = i + 2; // i = (-32768+2) = -32766
return i;
}
// Widens an unsigned short holding 0x8000 to unsigned int (value 32768), then
// narrows the sum back to unsigned short to exercise the truncating store.
int test_unsigned_short()
{
unsigned short c = 0x8000;
unsigned int ui = (unsigned int)c;
ui = ui + 2; // ui = (32768+2) = 32770
c = (unsigned short)ui;
return (int)ui;
}
1-160-136-236:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llvm-dis ch7_1_char_short.bc -o -
...
define i32 @_Z16test_signed_charv() #0 {
...
%1 = load i8* %a, align 1
%2 = sext i8 %1 to i32
...
}
; Function Attrs: nounwind
define i32 @_Z18test_unsigned_charv() #0 {
...
%1 = load i8* %c, align 1
%2 = zext i8 %1 to i32
...
}
; Function Attrs: nounwind
define i32 @_Z17test_signed_shortv() #0 {
...
%1 = load i16* %a, align 2
%2 = sext i16 %1 to i32
...
}
; Function Attrs: nounwind
define i32 @_Z19test_unsigned_shortv() #0 {
...
%1 = load i16* %c, align 2
%2 = zext i16 %1 to i32
...
}
attributes #0 = { nounwind }
1-160-136-236:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=static -filetype=asm ch7_1_char_short.bc -o -
...
.globl _Z16test_signed_charv
...
lb $2, 4($sp)
...
.end _Z16test_signed_charv
.globl _Z18test_unsigned_charv
...
lbu $2, 4($sp)
...
.end _Z18test_unsigned_charv
.globl _Z17test_signed_shortv
...
lh $2, 4($sp)
...
.end _Z17test_signed_shortv
.globl _Z19test_unsigned_shortv
...
lhu $2, 4($sp)
...
.end _Z19test_unsigned_shortv
...
As you can see, lb/lh are used for the signed byte/short types while lbu/lhu are used for the unsigned byte/short types. To support the C type-cast and type-conversion features efficiently, Cpu0 provides the instruction “lb” to convert type char to int with a single instruction. The other instructions, lbu, lh, lhu, sb and sh, are used in the same way for the signed and unsigned byte and short conversions. Their differences have been explained in Chapter 2.
To support loading the bool type, the following code is added.
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
// Constructor excerpt: boolean representation and i1 load handling.
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
const Cpu0Subtarget &STI)
: TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
// Cpu0 does not have i1 type, so use i32 for
// setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Extending loads from i1 must be promoted: for every integer value type VT,
// the any-, zero- and sign-extending loads from i1 are marked Promote.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
}
...
}
The purpose of setBooleanContents() is described in the header excerpt below, but I don’t know it well. Without it, ch7_1_bool2.ll still works as shown below. The IR input file ch7_1_bool2.ll is used for testing here since the C++ version needs flow control, which is not supported at this point. File ch_run_backend.cpp includes the test fragment for bool as below.
include/llvm/Target/TargetLowering.h
enum BooleanContent { // How the target represents true/false values.
UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage.
ZeroOrOneBooleanContent, // All bits zero except for bit 0.
ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
};
...
protected:
/// setBooleanContents - Specify how the target extends the result of a
/// boolean value from i1 to a wider type. See getBooleanContents.
void setBooleanContents(BooleanContent Ty) { BooleanContents = Ty; }
/// setBooleanVectorContents - Specify how the target extends the result
/// of a vector boolean value from a vector of i1 to a wider type. See
/// getBooleanContents.
void setBooleanVectorContents(BooleanContent Ty) {
BooleanVectorContents = Ty;
}
lbdex/input/ch7_1_bool2.ll
define zeroext i1 @verify_load_bool() #0 {
entry:
%retval = alloca i1, align 1
store i1 1, i1* %retval, align 1
%0 = load i1, i1* %retval
ret i1 %0
}
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch7_1_bool2.ll -o -
.section .mdebug.abi32
.previous
.file "ch7_1_bool2.ll"
.text
.globl verify_load_bool
.align 2
.type verify_load_bool,@function
.ent verify_load_bool # @verify_load_bool
verify_load_bool:
.cfi_startproc
.frame $sp,8,$lr
.mask 0x00000000,0
.set noreorder
.set nomacro
# BB#0: # %entry
addiu $sp, $sp, -8
$tmp1:
.cfi_def_cfa_offset 8
addiu $2, $zero, 1
sb $2, 7($sp)
addiu $sp, $sp, 8
ret $lr
.set macro
.set reorder
.end verify_load_bool
$tmp2:
.size verify_load_bool, ($tmp2)-verify_load_bool
.cfi_endproc
ch7_1_bool.cpp is the C-language version of the bool test. You can run it with Chapter8_1 to get a result similar to that of ch7_1_bool2.ll.
lbdex/input/ch7_1_bool.cpp
// Bool test that goes through an if statement: a is 1, so the a < 0 branch is
// not taken and the function returns true.
bool test_load_bool()
{
int a = 1;
if (a < 0)
return false;
return true;
}
Summary as the following table.
| C | .bc | Optimized legalized selection DAG |
|---|---|---|
| `char a =0x80;` | `%1 = load i8* %a, align 1` | |
| `int i = (signed int)a;` | `%2 = sext i8 %1 to i32` | `load …, <…, sext from i8>` |
| `unsigned char c = 0x80;` | `%1 = load i8* %c, align 1` | |
| `unsigned int ui = (unsigned int)c;` | `%2 = zext i8 %1 to i32` | `load …, <…, zext from i8>` |
| `short a =0x8000;` | `%1 = load i16* %a, align 2` | |
| `int i = (signed int)a;` | `%2 = sext i16 %1 to i32` | `load …, <…, sext from i16>` |
| `unsigned short c = 0x8000;` | `%1 = load i16* %c, align 2` | |
| `unsigned int ui = (unsigned int)c;` | `%2 = zext i16 %1 to i32` | `load …, <…, zext from i16>` |
| `c = (unsigned short)ui;` | `%6 = trunc i32 %5 to i16` | |
| | `store i16 %6, i16* %c, align 2` | `store …,<…, trunc to i16>` |
| `return true;` | `store i1 1, i1* %retval, align 1` | `store …,<…, trunc to i8>` |
| Optimized legalized selection DAG | Cpu0 | pattern in Cpu0InstrInfo.td |
|---|---|---|
| `load …, <…, sext from i8>` | `lb` | `LB : LoadM32<0x03, "lb", sextloadi8>;` |
| `load …, <…, zext from i8>` | `lbu` | `LBu : LoadM32<0x04, "lbu", zextloadi8>;` |
| `load …, <…, sext from i16>` | `lh` | `LH : LoadM32<0x06, "lh", sextloadi16_a>;` |
| `load …, <…, zext from i16>` | `lhu` | `LHu : LoadM32<0x07, "lhu", zextloadi16_a>;` |
| `store …,<…, trunc to i16>` | `sh` | `SH : StoreM32<0x08, "sh", truncstorei16_a>;` |
| `store …,<…, trunc to i8>` | `sb` | `SB : StoreM32<0x05, "sb", truncstorei8>;` |
long long
As in Mips, the type long of Cpu0 is 32 bits wide and the type long long is 64 bits wide in the C language. To support type long long, we add the following code to Chapter7_1/.
lbdex/chapters/Chapter7_1/Cpu0SEISelDAGToDAG.cpp
void Cpu0SEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
SDValue CmpLHS, const SDLoc &DL,
SDNode *Node) const {
unsigned Opc = InFlag.getOpcode(); (void)Opc;
assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
(Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
"(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
SDNode *Carry;
if (Subtarget->hasCpu032II())
Carry = CurDAG->getMachineNode(Cpu0::SLTu, DL, VT, Ops);
else {
SDNode *StatusWord = CurDAG->getMachineNode(Cpu0::CMP, DL, VT, Ops);
SDValue Constant1 = CurDAG->getTargetConstant(1, DL, VT);
Carry = CurDAG->getMachineNode(Cpu0::ANDi, DL, VT,
SDValue(StatusWord,0), Constant1);
}
SDNode *AddCarry = CurDAG->getMachineNode(Cpu0::ADDu, DL, VT,
SDValue(Carry,0), RHS);
CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry,0));
}
// Custom selection for the opcodes listed in the switch below. Each visible
// case returns true after it has handled the node. The remainder of the
// function is elided in this excerpt.
bool Cpu0SEDAGToDAGISel::trySelect(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
SDLoc DL(Node);
///
// Instruction Selection not handled by the auto-generated
// tablegen selection should be handled here.
///
EVT NodeTy = Node->getValueType(0);
unsigned MultOpc;
switch(Opcode) {
default: break;
// Subtract with carry-in: operand 2 is the incoming flag; the comparison LHS
// passed on is operand 0 of the node that produced the flag.
case ISD::SUBE: {
SDValue InFlag = Node->getOperand(2);
selectAddESubE(Cpu0::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
return true;
}
// Add with carry-in: operand 2 is the incoming flag; the comparison LHS
// passed on is result 0 of the node that produced the flag.
case ISD::ADDE: {
SDValue InFlag = Node->getOperand(2);
selectAddESubE(Cpu0::ADDu, InFlag, InFlag.getValue(0), DL, Node);
return true;
}
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
MultOpc = (Opcode == ISD::UMUL_LOHI ? Cpu0::MULTu : Cpu0::MULT);
std::pair<SDNode*, SDNode*> LoHi =
selectMULT(Node, MultOpc, DL, NodeTy, true, true);
// Redirect only the results that still have users, then drop the old node.
if (!SDValue(Node, 0).use_empty())
ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
if (!SDValue(Node, 1).use_empty())
ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
CurDAG->RemoveDeadNode(Node);
return true;
}
...
}
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.h
class Cpu0TargetLowering : public TargetLowering {
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
...
}
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
// Constructor excerpt: handling of the 64-bit shift nodes.
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
const Cpu0Subtarget &STI)
: TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
// Handle i64 shifts (shl, sra, srl): the *_PARTS nodes on i32 are expanded.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
...
}
The code added to Cpu0ISelLowering.cpp is for the shift operations that support the 64-bit type long long. Applying the operators << and >> to 64-bit variables creates the DAG nodes SHL_PARTS, SRA_PARTS and SRL_PARTS, which take care of the 32-bit operands during llvm DAG translation. File ch9_7.cpp, which tests the 64-bit shift operations, cannot be run at this point. It will be verified in the later chapter “Function call”.
Run Chapter7_1 with ch7_1_longlong.cpp to get the result as follows,
lbdex/input/ch7_1_longlong.cpp
// Test input for 64-bit arithmetic: add, subtract, multiply, a widening
// 32x32 multiply, and an arithmetic right shift of a negative constant.
// The comments give each 64-bit result as (high word, low word).
long long test_longlong()
{
long long a = 0x300000002;
long long b = 0x100000001;
int a1 = 0x3001000;
int b1 = 0x2001000;
long long c = a + b; // c = 0x00000004,00000003
long long d = a - b; // d = 0x00000002,00000001
long long e = a * b; // e = 0x00000005,00000002
long long f = (long long)a1 * (long long)b1; // f = 0x00060050,01000000
long long g = ((-7 * 8) + 1) >> 4; // g = -55 >> 4: -55/16 = -3.4375, rounded down to -4
return (c+d+e+f+g); // (0x0006005b,01000002) = (393307,16777218)
}
1-160-134-62:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_longlong.cpp -emit-llvm -o ch7_1_longlong.bc
1-160-134-62:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -mcpu=cpu032I -relocation-model=pic -filetype=asm
ch7_1_longlong.bc -o -
...
# BB#0:
addiu $sp, $sp, -72
st $8, 68($fp) # 4-byte Folded Spill
addiu $2, $zero, 2
st $2, 60($fp)
addiu $2, $zero, 3
st $2, 56($fp)
addiu $2, $zero, 1
st $2, 52($fp)
st $2, 48($fp)
lui $2, 768
ori $2, $2, 4096
st $2, 44($fp)
lui $2, 512
ori $2, $2, 4096
st $2, 40($fp)
ld $2, 52($fp)
ld $3, 60($fp)
addu $3, $3, $2
ld $4, 56($fp)
ld $5, 48($fp)
st $3, 36($fp)
cmp $sw, $3, $2
andi $2, $sw, 1
addu $2, $2, $5
addu $2, $4, $2
st $2, 32($fp)
ld $2, 52($fp)
ld $3, 60($fp)
subu $4, $3, $2
ld $5, 56($fp)
ld $t9, 48($fp)
st $4, 28($fp)
cmp $sw, $3, $2
andi $2, $sw, 1
addu $2, $2, $t9
subu $2, $5, $2
st $2, 24($fp)
ld $2, 52($fp)
ld $3, 60($fp)
multu $3, $2
mflo $4
mfhi $5
ld $t9, 56($fp)
ld $7, 48($fp)
st $4, 20($fp)
mul $3, $3, $7
addu $3, $5, $3
mul $2, $t9, $2
addu $2, $3, $2
st $2, 16($fp)
ld $2, 40($fp)
ld $3, 44($fp)
mult $3, $2
mflo $2
mfhi $4
st $2, 12($fp)
st $4, 8($fp)
ld $5, 28($fp)
ld $3, 36($fp)
addu $t9, $3, $5
ld $7, 20($fp)
addu $8, $t9, $7
addu $3, $8, $2
cmp $sw, $3, $2
andi $2, $sw, 1
addu $2, $2, $4
cmp $sw, $t9, $5
st $sw, 4($fp) # 4-byte Folded Spill
cmp $sw, $8, $7
andi $4, $sw, 1
ld $5, 16($fp)
addu $4, $4, $5
ld $sw, 4($fp) # 4-byte Folded Reload
andi $5, $sw, 1
ld $t9, 24($fp)
addu $5, $5, $t9
ld $t9, 32($fp)
addu $5, $t9, $5
addu $4, $5, $4
addu $2, $4, $2
ld $8, 68($fp) # 4-byte Folded Reload
addiu $sp, $sp, 72
ret $lr
...
float and double
Cpu0 only has integer instructions at this point. For float operations, the Cpu0 backend calls a library function that carries out the float operation with integer instructions, as follows,
lbdex/input/ch7_1_fmul.c
/*
~/llvm/debug/build/bin/clang -target mips-unknown-linux-gnu -emit-llvm -S ch7_1_fmul.c
...
%mul = fmul float %0, %1
~/llvm/debug/build/bin/llc -march=mips ch7_1_fmul.ll -relocation-model=static -o -
...
mul.s $f0, $f1, $f0
~/llvm/test/build/bin/llc -march=cpu0 ch7_1_fmul.ll -relocation-model=static -o -
...
jsub __mulsf3
*/
/* Multiplies two floats; the comment above compares what the Mips and Cpu0
   backends generate for the fmul. */
float ch7_1_fmul(float a, float b) {
float c = a * b;
return c;
}
This float (or double) function call for Cpu0 will be supported after the chapter on function calls. For hardware cost reasons, many CPUs have no hardware float instructions; they call library functions to carry out float operations. Mips separates float operations into a separate co-processor for applications that are float-intensive.
In order to support the floating-point library (part of compiler-rt) [2], the following code is added to support the instructions clz and clo. Although clz and clo are implemented in compiler-rt, these two are integer operations, and providing them as instructions gives a better speed-up in floating-point applications.
lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td
let Predicates = [Ch7_1] in {
// Count Leading Ones/Zeros in Word

// Counts the leading zero bits of $rb into $ra (pattern: ctlz).
class CountLeading0<bits<8> op, string instr_asm, RegisterClass RC>:
  FA<op, (outs GPROut:$ra), (ins RC:$rb),
     !strconcat(instr_asm, "\t$ra, $rb"),
     [(set GPROut:$ra, (ctlz RC:$rb))], II_CLZ> {
  let rc = 0;
  let shamt = 0;
}

// Counts the leading one bits of $rb into $ra, expressed as the leading-zero
// count of the bitwise complement (pattern: ctlz (not ...)).
class CountLeading1<bits<8> op, string instr_asm, RegisterClass RC>:
  FA<op, (outs GPROut:$ra), (ins RC:$rb),
     !strconcat(instr_asm, "\t$ra, $rb"),
     [(set GPROut:$ra, (ctlz (not RC:$rb)))], II_CLO> {
  let rc = 0;
  let shamt = 0;
}
}

let Predicates = [Ch7_1] in {
/// Count Leading
def CLZ : CountLeading0<0x15, "clz", CPURegs>;
def CLO : CountLeading1<0x16, "clo", CPURegs>;
}
Array and struct support
LLVM uses getelementptr to represent array and struct accesses in C. Please see [1]. For ch7_1_globalstructoffset.cpp, the llvm IR is as follows,
lbdex/input/ch7_1_globalstructoffset.cpp
// Test input for accessing a global struct field and a global array element
// at a non-zero offset.
struct Date
{
int year;
int month;
int day;
};
Date date = {2012, 10, 12};
int a[3] = {2012, 10, 12};
// Reads the third field of date and the second element of a, and returns
// their sum.
int test_struct()
{
int day = date.day;
int i = a[1];
return (i+day); // 10+12=22
}
// ch7_1_globalstructoffset.ll
; ModuleID = 'ch7_1_globalstructoffset.bc'
...
%struct.Date = type { i32, i32, i32 }
@date = global %struct.Date { i32 2012, i32 10, i32 12 }, align 4
@a = global [3 x i32] [i32 2012, i32 10, i32 12], align 4
; Function Attrs: nounwind
define i32 @_Z11test_structv() #0 {
%day = alloca i32, align 4
%i = alloca i32, align 4
%1 = load i32* getelementptr inbounds (%struct.Date* @date, i32 0, i32 2), align 4
store i32 %1, i32* %day, align 4
%2 = load i32* getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1), align 4
store i32 %2, i32* %i, align 4
%3 = load i32* %i, align 4
%4 = load i32* %day, align 4
%5 = add nsw i32 %3, %4
ret i32 %5
}
Run Chapter6_1/ with ch7_1_globalstructoffset.bc on static mode will get the incorrect asm file as follows,
1-160-134-62:input Jonathan$ /Users/Jonathan/llvm/test/build/bin/
llc -march=cpu0 -relocation-model=static -filetype=asm
ch7_1_globalstructoffset.bc -o -
...
lui $2, %hi(date)
ori $2, $2, %lo(date)
ld $2, 0($2) // the correct one is ld $2, 8($2)
...
For “day = date.day”, the correct one is “ld $2, 8($2)”, not “ld $2, 0($2)”, since date.day is offset 8(date) ( Type int is 4 bytes in Cpu0, and the date.day has fields year and month before it). Let’s use debug option in llc to see what’s wrong,
jonathantekiimac:input Jonathan$ /Users/Jonathan/llvm/test/
build/bin/llc -march=cpu0 -debug -relocation-model=static
-filetype=asm ch6_2.bc -o ch6_2.cpu0.static.s
...
=== main
Initial selection DAG: BB#0 'main:entry'
SelectionDAG has 20 nodes:
0x7f7f5b02d210: i32 = undef [ORD=1]
0x7f7f5ac10590: ch = EntryToken [ORD=1]
0x7f7f5b02d010: i32 = Constant<0> [ORD=1]
0x7f7f5b02d110: i32 = FrameIndex<0> [ORD=1]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d310: ch = store 0x7f7f5ac10590, 0x7f7f5b02d010, 0x7f7f5b02d110,
0x7f7f5b02d210<ST4[%retval]> [ORD=1]
0x7f7f5b02d410: i32 = GlobalAddress<%struct.Date* @date> 0 [ORD=2]
0x7f7f5b02d510: i32 = Constant<8> [ORD=2]
0x7f7f5b02d610: i32 = add 0x7f7f5b02d410, 0x7f7f5b02d510 [ORD=2]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d710: i32,ch = load 0x7f7f5b02d310, 0x7f7f5b02d610, 0x7f7f5b02d210
<LD4[getelementptr inbounds (%struct.Date* @date, i32 0, i32 2)]> [ORD=3]
0x7f7f5b02db10: i64 = Constant<4>
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d810: i32 = FrameIndex<1> [ORD=4]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d910: ch = store 0x7f7f5b02d710:1, 0x7f7f5b02d710, 0x7f7f5b02d810,
0x7f7f5b02d210<ST4[%day]> [ORD=4]
0x7f7f5b02da10: i32 = GlobalAddress<[3 x i32]* @a> 0 [ORD=5]
0x7f7f5b02dc10: i32 = Constant<4> [ORD=5]
0x7f7f5b02dd10: i32 = add 0x7f7f5b02da10, 0x7f7f5b02dc10 [ORD=5]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02de10: i32,ch = load 0x7f7f5b02d910, 0x7f7f5b02dd10, 0x7f7f5b02d210
<LD4[getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1)]> [ORD=6]
...
Replacing.3 0x7f7f5b02dd10: i32 = add 0x7f7f5b02da10, 0x7f7f5b02dc10 [ORD=5]
With: 0x7f7f5b030010: i32 = GlobalAddress<[3 x i32]* @a> + 4
Replacing.3 0x7f7f5b02d610: i32 = add 0x7f7f5b02d410, 0x7f7f5b02d510 [ORD=2]
With: 0x7f7f5b02db10: i32 = GlobalAddress<%struct.Date* @date> + 8
Optimized lowered selection DAG: BB#0 'main:entry'
SelectionDAG has 15 nodes:
0x7f7f5b02d210: i32 = undef [ORD=1]
0x7f7f5ac10590: ch = EntryToken [ORD=1]
0x7f7f5b02d010: i32 = Constant<0> [ORD=1]
0x7f7f5b02d110: i32 = FrameIndex<0> [ORD=1]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d310: ch = store 0x7f7f5ac10590, 0x7f7f5b02d010, 0x7f7f5b02d110,
0x7f7f5b02d210<ST4[%retval]> [ORD=1]
0x7f7f5b02db10: i32 = GlobalAddress<%struct.Date* @date> + 8
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d710: i32,ch = load 0x7f7f5b02d310, 0x7f7f5b02db10, 0x7f7f5b02d210
<LD4[getelementptr inbounds (%struct.Date* @date, i32 0, i32 2)]> [ORD=3]
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d810: i32 = FrameIndex<1> [ORD=4]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d910: ch = store 0x7f7f5b02d710:1, 0x7f7f5b02d710, 0x7f7f5b02d810,
0x7f7f5b02d210<ST4[%day]> [ORD=4]
0x7f7f5b030010: i32 = GlobalAddress<[3 x i32]* @a> + 4
0x7f7f5b02d210: <multiple use>
0x7f7f5b02de10: i32,ch = load 0x7f7f5b02d910, 0x7f7f5b030010, 0x7f7f5b02d210
<LD4[getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1)]> [ORD=6]
...
Through llc -debug, you can see the DAG translation process.
As above, the DAG list for date.day (add GlobalAddress<%struct.Date* @date> 0, Constant<8>) with 3 nodes is replaced by 1 node GlobalAddress<%struct.Date* @date> + 8.
The DAG list for a[1] is replaced in the same way.
The replacement occurs because TargetLowering.cpp::isOffsetFoldingLegal(…) returns true in the llc -relocation-model=static addressing mode, as shown below.
In Cpu0 the ld instruction format is “ld $r1, offset($r2)”, which means load the value at address $r2+offset into $r1.
So, we just replace the isOffsetFoldingLegal(…) function through the override mechanism as below.
lib/CodeGen/SelectionDAG/TargetLowering.cpp
/// Decide from the target machine's relocation model whether offset folding
/// is legal for the global address node \p GA.
bool TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  const auto RelocModel = getTargetMachine().getRelocationModel();

  // Static mode: everything is assumed to be safe.
  if (RelocModel == Reloc::Static)
    return true;

  // Dynamic-no-pic mode: a known global that is neither a declaration nor
  // weak for the linker is assumed to be safe.
  if (RelocModel == Reloc::DynamicNoPIC && GA) {
    const auto *Global = GA->getGlobal();
    return !Global->isDeclaration() && !Global->isWeakForLinker();
  }

  // Any other case: nothing is assumed to be safe.
  return false;
}
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
/// Cpu0 override: offset folding is refused for every global address and
/// every relocation model, because the Cpu0 target isn't yet aware of
/// offsets.
bool Cpu0TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  return false;
}
Beyond that, we need to add the following code fragment to Cpu0ISelDAGToDAG.cpp,
lbdex/chapters/Chapter7_1/Cpu0ISelDAGToDAG.cpp
/// ComplexPattern used on Cpu0InstrInfo
/// Used on Cpu0 Load/Store instructions
// Splits Addr into Base and Offset for a load/store when it has the form
// base + constant and the constant fits in 16 signed bits.
// NOTE(review): this listing is an excerpt. ValTy and DL are used below but
// not declared in the visible lines; presumably they are defined in an
// omitted part of the function. Confirm against the full source.
bool Cpu0DAGToDAGISel::
SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
// Addresses of the form FI+const or FI|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
// NOTE(review): CN is dereferenced without a null check; this presumably
// relies on isBaseWithConstantOffset() guaranteeing that operand 1 is a
// constant. Confirm.
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
else
Base = Addr.getOperand(0);
// The constant operand becomes the target-constant Offset.
Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
return true;
}
}
...
}
Recall that we have translated the DAG list for date.day (add GlobalAddress<%struct.Date* @date> 0, Constant<8>) into (add (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)), Constant<8>) by the following code in Cpu0ISelLowering.h.
lbdex/chapters/Chapter6_1/Cpu0ISelLowering.h
// Builds the nodes needed to compute a symbol's address in non-PIC mode.
// The address is the sum of the symbol's high and low parts:
//
// (add %hi(sym), %lo(sym))
template <class NodeTy>
SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
  SDLoc DL(N);

  // Target nodes for the symbol, flagged as absolute high / low parts.
  SDValue HiSym = getTargetNode(N, Ty, DAG, Cpu0II::MO_ABS_HI);
  SDValue LoSym = getTargetNode(N, Ty, DAG, Cpu0II::MO_ABS_LO);

  SDValue HiPart = DAG.getNode(Cpu0ISD::Hi, DL, Ty, HiSym);
  SDValue LoPart = DAG.getNode(Cpu0ISD::Lo, DL, Ty, LoSym);
  return DAG.getNode(ISD::ADD, DL, Ty, HiPart, LoPart);
}
So, when SelectAddr(…) of Cpu0ISelDAGToDAG.cpp is called, the Addr SDValue in SelectAddr(…, Addr, …) is the DAG list for date.day (add (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)), Constant<8>). Since Addr.getOpcode() = ISD::ADD, Addr.getOperand(0) = (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)) and Addr.getOperand(1).getOpcode() = ISD::Constant, the Base = SDValue (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)) and Offset = Constant<8>. After Base and Offset are set, the load DAG translates the global address of date.day into the machine instruction “ld $r1, 8($r2)” in the Instruction Selection stage.
Chapter7_1/ includes the changes above; you can run it with ch7_1_globalstructoffset.cpp to get the correct generated instruction “ld $r1, 8($r2)” for the date.day access, as follows.
...
lui $2, %hi(date)
ori $2, $2, %lo(date)
ld $2, 8($2) // correct
...
The ch7_1_localarrayinit.cpp is for local variable initialization test. The result as follows,
lbdex/input/ch7_1_localarrayinit.cpp
// Test input for local array initialization: a three-element local int array
// with a constant initializer. Returns 0.
int main()
{
int a[3]={0, 1, 2};
return 0;
}
118-165-79-206:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_localarrayinit.cpp -emit-llvm -o ch7_1_localarrayinit.bc
118-165-79-206:input Jonathan$ llvm-dis ch7_1_localarrayinit.bc -o -
...
define i32 @main() nounwind ssp {
entry:
%retval = alloca i32, align 4
%a = alloca [3 x i32], align 4
store i32 0, i32* %retval
%0 = bitcast [3 x i32]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* bitcast ([3 x i32]*
@_ZZ4mainE1a to i8*), i32 12, i32 4, i1 false)
ret i32 0
}
; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) #1
118-165-79-206:input Jonathan$ ~/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch7_1_localarrayinit.bc -o -
...
# BB#0: # %entry
addiu $sp, $sp, -16
addiu $2, $zero, 0
st $2, 12($fp)
ld $2, %got($_ZZ4mainE1a)($gp)
ori $2, $2, %lo($_ZZ4mainE1a)
ld $3, 8($2)
st $3, 8($fp)
ld $3, 4($2)
st $3, 4($fp)
ld $2, 0($2)
st $2, 0($fp)
addiu $sp, $sp, 16
ret $lr
...
.type $_ZZ4mainE1a,@object # @_ZZ4mainE1a
.section .rodata,"a",@progbits
.align 2
$_ZZ4mainE1a:
.4byte 0 # 0x0
.4byte 1 # 0x1
.4byte 2 # 0x2
.size $_ZZ4mainE1a, 12
Vector type (SIMD) support
Vector types are used when multiple primitive data are operated on in parallel by a single instruction (SIMD) [3]. Mips supports the llvm IRs “icmp slt” and “sext” for vector types, and Cpu0 supports them as well.
lbdex/input/ch7_1_vector.cpp
typedef long vector8long __attribute__((__vector_size__(32)));
typedef long vector8short __attribute__((__vector_size__(16)));
// Element-wise "less than" on two vectors of four elements. A true lane
// yields -1 and a false lane yields 0, so the function returns minus the
// number of lanes where a0 < b0.
// The operand values match the IR and assembly listings generated from this
// file (b0 is all 2s).
int test_cmplt_short() {
  volatile vector8short a0 = {0, 1, 2, 3};
  volatile vector8short b0 = {2, 2, 2, 2};
  volatile vector8short c0;
  c0 = a0 < b0; // c0[0] = -1 (since 0 < 2 is true), c0[1] = -1, c0[2] = 0 (since 2 < 2 is false), c0[3] = 0
  return (int)(c0[0]+c0[1]+c0[2]+c0[3]); // -2
}
// Element-wise "less than" on two vectors of eight elements. Only the last
// four lanes satisfy a0 < b0, and each true lane contributes -1 to the sum.
int test_cmplt_long() {
volatile vector8long a0 = {2, 2, 2, 2, 1, 1, 1, 1};
volatile vector8long b0 = {1, 1, 1, 1, 2, 2, 2, 2};
volatile vector8long c0;
c0 = a0 < b0; // c0[0..3] = {0, 0, ...}, c0[4..7] = {-1, ...}
return (c0[0]+c0[1]+c0[2]+c0[3]+c0[4]+c0[5]+c0[6]+c0[7]); //-4
}
118-165-79-206:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_vector.cpp -emit-llvm -o ch7_1_vector.bc
118-165-79-206:input Jonathan$ ~/llvm/test/build/bin/
llvm-dis ch7_1_vector.bc -o -
...
; Function Attrs: nounwind
define i32 @_Z16test_cmplt_shortv() #0 {
%a0 = alloca <4 x i32>, align 16
%b0 = alloca <4 x i32>, align 16
%c0 = alloca <4 x i32>, align 16
store volatile <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a0, align 16
store volatile <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i32>* %b0, align 16
%1 = load volatile <4 x i32>, <4 x i32>* %a0, align 16
%2 = load volatile <4 x i32>, <4 x i32>* %b0, align 16
%3 = icmp slt <4 x i32> %1, %2
%4 = sext <4 x i1> %3 to <4 x i32>
store volatile <4 x i32> %4, <4 x i32>* %c0, align 16
%5 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%6 = extractelement <4 x i32> %5, i32 0
%7 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%8 = extractelement <4 x i32> %7, i32 1
%9 = add nsw i32 %6, %8
%10 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%11 = extractelement <4 x i32> %10, i32 2
%12 = add nsw i32 %9, %11
%13 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%14 = extractelement <4 x i32> %13, i32 3
%15 = add nsw i32 %12, %14
ret i32 %15
}
118-165-79-206:input Jonathan$ ~/llvm/test/build/bin/llc
-march=cpu0 -mcpu=cpu032II -relocation-model=pic -filetype=asm ch7_1_vector.bc
-o -
.text
.section .mdebug.abiO32
.previous
.file "ch7_1_vector.bc"
.globl _Z16test_cmplt_shortv
.p2align 2
.type _Z16test_cmplt_shortv,@function
.ent _Z16test_cmplt_shortv # @_Z16test_cmplt_shortv
_Z16test_cmplt_shortv:
.frame $fp,48,$lr
.mask 0x00000000,0
.set noreorder
.set nomacro
# BB#0:
addiu $sp, $sp, -48
addiu $2, $zero, 3
st $2, 44($sp)
addiu $2, $zero, 1
st $2, 36($sp)
addiu $2, $zero, 0
st $2, 32($sp)
addiu $2, $zero, 2
st $2, 40($sp)
st $2, 28($sp)
st $2, 24($sp)
st $2, 20($sp)
st $2, 16($sp)
ld $2, 32($sp)
ld $3, 44($sp)
ld $4, 40($sp)
ld $5, 36($sp)
ld $t9, 20($sp)
slt $5, $5, $t9
ld $t9, 24($sp)
slt $4, $4, $t9
ld $t9, 28($sp)
slt $3, $3, $t9
shl $3, $3, 31
sra $3, $3, 31
ld $t9, 16($sp)
st $3, 12($sp)
shl $3, $4, 31
sra $3, $3, 31
st $3, 8($sp)
shl $3, $5, 31
sra $3, $3, 31
st $3, 4($sp)
slt $2, $2, $t9
shl $2, $2, 31
sra $2, $2, 31
st $2, 0($sp)
ld $2, 12($sp)
ld $2, 8($sp)
ld $2, 4($sp)
ld $2, 0($sp)
ld $3, 4($sp)
addu $2, $2, $3
ld $3, 12($sp)
ld $3, 8($sp)
ld $3, 0($sp)
ld $3, 8($sp)
addu $2, $2, $3
ld $3, 12($sp)
ld $3, 4($sp)
ld $3, 0($sp)
ld $3, 12($sp)
addu $2, $2, $3
ld $3, 8($sp)
ld $3, 4($sp)
ld $3, 0($sp)
addiu $sp, $sp, 48
ret $lr
.set macro
.set reorder
.end _Z16test_cmplt_shortv
$func_end0:
.size _Z16test_cmplt_shortv, ($func_end0)-_Z16test_cmplt_shortv
.ident "Apple LLVM version 7.0.0 (clang-700.1.76)"
.section ".note.GNU-stack","",@progbits
Since test_longlong_shift2() of ch7_1_vector.cpp needs the implementation of storeRegToStack() in Cpu0SEInstrInfo.cpp, it cannot be verified at this point.
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.h
/// getSetCCResultType - get the ISD::SETCC result ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
/// Result type of ISD::SETCC for an operand of type \p VT: a vector operand
/// gives the same vector with integer elements, anything else gives i32.
EVT Cpu0TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                           EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();

  // Scalar compare: the result is an i32.
  return MVT::i32;
}