Other data type¶
Until now, we have only handled int and long types of 32-bit size. This chapter introduces other types, such as pointers and types that are not 32-bit, including bool, char, short int, and long long.
Local Variable Pointer¶
To support pointers to local variables, add the following code fragment to Cpu0InstrInfo.td and Cpu0InstPrinter.cpp:
lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td
// Pointer-typed operand used for effective-address computation. It is made
// of two sub-operands (a GPROut register and a simm16 immediate), is printed
// by printMemOperandEA and encoded by getMemEncoding.
def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops GPROut, simm16);
let EncoderMethod = "getMemEncoding";
}
class EffectiveAddress<string instr_asm, RegisterClass RC, Operand Mem> :
FMem<0x09, (outs RC:$ra), (ins Mem:$addr),
instr_asm, [(set RC:$ra, addr:$addr)], IIAlu>;
}
// FrameIndexes are legalized when they are operands of load/store
// instructions. The same does not happen for stack address copies, so an
// add op with the mem ComplexPattern is used so that the stack address copy
// can be matched. It's similar to Sparc LEA_ADDRi
def LEA_ADDiu : EffectiveAddress<"addiu\t$ra, $addr", CPURegs, mem_ea> {
// Only used by code generation.
let isCodeGenOnly = 1;
}
lbdex/chapters/Chapter3_2/InstPrinter/Cpu0InstPrinter.h
void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
lbdex/chapters/Chapter3_2/InstPrinter/Cpu0InstPrinter.cpp
// The DAG data node, mem_ea of Cpu0InstrInfo.td, cannot be disabled by
// ch7_1, only opcode node can be disabled.
/// Prints the two consecutive operands starting at \p opNum separated by
/// ", ". Stack locations used by instructions that are not loads/stores are
/// printed this way, i.e. like the operands of a normal 3-operand
/// instruction.
void Cpu0InstPrinter::printMemOperandEA(const MCInst *MI, int opNum,
                                        raw_ostream &O) {
  const int secondOp = opNum + 1;

  printOperand(MI, opNum, O);
  O << ", ";
  printOperand(MI, secondOp, O);
}
As noted in Cpu0InstPrinter.cpp, the printMemOperandEA function was added in an earlier Chapter 3.2 because the DAG data node mem_ea in Cpu0InstrInfo.td cannot be disabled by ch7_1_localpointer; only the opcode node can be disabled.
Run ch7_1_localpointer.cpp with the Chapter7_1/ directory, which supports pointers to local variables. The expected result is as follows:
lbdex/input/ch7_1_localpointer.cpp
// Compiler test input: takes the address of the local variable b, keeps it
// in p and returns the value read back through p (3).
int test_local_pointer()
{
int b = 3;
int* p = &b; // address of a local variable
return *p;
}
118-165-66-82:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_localpointer.cpp -emit-llvm -o ch7_1_localpointer.bc
118-165-66-82:input Jonathan$ llvm-dis ch7_1_localpointer.bc -o -
...
; Function Attrs: nounwind
define i32 @_Z18test_local_pointerv() #0 {
%b = alloca i32, align 4
%p = alloca i32*, align 4
store i32 3, i32* %b, align 4
store i32* %b, i32** %p, align 4
%1 = load i32** %p, align 4
%2 = load i32* %1, align 4
ret i32 %2
}
...
118-165-66-82:input Jonathan$ /Users/Jonathan/llvm/test/build/bin/llc
-march=cpu0 -relocation-model=pic -filetype=asm
ch7_1_localpointer.bc -o -
...
addiu $sp, $sp, -8
addiu $2, $zero, 3
st $2, 4($fp)
addiu $2, $fp, 4 // b address is 4($sp)
st $2, 0($fp)
ld $2, 4($fp)
addiu $sp, $sp, 8
ret $lr
...
char, short int and bool¶
To support signed and unsigned char and short int, add the following code to Chapter7_1/:
lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td
def sextloadi16_a : AlignedLoad<sextloadi16>;
def zextloadi16_a : AlignedLoad<zextloadi16>;
def extloadi16_a : AlignedLoad<extloadi16>;
def truncstorei16_a : AlignedStore<truncstorei16>;
let Predicates = [Ch7_1] in {
def LB : LoadM32<0x03, "lb", sextloadi8>;
def LBu : LoadM32<0x04, "lbu", zextloadi8>;
def SB : StoreM32<0x05, "sb", truncstorei8>;
def LH : LoadM32<0x06, "lh", sextloadi16_a>;
def LHu : LoadM32<0x07, "lhu", zextloadi16_a>;
def SH : StoreM32<0x08, "sh", truncstorei16_a>;
}
Run Chapter7_1/ with ch7_1_char_in_struct.cpp to obtain the following result.
lbdex/input/ch7_1_char_in_struct.cpp
struct Date
{
short year;
char month;
char day;
char hour;
char minute;
char second;
};
unsigned char b[4] = {'a', 'b', 'c', '\0'};
// Compiler test input: reads bytes from the global array b and from the
// char members of a locally initialized Date struct. Always returns 0.
int test_char()
{
unsigned char a = b[1]; // unsigned byte copy of b[1]
char c = (char)b[1]; // same element, converted to plain char
// Aggregate initialization of a local struct.
Date date1 = {2012, (char)11, (char)25, (char)9, (char)40, (char)15};
char m = date1.month;
char s = date1.second;
return 0;
}
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llvm-dis ch7_1_char_in_struct.bc -o -
define i32 @_Z9test_charv() #0 {
%a = alloca i8, align 1
%c = alloca i8, align 1
%date1 = alloca %struct.Date, align 2
%m = alloca i8, align 1
%s = alloca i8, align 1
%1 = load i8* getelementptr inbounds ([4 x i8]* @b, i32 0, i32 1), align 1
store i8 %1, i8* %a, align 1
%2 = load i8* getelementptr inbounds ([4 x i8]* @b, i32 0, i32 1), align 1
store i8 %2, i8* %c, align 1
%3 = bitcast %struct.Date* %date1 to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %3, i8* bitcast ({ i16, i8, i8, i8,
i8, i8, i8 }* @_ZZ9test_charvE5date1 to i8*), i32 8, i32 2, i1 false)
%4 = getelementptr inbounds %struct.Date* %date1, i32 0, i32 1
%5 = load i8* %4, align 1
store i8 %5, i8* %m, align 1
%6 = getelementptr inbounds %struct.Date* %date1, i32 0, i32 5
%7 = load i8* %6, align 1
store i8 %7, i8* %s, align 1
ret i32 0
}
118-165-64-245:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_char_in_struct.cpp -emit-llvm -o ch7_1_char_in_struct.bc
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm
ch7_1_char_in_struct.bc -o -
...
# BB#0: # %entry
addiu $sp, $sp, -24
lui $2, %got_hi(b)
addu $2, $2, $gp
ld $2, %got_lo(b)($2)
lbu $3, 1($2)
sb $3, 20($fp)
lbu $2, 1($2)
sb $2, 16($fp)
ld $2, %got($_ZZ9test_charvE5date1)($gp)
addiu $2, $2, %lo($_ZZ9test_charvE5date1)
lhu $3, 4($2)
shl $3, $3, 16
lhu $4, 6($2)
or $3, $3, $4
st $3, 12($fp) // store hour, minute and second on 12($sp)
lhu $3, 2($2)
lhu $2, 0($2)
shl $2, $2, 16
or $2, $2, $3
st $2, 8($fp) // store year, month and day on 8($sp)
lbu $2, 10($fp) // m = date1.month;
sb $2, 4($fp)
lbu $2, 14($fp) // s = date1.second;
sb $2, 0($fp)
addiu $sp, $sp, 24
ret $lr
.set macro
.set reorder
.end _Z9test_charv
$tmp1:
.size _Z9test_charv, ($tmp1)-_Z9test_charv
.type b,@object # @b
.data
.globl b
b:
.asciz "abc"
.size b, 4
.type $_ZZ9test_charvE5date1,@object # @_ZZ9test_charvE5date1
.section .rodata.cst8,"aM",@progbits,8
.align 1
$_ZZ9test_charvE5date1:
.2byte 2012 # 0x7dc
.byte 11 # 0xb
.byte 25 # 0x19
.byte 9 # 0x9
.byte 40 # 0x28
.byte 15 # 0xf
.space 1
.size $_ZZ9test_charvE5date1, 8
Run Chapter7_1/ with ch7_1_char_short.cpp to obtain the following result.
lbdex/input/ch7_1_char_short.cpp
int test_signed_char()
{
char a = 0x80;
int i = (signed int)a;
i = i + 2; // i = (-128+2) = -126
return i;
}
int test_unsigned_char()
{
unsigned char c = 0x80;
unsigned int ui = (unsigned int)c;
ui = ui + 2; // i = (128+2) = 130
return (int)ui;
}
int test_signed_short()
{
short a = 0x8000;
int i = (signed int)a;
i = i + 2; // i = (-32768+2) = -32766
return i;
}
int test_unsigned_short()
{
unsigned short c = 0x8000;
unsigned int ui = (unsigned int)c;
ui = ui + 2; // i = (32768+2) = 32770
c = (unsigned short)ui;
return (int)ui;
}
1-160-136-236:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llvm-dis ch7_1_char_short.bc -o -
...
define i32 @_Z16test_signed_charv() #0 {
...
%1 = load i8* %a, align 1
%2 = sext i8 %1 to i32
...
}
; Function Attrs: nounwind
define i32 @_Z18test_unsigned_charv() #0 {
...
%1 = load i8* %c, align 1
%2 = zext i8 %1 to i32
...
}
; Function Attrs: nounwind
define i32 @_Z17test_signed_shortv() #0 {
...
%1 = load i16* %a, align 2
%2 = sext i16 %1 to i32
...
}
; Function Attrs: nounwind
define i32 @_Z19test_unsigned_shortv() #0 {
...
%1 = load i16* %c, align 2
%2 = zext i16 %1 to i32
...
}
attributes #0 = { nounwind }
1-160-136-236:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=static -filetype=asm ch7_1_char_short.bc -o -
...
.globl _Z16test_signed_charv
...
lb $2, 4($sp)
...
.end _Z16test_signed_charv
.globl _Z18test_unsigned_charv
...
lbu $2, 4($sp)
...
.end _Z18test_unsigned_charv
.globl _Z17test_signed_shortv
...
lh $2, 4($sp)
...
.end _Z17test_signed_shortv
.globl _Z19test_unsigned_shortv
...
lhu $2, 4($sp)
...
.end _Z19test_unsigned_shortv
...
As shown, lb/lh instructions are used for signed byte/short types, while lbu/lhu are used for unsigned byte/short types. To efficiently support C type-casting and type-conversion features, Cpu0 provides the lb instruction, which converts a char to an int with a single instruction. The instructions lbu, lh, lhu, sb, and sh are applied to both signed and unsigned byte and short conversions. Their differences were explained in Chapter 2.
To support loading the bool type, add the following code:
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
// Constructor excerpt: boolean representation and i1 extending-load setup.
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
const Cpu0Subtarget &STI)
: TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
// Cpu0 does not have an i1 type, so use i32 for
// the results of setcc operations (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Extending loads (any-, zero- and sign-extending) from i1 must be
// promoted, for every integer result type.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
}
...
}
The purpose of setBooleanContents() is as follows, but its details are not well understood. Without it, ch7_1_bool2.ll still works as shown below.
The IR input file ch7_1_bool2.ll is used for testing, as the C++ version requires flow control, which is not supported at this point. The file ch_run_backend.cpp includes a test fragment for bool, as shown below.
include/llvm/Target/TargetLowering.h
enum BooleanContent { // How the target represents true/false values.
UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage.
ZeroOrOneBooleanContent, // All bits zero except for bit 0.
ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
};
...
protected:
/// setBooleanContents - Specify how the target extends the result of a
/// boolean value from i1 to a wider type. See getBooleanContents.
void setBooleanContents(BooleanContent Ty) { BooleanContents = Ty; }
/// setBooleanVectorContents - Specify how the target extends the result
/// of a vector boolean value from a vector of i1 to a wider type. See
/// getBooleanContents.
void setBooleanVectorContents(BooleanContent Ty) {
BooleanVectorContents = Ty;
}
lbdex/input/ch7_1_bool2.ll
define zeroext i1 @verify_load_bool() #0 {
entry:
%retval = alloca i1, align 1
store i1 1, i1* %retval, align 1
%0 = load i1, i1* %retval
ret i1 %0
}
118-165-64-245:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch7_1_bool2.ll -o -
.section .mdebug.abi32
.previous
.file "ch7_1_bool2.ll"
.text
.globl verify_load_bool
.align 2
.type verify_load_bool,@function
.ent verify_load_bool # @verify_load_bool
verify_load_bool:
.cfi_startproc
.frame $sp,8,$lr
.mask 0x00000000,0
.set noreorder
.set nomacro
# BB#0: # %entry
addiu $sp, $sp, -8
$tmp1:
.cfi_def_cfa_offset 8
addiu $2, $zero, 1
sb $2, 7($sp)
addiu $sp, $sp, 8
ret $lr
.set macro
.set reorder
.end verify_load_bool
$tmp2:
.size verify_load_bool, ($tmp2)-verify_load_bool
.cfi_endproc
The ch7_1_bool.cpp file provides a C++ version of the bool test. Because it uses flow control, run it with Chapter8_1/ to obtain results similar to ch7_1_bool2.ll.
lbdex/input/ch7_1_bool.cpp
// Compiler test input for bool: returns false when a is negative and true
// otherwise (a is 1 here, so the result is true).
bool test_load_bool()
{
int a = 1;
if (a < 0)
return false;
return true;
}
Summary Table
C |
.bc |
Optimized legalized selection DAG |
---|---|---|
char a =0x80; |
%1 = load i8* %a, align 1 |
|
int i = (signed int)a; |
%2 = sext i8 %1 to i32 |
load …, <…, sext from i8> |
unsigned char c = 0x80; |
%1 = load i8* %c, align 1 |
|
unsigned int ui = (unsigned int)c; |
%2 = zext i8 %1 to i32 |
load …, <…, zext from i8> |
short a =0x8000; |
%1 = load i16* %a, align 2 |
|
int i = (signed int)a; |
%2 = sext i16 %1 to i32 |
load …, <…, sext from i16> |
unsigned short c = 0x8000; |
%1 = load i16* %c, align 2 |
|
unsigned int ui = (unsigned int)c; |
%2 = zext i16 %1 to i32 |
load …, <…, zext from i16> |
c = (unsigned short)ui; |
%6 = trunc i32 %5 to i16 |
|
store i16 %6, i16* %c, align 2 |
store …,<…, trunc to i16> |
|
return true; |
store i1 1, i1* %retval, align 1 |
store …,<…, trunc to i8> |
Optimized legalized selection DAG |
Cpu0 |
pattern in Cpu0InstrInfo.td |
---|---|---|
load …, <…, sext from i8> |
lb |
LB : LoadM32<0x03, “lb”, sextloadi8>; |
load …, <…, zext from i8> |
lbu |
LBu : LoadM32<0x04, “lbu”, zextloadi8>; |
load …, <…, sext from i16> |
lh |
LH : LoadM32<0x06, “lh”, sextloadi16_a>; |
load …, <…, zext from i16> |
lhu |
LHu : LoadM32<0x07, “lhu”, zextloadi16_a>; |
store …,<…, trunc to i16> |
sh |
SH : StoreM32<0x08, “sh”, truncstorei16_a>; |
store …,<…, trunc to i8> |
sb |
SB : StoreM32<0x05, “sb”, truncstorei8>; |
long long¶
Like MIPS, the long type in Cpu0 is 32-bit, while long long is 64-bit in C. To support long long, add the following code to Chapter7_1/:
lbdex/chapters/Chapter7_1/Cpu0SEISelDAGToDAG.cpp
void Cpu0SEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
SDValue CmpLHS, const SDLoc &DL,
SDNode *Node) const {
unsigned Opc = InFlag.getOpcode(); (void)Opc;
assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
(Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
"(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
SDNode *Carry;
if (Subtarget->hasCpu032II())
Carry = CurDAG->getMachineNode(Cpu0::SLTu, DL, VT, Ops);
else {
SDNode *StatusWord = CurDAG->getMachineNode(Cpu0::CMP, DL, VT, Ops);
SDValue Constant1 = CurDAG->getTargetConstant(1, DL, VT);
Carry = CurDAG->getMachineNode(Cpu0::ANDi, DL, VT,
SDValue(StatusWord,0), Constant1);
}
SDNode *AddCarry = CurDAG->getMachineNode(Cpu0::ADDu, DL, VT,
SDValue(Carry,0), RHS);
CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry,0));
}
// Excerpt of trySelect(): hand-written selection for SUBE, ADDE and the
// two-result multiplies. Each case shown returns true once it has selected
// the node.
bool Cpu0SEDAGToDAGISel::trySelect(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
SDLoc DL(Node);
///
// Instruction Selection not handled by the auto-generated
// tablegen selection should be handled here.
///
EVT NodeTy = Node->getValueType(0);
unsigned MultOpc;
switch(Opcode) {
default: break;
// SUBE: operand 2 is the incoming flag; the compare LHS passed on is the
// flag producer's first operand.
case ISD::SUBE: {
SDValue InFlag = Node->getOperand(2);
selectAddESubE(Cpu0::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
return true;
}
// ADDE: operand 2 is the incoming flag; the compare LHS passed on is the
// flag producer's result value 0.
case ISD::ADDE: {
SDValue InFlag = Node->getOperand(2);
selectAddESubE(Cpu0::ADDu, InFlag, InFlag.getValue(0), DL, Node);
return true;
}
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
// MULTu for the unsigned form, MULT for the signed one.
MultOpc = (Opcode == ISD::UMUL_LOHI ? Cpu0::MULTu : Cpu0::MULT);
std::pair<SDNode*, SDNode*> LoHi =
selectMULT(Node, MultOpc, DL, NodeTy, true, true);
// Redirect only the results that actually have users: result 0 to
// LoHi.first, result 1 to LoHi.second.
if (!SDValue(Node, 0).use_empty())
ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
if (!SDValue(Node, 1).use_empty())
ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
CurDAG->RemoveDeadNode(Node);
return true;
}
...
}
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.h
class Cpu0TargetLowering : public TargetLowering {
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
...
}
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
// Constructor excerpt: 64-bit shift support.
Cpu0TargetLowering::Cpu0TargetLowering(const Cpu0TargetMachine &TM,
const Cpu0Subtarget &STI)
: TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
// Handle i64 shifts (shl, sra, srl): mark the *_PARTS nodes on i32
// operands as Expand.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
...
}
The additional code in Cpu0ISelLowering.cpp handles shift operations for long long (64-bit). Using the << and >> operators on 64-bit variables generates DAG SHL_PARTS, SRA_PARTS, and SRL_PARTS, which manage 32-bit operands during LLVM DAG translation.
At this point, ch9_7.cpp, which includes 64-bit shift operations, cannot be executed. It will be verified in the later chapter Function Call.
Run Chapter7_1/ with ch7_1_longlong.cpp to obtain the following result.
lbdex/input/ch7_1_longlong.cpp
long long test_longlong()
{
long long a = 0x300000002;
long long b = 0x100000001;
int a1 = 0x3001000;
int b1 = 0x2001000;
long long c = a + b; // c = 0x00000004,00000003
long long d = a - b; // d = 0x00000002,00000001
long long e = a * b; // e = 0x00000005,00000002
long long f = (long long)a1 * (long long)b1; // f = 0x00060050,01000000
long long g = ((-7 * 8) + 1) >> 4; // g = -55/16=-3.4375=-4
return (c+d+e+f+g); // (0x0006005b,01000002) = (393307,16777218)
}
1-160-134-62:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_longlong.cpp -emit-llvm -o ch7_1_longlong.bc
1-160-134-62:input Jonathan$ /Users/Jonathan/llvm/test/build/
bin/llc -march=cpu0 -mcpu=cpu032I -relocation-model=pic -filetype=asm
ch7_1_longlong.bc -o -
...
# BB#0:
addiu $sp, $sp, -72
st $8, 68($fp) # 4-byte Folded Spill
addiu $2, $zero, 2
st $2, 60($fp)
addiu $2, $zero, 3
st $2, 56($fp)
addiu $2, $zero, 1
st $2, 52($fp)
st $2, 48($fp)
lui $2, 768
ori $2, $2, 4096
st $2, 44($fp)
lui $2, 512
ori $2, $2, 4096
st $2, 40($fp)
ld $2, 52($fp)
ld $3, 60($fp)
addu $3, $3, $2
ld $4, 56($fp)
ld $5, 48($fp)
st $3, 36($fp)
cmp $sw, $3, $2
andi $2, $sw, 1
addu $2, $2, $5
addu $2, $4, $2
st $2, 32($fp)
ld $2, 52($fp)
ld $3, 60($fp)
subu $4, $3, $2
ld $5, 56($fp)
ld $t9, 48($fp)
st $4, 28($fp)
cmp $sw, $3, $2
andi $2, $sw, 1
addu $2, $2, $t9
subu $2, $5, $2
st $2, 24($fp)
ld $2, 52($fp)
ld $3, 60($fp)
multu $3, $2
mflo $4
mfhi $5
ld $t9, 56($fp)
ld $7, 48($fp)
st $4, 20($fp)
mul $3, $3, $7
addu $3, $5, $3
mul $2, $t9, $2
addu $2, $3, $2
st $2, 16($fp)
ld $2, 40($fp)
ld $3, 44($fp)
mult $3, $2
mflo $2
mfhi $4
st $2, 12($fp)
st $4, 8($fp)
ld $5, 28($fp)
ld $3, 36($fp)
addu $t9, $3, $5
ld $7, 20($fp)
addu $8, $t9, $7
addu $3, $8, $2
cmp $sw, $3, $2
andi $2, $sw, 1
addu $2, $2, $4
cmp $sw, $t9, $5
st $sw, 4($fp) # 4-byte Folded Spill
cmp $sw, $8, $7
andi $4, $sw, 1
ld $5, 16($fp)
addu $4, $4, $5
ld $sw, 4($fp) # 4-byte Folded Reload
andi $5, $sw, 1
ld $t9, 24($fp)
addu $5, $5, $t9
ld $t9, 32($fp)
addu $5, $t9, $5
addu $4, $5, $4
addu $2, $4, $2
ld $8, 68($fp) # 4-byte Folded Reload
addiu $sp, $sp, 72
ret $lr
...
float and double¶
At this point, Cpu0 only supports integer instructions. For floating-point operations, the Cpu0 backend does not emit floating-point instructions; instead it calls library functions (for example __mulsf3 for a float multiply), as follows:
lbdex/input/ch7_1_fmul.c
/*
Compares what the MIPS and Cpu0 backends emit for the same fmul IR:
MIPS uses a floating-point multiply instruction, Cpu0 calls __mulsf3.

~/llvm/debug/build/bin/clang -target mips-unknown-linux-gnu -emit-llvm -S ch7_1_fmul.c
...
%mul = fmul float %0, %1
~/llvm/debug/build/bin/llc -march=mips ch7_1_fmul.ll -relocation-model=static -o -
...
mul.s $f0, $f12, $f14
~/llvm/test/build/bin/llc -march=cpu0 ch7_1_fmul.ll -relocation-model=static -o -
...
jsub __mulsf3
*/
/* Returns the product of a and b. */
float ch7_1_fmul(float a, float b) {
float c = a * b;
return c;
}
Floating-point function calls for Cpu0 will be supported in the Function Call chapter. Due to hardware cost constraints, many CPUs do not include floating-point hardware instructions. Instead, they rely on library functions. MIPS separates floating-point operations into a dedicated co-processor for applications that require floating-point arithmetic.
To support the floating-point library (part of compiler-rt) [2], the following code is added to support clz and clo instructions. Although these instructions are implemented in compiler-rt, they are integer operations that improve floating-point application performance.
lbdex/chapters/Chapter7_1/Cpu0InstrInfo.td
let Predicates = [Ch7_1] in {
// Count Leading Ones/Zeros in Word
class CountLeading0<bits<8> op, string instr_asm, RegisterClass RC>:
FA<op, (outs GPROut:$ra), (ins RC:$rb),
!strconcat(instr_asm, "\t$ra, $rb"),
[(set GPROut:$ra, (ctlz RC:$rb))], II_CLZ> {
let rc = 0;
let shamt = 0;
}
class CountLeading1<bits<8> op, string instr_asm, RegisterClass RC>:
FA<op, (outs GPROut:$ra), (ins RC:$rb),
!strconcat(instr_asm, "\t$ra, $rb"),
[(set GPROut:$ra, (ctlz (not RC:$rb)))], II_CLO> {
let rc = 0;
let shamt = 0;
}
let Predicates = [Ch7_1] in {
/// Count Leading
def CLZ : CountLeading0<0x15, "clz", CPURegs>;
def CLO : CountLeading1<0x16, "clo", CPURegs>;
Array and struct support¶
LLVM uses getelementptr to represent array and struct types in C. For details, refer to [1].
For ch7_1_globalstructoffset.cpp, the LLVM IR is as follows:
lbdex/input/ch7_1_globalstructoffset.cpp
struct Date
{
int year;
int month;
int day;
};
Date date = {2012, 10, 12};
int a[3] = {2012, 10, 12};
// Compiler test input: loads the third member of the global struct date and
// the second element of the global array a, and returns their sum.
int test_struct()
{
int day = date.day; // member at index 2 of Date
int i = a[1];
return (i+day); // 10+12=22
}
// ch7_1_globalstructoffset.ll
; ModuleID = 'ch7_1_globalstructoffset.bc'
...
%struct.Date = type { i32, i32, i32 }
@date = global %struct.Date { i32 2012, i32 10, i32 12 }, align 4
@a = global [3 x i32] [i32 2012, i32 10, i32 12], align 4
; Function Attrs: nounwind
define i32 @_Z11test_structv() #0 {
%day = alloca i32, align 4
%i = alloca i32, align 4
%1 = load i32* getelementptr inbounds (%struct.Date* @date, i32 0, i32 2), align 4
store i32 %1, i32* %day, align 4
%2 = load i32* getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1), align 4
store i32 %2, i32* %i, align 4
%3 = load i32* %i, align 4
%4 = load i32* %day, align 4
%5 = add nsw i32 %3, %4
ret i32 %5
}
Running Chapter6_1/ with ch7_1_globalstructoffset.bc in static mode produces the following incorrect asm output:
1-160-134-62:input Jonathan$ /Users/Jonathan/llvm/test/build/bin/
llc -march=cpu0 -relocation-model=static -filetype=asm
ch7_1_globalstructoffset.bc -o -
...
lui $2, %hi(date)
ori $2, $2, %lo(date)
ld $2, 0($2) // the correct one is ld $2, 8($2)
...
For day = date.day, the correct instruction is ld $2, 8($2), not ld $2, 0($2), since date.day has an offset of 8 bytes (the date struct contains year and month before day). Use the debug option in llc to analyze this:
jonathantekiimac:input Jonathan$ /Users/Jonathan/llvm/test/
build/bin/llc -march=cpu0 -debug -relocation-model=static
-filetype=asm ch6_2.bc -o ch6_2.cpu0.static.s
...
=== main
Initial selection DAG: BB#0 'main:entry'
SelectionDAG has 20 nodes:
0x7f7f5b02d210: i32 = undef [ORD=1]
0x7f7f5ac10590: ch = EntryToken [ORD=1]
0x7f7f5b02d010: i32 = Constant<0> [ORD=1]
0x7f7f5b02d110: i32 = FrameIndex<0> [ORD=1]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d310: ch = store 0x7f7f5ac10590, 0x7f7f5b02d010, 0x7f7f5b02d110,
0x7f7f5b02d210<ST4[%retval]> [ORD=1]
0x7f7f5b02d410: i32 = GlobalAddress<%struct.Date* @date> 0 [ORD=2]
0x7f7f5b02d510: i32 = Constant<8> [ORD=2]
0x7f7f5b02d610: i32 = add 0x7f7f5b02d410, 0x7f7f5b02d510 [ORD=2]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d710: i32,ch = load 0x7f7f5b02d310, 0x7f7f5b02d610, 0x7f7f5b02d210
<LD4[getelementptr inbounds (%struct.Date* @date, i32 0, i32 2)]> [ORD=3]
0x7f7f5b02db10: i64 = Constant<4>
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d810: i32 = FrameIndex<1> [ORD=4]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d910: ch = store 0x7f7f5b02d710:1, 0x7f7f5b02d710, 0x7f7f5b02d810,
0x7f7f5b02d210<ST4[%day]> [ORD=4]
0x7f7f5b02da10: i32 = GlobalAddress<[3 x i32]* @a> 0 [ORD=5]
0x7f7f5b02dc10: i32 = Constant<4> [ORD=5]
0x7f7f5b02dd10: i32 = add 0x7f7f5b02da10, 0x7f7f5b02dc10 [ORD=5]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02de10: i32,ch = load 0x7f7f5b02d910, 0x7f7f5b02dd10, 0x7f7f5b02d210
<LD4[getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1)]> [ORD=6]
...
Replacing.3 0x7f7f5b02dd10: i32 = add 0x7f7f5b02da10, 0x7f7f5b02dc10 [ORD=5]
With: 0x7f7f5b030010: i32 = GlobalAddress<[3 x i32]* @a> + 4
Replacing.3 0x7f7f5b02d610: i32 = add 0x7f7f5b02d410, 0x7f7f5b02d510 [ORD=2]
With: 0x7f7f5b02db10: i32 = GlobalAddress<%struct.Date* @date> + 8
Optimized lowered selection DAG: BB#0 'main:entry'
SelectionDAG has 15 nodes:
0x7f7f5b02d210: i32 = undef [ORD=1]
0x7f7f5ac10590: ch = EntryToken [ORD=1]
0x7f7f5b02d010: i32 = Constant<0> [ORD=1]
0x7f7f5b02d110: i32 = FrameIndex<0> [ORD=1]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d310: ch = store 0x7f7f5ac10590, 0x7f7f5b02d010, 0x7f7f5b02d110,
0x7f7f5b02d210<ST4[%retval]> [ORD=1]
0x7f7f5b02db10: i32 = GlobalAddress<%struct.Date* @date> + 8
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d710: i32,ch = load 0x7f7f5b02d310, 0x7f7f5b02db10, 0x7f7f5b02d210
<LD4[getelementptr inbounds (%struct.Date* @date, i32 0, i32 2)]> [ORD=3]
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d710: <multiple use>
0x7f7f5b02d810: i32 = FrameIndex<1> [ORD=4]
0x7f7f5b02d210: <multiple use>
0x7f7f5b02d910: ch = store 0x7f7f5b02d710:1, 0x7f7f5b02d710, 0x7f7f5b02d810,
0x7f7f5b02d210<ST4[%day]> [ORD=4]
0x7f7f5b030010: i32 = GlobalAddress<[3 x i32]* @a> + 4
0x7f7f5b02d210: <multiple use>
0x7f7f5b02de10: i32,ch = load 0x7f7f5b02d910, 0x7f7f5b030010, 0x7f7f5b02d210
<LD4[getelementptr inbounds ([3 x i32]* @a, i32 0, i32 1)]> [ORD=6]
...
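The output reveals the DAG translation process. As shown, the DAG for date.day (add GlobalAddress<%struct.Date* @date> 0, Constant<8>), which consists of three nodes, is replaced by the single node GlobalAddress<%struct.Date* @date> + 8. The same applies to a[1], whose DAG (add GlobalAddress<[3 x i32]* @a> 0, Constant<4>) is replaced by GlobalAddress<[3 x i32]* @a> + 4.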
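This replacement occurs because TargetLowering::isOffsetFoldingLegal(…) in TargetLowering.cpp returns true in static addressing mode (llc -relocation-model=static).
In Cpu0, the ld instruction format is ld $r1, offset($r2), meaning it loads the value at address($r2) + offset into $r1. The offset should therefore stay a separate constant that ends up in the offset field of ld, rather than being folded into the global address node.
To correct this, override isOffsetFoldingLegal(…) as follows: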
lib/CodeGen/SelectionDAG/TargetLowering.cpp
/// Returns true when the relocation model makes folding an offset into the
/// global address \p GA safe:
///  - Static: always.
///  - DynamicNoPIC: only when \p GA is non-null and its global is a
///    definition that is not weak for the linker.
///  - Anything else: never.
bool
TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  const auto RelocModel = getTargetMachine().getRelocationModel();

  // Assume that everything is safe in static mode.
  if (RelocModel == Reloc::Static)
    return true;

  // In dynamic-no-pic mode, assume that known defined values are safe.
  if (RelocModel == Reloc::DynamicNoPIC && GA) {
    const auto *Global = GA->getGlobal();
    if (!Global->isDeclaration() && !Global->isWeakForLinker())
      return true;
  }

  // Otherwise assume nothing is safe.
  return false;
}
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
/// Cpu0 override: offset folding is reported as illegal for every global
/// address, whatever the relocation model, because the Cpu0 target isn't
/// yet aware of offsets.
bool Cpu0TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode * /*GA*/) const {
  return false;
}
Additionally, add the following code to Cpu0ISelDAGToDAG.cpp:
When SelectAddr(…) in Cpu0ISelDAGToDAG.cpp is called, Addr represents the DAG node for date.day:
lbdex/chapters/Chapter7_1/Cpu0ISelDAGToDAG.cpp
/// ComplexPattern used on Cpu0InstrInfo
/// Used on Cpu0 Load/Store instructions
///
/// Excerpt: splits an address of the form base + constant into \p Base and
/// \p Offset when the constant fits in a signed 16-bit immediate, and
/// returns true in that case.
bool Cpu0DAGToDAGISel::
SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
// Addresses of the form FI+const or FI|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
// NOTE(review): CN is dereferenced without a null check; presumably
// isBaseWithConstantOffset() guarantees operand 1 is a ConstantSDNode --
// confirm (cast<> would then state that intent).
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
else
Base = Addr.getOperand(0);
// NOTE(review): the range check above uses the sign-extended value while
// the constant is built from the zero-extended one -- confirm intended.
Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
return true;
}
}
...
}
Recall that we have translated the DAG list for date.day
(add GlobalAddress<%struct.Date* @date> 0, Constant<8>)
into
(add (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)),
Constant<8>)
by the following code in Cpu0ISelLowering.h.
lbdex/chapters/Chapter6_1/Cpu0ISelLowering.h
// Builds the nodes needed to compute a symbol's address in non-PIC mode,
// i.e. the sum of its high and low parts:
//
// (add %hi(sym), %lo(sym))
template<class NodeTy>
SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
  SDLoc DL(N);
  // Target nodes for the symbol, tagged with the absolute hi/lo flags.
  SDValue HiSym = getTargetNode(N, Ty, DAG, Cpu0II::MO_ABS_HI);
  SDValue LoSym = getTargetNode(N, Ty, DAG, Cpu0II::MO_ABS_LO);
  // Wrap each half in its Cpu0ISD node, then add the two halves.
  SDValue HiPart = DAG.getNode(Cpu0ISD::Hi, DL, Ty, HiSym);
  SDValue LoPart = DAG.getNode(Cpu0ISD::Lo, DL, Ty, LoSym);
  return DAG.getNode(ISD::ADD, DL, Ty, HiPart, LoPart);
}
add (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)), Constant<8>
Since Addr.getOpcode() = ISD::ADD, Addr.getOperand(0) = (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)), and Addr.getOperand(1).getOpcode() = ISD::Constant, we set Base to (add Cpu0ISD::Hi (Cpu0II::MO_ABS_HI), Cpu0ISD::Lo(Cpu0II::MO_ABS_LO)) and Offset to Constant<8>. This ensures ld $r1, 8($r2) is correctly generated in the Instruction Selection stage.
Run Chapter7_1/ with ch7_1_globalstructoffset.cpp to obtain the correct instruction.
...
lui $2, %hi(date)
ori $2, $2, %lo(date)
ld $2, 8($2) // correct
...
The file ch7_1_localarrayinit.cpp tests the initialization of a local array. The result is as follows:
lbdex/input/ch7_1_localarrayinit.cpp
// Compiler test input: initializes a local three-element array from
// constants and returns 0.
int main()
{
int a[3]={0, 1, 2};
return 0;
}
118-165-79-206:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_localarrayinit.cpp -emit-llvm -o ch7_1_localarrayinit.bc
118-165-79-206:input Jonathan$ llvm-dis ch7_1_localarrayinit.bc -o -
...
define i32 @main() nounwind ssp {
entry:
%retval = alloca i32, align 4
%a = alloca [3 x i32], align 4
store i32 0, i32* %retval
%0 = bitcast [3 x i32]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* bitcast ([3 x i32]*
@_ZZ4mainE1a to i8*), i32 12, i32 4, i1 false)
ret i32 0
}
; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) #1
118-165-79-206:input Jonathan$ ~/llvm/test/build/
bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch7_1_localarrayinit.bc -o -
...
# BB#0: # %entry
addiu $sp, $sp, -16
addiu $2, $zero, 0
st $2, 12($fp)
ld $2, %got($_ZZ4mainE1a)($gp)
ori $2, $2, %lo($_ZZ4mainE1a)
ld $3, 8($2)
st $3, 8($fp)
ld $3, 4($2)
st $3, 4($fp)
ld $2, 0($2)
st $2, 0($fp)
addiu $sp, $sp, 16
ret $lr
...
.type $_ZZ4mainE1a,@object # @_ZZ4mainE1a
.section .rodata,"a",@progbits
.align 2
$_ZZ4mainE1a:
.4byte 0 # 0x0
.4byte 1 # 0x1
.4byte 2 # 0x2
.size $_ZZ4mainE1a, 12
Vector type (SIMD) support¶
Vector types are used when multiple primitive data elements are operated on in parallel by a single instruction (SIMD) [3]. MIPS supports the LLVM IR instructions “icmp slt” and “sext” for vector types, and Cpu0 supports them as well.
lbdex/input/ch7_1_vector.cpp
typedef long vector8long __attribute__((__vector_size__(32)));
typedef long vector8short __attribute__((__vector_size__(16)));
// Compiler test input: element-wise "<" on two 16-byte vectors, then the sum
// of the four result elements (each element is -1 for true, 0 for false).
// NOTE(review): the IR and asm listings printed for this function store
// <2, 2, 2, 2> for b0, which would make c0[3] = 0 and the sum -2. Confirm
// whether this source or those listings are current.
int test_cmplt_short() {
volatile vector8short a0 = {0, 1, 2, 3};
volatile vector8short b0 = {2, 2, 2, 4};
volatile vector8short c0;
c0 = a0 < b0; // c0[0] = -1 (since 0 < 2 is true), c0[1] = -1, c0[2] = 0 (since 2 < 2 is false), c0[3] = -1
return (int)(c0[0]+c0[1]+c0[2]+c0[3]); // -3
}
// Compiler test input: element-wise "<" on two 32-byte vectors, then the sum
// of the eight result elements (each element is -1 for true, 0 for false).
// Eight elements assumes a 4-byte long, as on the 32-bit target used in this
// chapter -- TODO confirm for other targets.
int test_cmplt_long() {
volatile vector8long a0 = {2, 2, 2, 2, 1, 1, 1, 1};
volatile vector8long b0 = {1, 1, 1, 1, 2, 2, 2, 2};
volatile vector8long c0;
c0 = a0 < b0; // c0[0..3] = {0, 0, ...}, c0[4..7] = {-1, ...}
return (c0[0]+c0[1]+c0[2]+c0[3]+c0[4]+c0[5]+c0[6]+c0[7]); //-4
}
118-165-79-206:input Jonathan$ clang -target mips-unknown-linux-gnu -c
ch7_1_vector.cpp -emit-llvm -o ch7_1_vector.bc
118-165-79-206:input Jonathan$ ~/llvm/test/build/bin/
llvm-dis ch7_1_vector.bc -o -
...
; Function Attrs: nounwind
define i32 @_Z16test_cmplt_shortv() #0 {
%a0 = alloca <4 x i32>, align 16
%b0 = alloca <4 x i32>, align 16
%c0 = alloca <4 x i32>, align 16
store volatile <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a0, align 16
store volatile <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i32>* %b0, align 16
%1 = load volatile <4 x i32>, <4 x i32>* %a0, align 16
%2 = load volatile <4 x i32>, <4 x i32>* %b0, align 16
%3 = icmp slt <4 x i32> %1, %2
%4 = sext <4 x i1> %3 to <4 x i32>
store volatile <4 x i32> %4, <4 x i32>* %c0, align 16
%5 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%6 = extractelement <4 x i32> %5, i32 0
%7 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%8 = extractelement <4 x i32> %7, i32 1
%9 = add nsw i32 %6, %8
%10 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%11 = extractelement <4 x i32> %10, i32 2
%12 = add nsw i32 %9, %11
%13 = load volatile <4 x i32>, <4 x i32>* %c0, align 16
%14 = extractelement <4 x i32> %13, i32 3
%15 = add nsw i32 %12, %14
ret i32 %15
}
118-165-79-206:input Jonathan$ ~/llvm/test/build/bin/llc
-march=cpu0 -mcpu=cpu032II -relocation-model=pic -filetype=asm ch7_1_vector.bc
-o -
.text
.section .mdebug.abiO32
.previous
.file "ch7_1_vector.bc"
.globl _Z16test_cmplt_shortv
.p2align 2
.type _Z16test_cmplt_shortv,@function
.ent _Z16test_cmplt_shortv # @_Z16test_cmplt_shortv
_Z16test_cmplt_shortv:
.frame $fp,48,$lr
.mask 0x00000000,0
.set noreorder
.set nomacro
# BB#0:
addiu $sp, $sp, -48
addiu $2, $zero, 3
st $2, 44($sp)
addiu $2, $zero, 1
st $2, 36($sp)
addiu $2, $zero, 0
st $2, 32($sp)
addiu $2, $zero, 2
st $2, 40($sp)
st $2, 28($sp)
st $2, 24($sp)
st $2, 20($sp)
st $2, 16($sp)
ld $2, 32($sp)
ld $3, 44($sp)
ld $4, 40($sp)
ld $5, 36($sp)
ld $t9, 20($sp)
slt $5, $5, $t9
ld $t9, 24($sp)
slt $4, $4, $t9
ld $t9, 28($sp)
slt $3, $3, $t9
shl $3, $3, 31
sra $3, $3, 31
ld $t9, 16($sp)
st $3, 12($sp)
shl $3, $4, 31
sra $3, $3, 31
st $3, 8($sp)
shl $3, $5, 31
sra $3, $3, 31
st $3, 4($sp)
slt $2, $2, $t9
shl $2, $2, 31
sra $2, $2, 31
st $2, 0($sp)
ld $2, 12($sp)
ld $2, 8($sp)
ld $2, 4($sp)
ld $2, 0($sp)
ld $3, 4($sp)
addu $2, $2, $3
ld $3, 12($sp)
ld $3, 8($sp)
ld $3, 0($sp)
ld $3, 8($sp)
addu $2, $2, $3
ld $3, 12($sp)
ld $3, 4($sp)
ld $3, 0($sp)
ld $3, 12($sp)
addu $2, $2, $3
ld $3, 8($sp)
ld $3, 4($sp)
ld $3, 0($sp)
addiu $sp, $sp, 48
ret $lr
.set macro
.set reorder
.end _Z16test_cmplt_shortv
$func_end0:
.size _Z16test_cmplt_shortv, ($func_end0)-_Z16test_cmplt_shortv
.ident "Apple LLVM version 7.0.0 (clang-700.1.76)"
.section ".note.GNU-stack","",@progbits
Since test_longlong_shift2() in ch7_1_vector.cpp requires storeRegToStack() in Cpu0SEInstInfo.cpp, it cannot be verified at this point.
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.h
/// getSetCCResultType - get the ISD::SETCC result ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
lbdex/chapters/Chapter7_1/Cpu0ISelLowering.cpp
/// Returns the value type of an ISD::SETCC result for operands of type
/// \p VT: the vector type with its elements changed to integers when \p VT
/// is a vector, MVT::i32 otherwise.
EVT Cpu0TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                           EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  return MVT::i32;
}