Autore Topic: An assembler/disassembler for the QPU processors on the Raspberry Pi  (Letto 152 volte)

0 Utenti e 1 Visitatore stanno visualizzando questo topic.

Offline Flavio58

Re:An assembler/disassembler for the QPU processors on the Raspberry Pi
« Risposta #5 il: Marzo 14, 2018, 03:32:55 am »
Advertisement
Codice: [Seleziona]
// QPU Instruction unpacking
//
// Add/Mul Operations:
//   mulop:3 addop:5 ra:6 rb:6 adda:3 addb:3 mula:3 mulb:3, op:4 packbits:8 addcc:3 mulcc:3 F:1 X:1 wa:6 wb:6
//
// Branches:
//   addr:32, 1111 0000 cond:4 relative:1 register:1 ra:5 X:1 wa:6 wb:6
//
// 32 Bit Immediates:
//   data:32, 1110 unknown:8 addcc:3 mulcc:3 F:1 X:1 wa:6 wb:6

// Scratch storage for the short-lived strings built by the disassembler helpers.
//   tmpbuff : backing store shared by every tmpalloc() call
//   tmpnext : offset of the next unused byte in tmpbuff
//   tmpthis : offset handed out by the most recent tmpalloc()
unsigned tmpthis=0;
unsigned tmpnext=0;
char tmpbuff[256];
// tmpalloc(sizebytes): reserve sizebytes bytes of tmpbuff and yield a pointer to them.
// When the request does not fit behind tmpnext the allocator restarts at offset 0,
// so older results are overwritten; nothing is ever freed.
// The argument is parenthesized so that an expression argument (e.g. `a|b`) is
// taken as a whole instead of being re-associated with the surrounding operators.
// Note that sizebytes is evaluated twice; do not pass an expression with side effects.
#define tmpalloc(sizebytes) ( tmpthis = ((tmpnext+(sizebytes)) > sizeof(tmpbuff)) ? 0 : tmpnext, tmpnext = (tmpthis+(sizebytes)), &tmpbuff[tmpthis])

// Return the printable name of one ALU input operand.
//   ra, rb  : raw read-address fields of the instruction
//   adda    : input mux value (0-5 index acc_names, 6 uses banka_r[ra], 7 uses rb)
//   op      : control field; when it equals 13, rb indexes imm[] instead of bankb_r[]
//   rotator : non-zero when the caller wants imm[] entries 48 and above appended
//             to the operand name
// The result is either a table entry, the literal "err?", or a string built in
// the tmpalloc() scratch buffer (which a later tmpalloc() may overwrite).
const char *qpu_r(uint32_t ra, uint32_t rb, uint32_t adda, uint32_t op, int rotator) {
    const int uses_imm = (op == 13);

    // imm[] entries from 48 upwards are suffix strings (" >> ..."); they are
    // glued onto the operand name rather than replacing it.
    if (uses_imm && rb >= 48 && rotator) {
        if (adda == 7)
            return "err?";

        const char *operand = (adda == 6) ? banka_r[ra] : acc_names[adda];
        char *text = tmpalloc(32);
        sprintf(text, "%s%s", operand, imm[rb]);
        return text;
    }

    switch (adda) {
    case 6:
        return banka_r[ra];
    case 7:
        return (uses_imm && rb < 48) ? imm[rb] : bankb_r[rb];
    default:
        return acc_names[adda];
    }
}

// Name of the add-pipe destination: write address wa looked up in bankb_w when
// X is set, otherwise in banka_w.
const char *qpu_w_add(uint32_t wa, uint32_t X) {
    if (X)
        return bankb_w[wa];
    return banka_w[wa];
}

// Name of the mul-pipe destination: write address wb looked up in banka_w when
// X is set, otherwise in bankb_w (the mirror image of qpu_w_add).
const char *qpu_w_mul(uint32_t wb, uint32_t X) {
    if (X)
        return banka_w[wb];
    return bankb_w[wb];
}

// Unpack suffix for an add-pipe source operand, or "" when none applies.
//   packmul == 0 : only a source with mux value 6 gets a suffix (srcunpackadd)
//   packmul == 1 : only a source with mux value 4 gets a suffix (srcunpackmul)
const char *qpu_unpack_add(uint32_t packmul, uint32_t unpack, uint32_t adda) {
    switch (packmul) {
    case 0:
        if (adda == 6)
            return srcunpackadd[unpack];
        break;
    case 1:
        if (adda == 4)
            return srcunpackmul[unpack];
        break;
    default:
        break;
    }
    return "";
}

Codice: [Seleziona]

// Unpack suffix for a mul-pipe source operand, or "" when none applies.
// A suffix from srcunpackmul is produced when packmul is 0 and the mux value is 6,
// or when packmul is 1 and the mux value is 4.
const char *qpu_unpack_mul(uint32_t packmul, uint32_t unpack, uint32_t adda) {
    const int from_mux6 = (packmul == 0) && (adda == 6);
    const int from_mux4 = (packmul == 1) && (adda == 4);

    return (from_mux6 || from_mux4) ? srcunpackmul[unpack] : "";
}

// Pack suffix for the add-pipe destination, or "" when none applies.
// A suffix is only produced when packmul is 0, X is 0 and wa is at most 32.
const char *qpu_pack_add(uint32_t packmul, uint32_t pack, uint32_t wa, uint32_t X) {
    //todo: what is the real limit on ra range?
    if (packmul != 0 || X != 0 || wa > 32)
        return "";
    return dstpackadd[pack];
}

// Pack suffix for the mul-pipe destination, or "" when none applies.
// A suffix from dstpackmul is produced when packmul is 1, or when packmul is 0
// with X set and the write address (parameter wa) at most 32.
const char *qpu_pack_mul(uint32_t packmul, uint32_t pack, uint32_t wa, uint32_t X) {
    //todo: what is the real limit on ra range?
    const int swapped_write = (packmul == 0) && (X == 1) && (wa <= 32);

    if (swapped_write || packmul == 1)
        return dstpackmul[pack];
    return "";
}

// Decode one add/mul ALU instruction and print it to stdout.
//   i0 : first instruction word  (mulop, addop, ra, rb and the four input muxes)
//   i1 : second instruction word (op, pack bits, condition codes, F, X, wa, wb)
// Output: an optional raw field dump (when showfields is non-zero), then the add
// operation, then "; <mul operation>" and "; <control op>" when they are shown,
// then a newline.
void show_qpu_add_mul(uint32_t i0, uint32_t i1)
{
// Field extraction from the first word.
uint32_t mulop = (i0 >> 29) & 0x7;
uint32_t addop = (i0 >> 24) & 0x1f;
uint32_t ra    = (i0 >> 18) & 0x3f;
uint32_t rb    = (i0 >> 12) & 0x3f;
uint32_t adda  = (i0 >>  9) & 0x07;
uint32_t addb  = (i0 >>  6) & 0x07;
uint32_t mula  = (i0 >>  3) & 0x07;
uint32_t mulb  = (i0 >>  0) & 0x07;
// Field extraction from the second word.
uint32_t op    = (i1 >> 28) & 0x0f;
// The 8 pack bits split into unpacking (top 3), packmul (1) and packing (low 4).
uint32_t packbits  = (i1 >> 20) & 0xff;
uint32_t unpacking = (packbits >> 5) & 0x7;
uint32_t packmul   = (packbits >> 4) & 0x1;
uint32_t packing   = (packbits >> 0) & 0xf;
uint32_t addcc = (i1 >> 17) & 0x07;
uint32_t mulcc = (i1 >> 14) & 0x07;
uint32_t F     = (i1 >> 13) & 0x01;
uint32_t X     = (i1 >> 12) & 0x01;
uint32_t wa    = (i1 >> 6) & 0x3f;
uint32_t wb    = (i1 >> 0) & 0x3f;

if (showfields) {
printf("mulop=%d, addop=%d, ra=%d, rb=%d, adda=%d, addb=%d, mula=%d, mulb=%d, op=%d, unpacking=%d, packmul=%d, packing=%d, addcc=%d, mulcc=%d, F=%d, X=%d, wa=%d, wb=%d  \n",
mulop, addop, ra, rb, adda, addb, mula, mulb, op, unpacking, packmul, packing, addcc, mulcc, F, X, wa, wb);
}

// ".setf" is attached to the add operation only when that operation is a real,
// conditionally enabled op; otherwise a set F bit is shown on the mul operation.
// Both are computed here, before addcc/addop are rewritten below.
uint32_t addF  = (F==1) && (addop != 0) && (addcc != 0);
uint32_t mulF  = (F==1) && !addF;

// Instruction formats:
// op[cc][setf]
// op[cc][setf] rd[.pack]
// op[cc][setf] rd[.pack], ra[.unpack]
// op[cc][setf] rd[.pack], ra[.unpack], rb[.unpack]
// args[] is indexed by arity; each "%s%s" pair is an operand name plus its suffix.
const char *args[] = {
"", " %s%s", " %s%s, %s%s", " %s%s, %s%s, %s%s"
};

uint32_t arity = 3;
if (addop == 0) {
// A nop prints no operands, and cc index 1 (the empty suffix) is forced.
arity = 0;
addcc = 1;
}
else if ((adda == addb) && ((addop == 7) || (addop == 8) || (addop == 21) || (addop == 23) || (addop == 24))) {
// Same source on both inputs for these opcodes: print a single source operand.
arity = 2;
// addop 21 with identical inputs is printed as entry 32 of addops[] ("mov").
if (addop == 21) addop = 32;
}

// add op always
printf("%s%s%s", addops[addop], cc[addcc], setf[addF]);
// All six strings are always passed; formats with lower arity ignore the extras.
printf(args[arity], qpu_w_add(wa, X), qpu_pack_add(packmul, packing, wa, X), qpu_r(ra, rb, adda, op, 0), qpu_unpack_add(packmul, unpacking, adda), qpu_r(ra, rb, addb, op, 0), qpu_unpack_add(packmul, unpacking, addb));

// show mul op if non nop or control op is non nop
        if (mulop || (op != 1)) {

// This arity shadows the outer one and applies to the mul operation only.
uint32_t arity = 3;
if (mulop == 0) {
arity = 0;
mulcc = 1;
}
else if ((mula == mulb) && (mulop == 4)) {
arity = 2;
// mulop 4 with identical inputs is printed as entry 8 of mulops[] ("mov").
if (mulop == 4) mulop = 8;
}

printf("; %s%s%s", mulops[mulop], cc[mulcc], setf[mulF]);
///* 000003a0: 36020037 18025841 */  xor r1, r0, r0; fmul ra1, ra0, unif
// Mul sources call qpu_r with rotator=1, unlike the add sources above.
printf(args[arity], qpu_w_mul(wb, X), qpu_pack_mul(packmul, packing, wb, X), qpu_r(ra, rb, mula, op, 1), qpu_unpack_mul(packmul, unpacking, mula), qpu_r(ra, rb, mulb, op, 1), qpu_unpack_mul(packmul, unpacking, mulb));
}

// show control op if non nop
// (ops[1] and ops[13] are both "nop" in the ops[] table)
if ((op != 1) && (op != 13)) {
printf("; %s", ops[op]);
}
printf("\n");

}

void show_qpu_branch(uint32_t i0, uint32_t i1)
{
uint32_t addr     = i0;
uint32_t unknown  = (i1 >> 24) & 0x0f;
uint32_t cond     = (i1 >> 20) & 0x0f;
uint32_t pcrel    = (i1 >> 19) & 0x01;
uint32_t addreg   = (i1 >> 18) & 0x01;
uint32_t ra       = (i1 >> 13) & 0x1f;
uint32_t X        = (i1 >> 12) & 0x01;
uint32_t wa       = (i1 >>  6) & 0x3f;
uint32_t wb       = (i1 >>  0) & 0x3f;

if (showfields) {
printf("branch addr=0x%08x, unknown=%x, cond=%02d, pcrel=%x, addreg=%x, ra=%02d, X=%x, wa=%02d, wb=%02x\n",
addr, unknown, cond, pcrel, addreg, ra, X, wa, wb);
}
// branch: b[link][cc] [linkreg,] [basedreg,]
if (wa==39)
printf("%s%s %s, %s%+d", pcrel ? "brr" : "bra", bcc[cond], qpu_w_mul(wb, X), addreg ? qpu_r(ra, ra, 6, (i1 >> 28)&0xf, 0) : "", addr);
else if (wb==39)
printf("%s%s %s, %s%+d", pcrel ? "brr" : "bra", bcc[cond], qpu_w_add(wa, X), addreg ? qpu_r(ra, ra, 6, (i1 >> 28)&0xf, 0) : "", addr);
else
printf("%s%s %s, %s, %s%+d", pcrel ? "brr" : "bra", bcc[cond], qpu_w_add(wa, X), qpu_w_mul(wb, X), addreg ? qpu_r(ra, ra, 6, (i1 >> 28)&0xf, 0) : "", addr);

if (!addreg) printf(" // 0x%08x", base+addr+8*4);
printf("\n");

}

// Format the 32-bit immediate of a load-immediate instruction.
//   unpack 1 or 3 : data is shown as 16 two-bit elements "[e0, ..., e15]"; element i
//                   takes its low bit from bit i and its high bit from bit 16+i.
//                   With unpack 1 the elements are sign-extended, with 3 they are not.
//   anything else : data is shown as one hexadecimal word.
// The text lives in the tmpalloc() scratch buffer and may be overwritten by a
// later tmpalloc() call.
const char *qpu_ldi_unpack(uint32_t unpack, uint32_t data)
{
    char *text = tmpalloc(128);
    const int per_element = (unpack == 1) || (unpack == 3);

    if (!per_element) {
        sprintf(text, "0x%08x", data);
        return text;
    }

    int elem[16];
    for (int lane = 0; lane < 16; lane++) {
        int value = ((data >> (16+lane-1)) & 0x2) | ((data >> lane) & 0x1);
        // unpack = 1 (2 bit signed vectors), 3 = (2 bit unsigned vectors)
        if ((unpack == 1) && (value & 0x2))
            value |= 0xfffffffc;
        elem[lane] = value;
    }

    sprintf(text, "[%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d]",
            elem[0], elem[1], elem[2], elem[3], elem[4], elem[5], elem[6], elem[7],
            elem[8], elem[9], elem[10], elem[11], elem[12], elem[13], elem[14], elem[15]);
    return text;
}

// Decode one 32-bit-immediate instruction and print it to stdout.
//   i0 : first instruction word, the immediate data
//   i1 : second instruction word (pack bits, condition codes, F, X, wa, wb)
// The add half is printed first ("nop" when it has no pack bits, condition 0 and
// write address 39); the mul half follows after "; " when mulcc is non-zero.
// When bit 2 of the unpack field is set the mnemonic becomes "sacq"/"srel",
// selected by bit 4 of the data.
void show_qpu_imm32(uint32_t i0, uint32_t i1)
{
    uint32_t data = i0;
    uint32_t packbits  = (i1 >> 20) & 0xff;
    uint32_t unpacking = (packbits >> 5) & 0x7;
    uint32_t packmul   = (packbits >> 4) & 0x1;
    uint32_t packing   = (packbits >> 0) & 0xf;
    uint32_t addcc   = (i1 >> 17) & 0x07;
    uint32_t mulcc   = (i1 >> 14) & 0x07;
    uint32_t F       = (i1 >> 13) & 0x01;
    uint32_t X       = (i1 >> 12) & 0x01;
    uint32_t wa      = (i1 >>  6) & 0x3f;
    uint32_t wb      = (i1 >>  0) & 0x3f;

    if (showfields) {
        printf("imm32 data=0x%08x, unpacking=0x%d, packmul=%d, packing=%d, addcc=%x, mulcc=%x, F=%x, X=%x, wa=%02d, wb=%02d\n",
               data, unpacking, packmul, packing, addcc, mulcc, F, X, wa, wb);
    }

    const char *inst = ops[(i1 >> 28) & 0xf];

    if (unpacking & 0x4) {
        inst = (data & 0x10) ? "sacq" : "srel";
        // Small values have the sacq/srel selector bit removed before printing.
        if (data <= 0x1f)
            data = data & 0xffffffef;
    }

    // addop: op[cc][setf] rd[.pack?], immediate
    if (packbits==0 && addcc==0 && wa==39)
        printf("nop");
    else
        printf("%s%s%s %s%s, %s", inst, cc[addcc], setf[F], qpu_w_add(wa, X), qpu_pack_add(packmul, packing, wa, X), qpu_ldi_unpack(unpacking, data));

    // mulop: [op[cc][setf] rd[.pack?], immediate
    if (mulcc) {
        // The mul half writes to wb, so its pack suffix must be chosen from wb
        // (as show_qpu_add_mul does), not from the add half's wa.
        printf("; %s%s%s %s%s, %s", inst, cc[mulcc], setf[F], qpu_w_mul(wb, X), qpu_pack_mul(packmul, packing, wb, X), qpu_ldi_unpack(unpacking, data));
    }

    printf("\n");
}

// Print one instruction. inst points at its two 32-bit words; the top four bits
// of the second word choose the decoder: 14 -> immediate load, 15 -> branch,
// everything else -> add/mul ALU instruction.
void show_qpu_inst(uint32_t *inst) {
    const uint32_t word0 = inst[0];
    const uint32_t word1 = inst[1];

    switch ((int)((word1 >> 28) & 0xf)) {
    case 14:
        show_qpu_imm32(word0, word1);
        break;
    case 15:
        show_qpu_branch(word0, word1);
        break;
    default:
        show_qpu_add_mul(word0, word1);
        break;
    }
}

// Disassemble a buffer of instructions to stdout, one line per instruction.
//   inst   : array of 32-bit words; each instruction occupies two consecutive words
//   length : number of 32-bit words in inst
// Each line starts with the byte offset and the two raw words. The global `base`
// is set to the byte offset of the instruction being printed so that the branch
// decoder can compute targets.
// A trailing unpaired word (odd length) is ignored instead of being decoded
// together with whatever lies beyond the buffer, and a NULL buffer or a
// non-positive length prints nothing but the final blank line.
void show_qpu_fragment(uint32_t *inst, int length) {
    if (inst != 0 && length > 1) {
        const uint32_t words = (uint32_t)length;

        for (uint32_t i = 0; i + 1 < words; i += 2) {
            base = i*4;
            printf("/* %08x: %08x %08x */  ", i*4, inst[i], inst[i+1]);
            show_qpu_inst(&inst[i]);
        }
    }
    printf("\n");
}

// Read a whole file into a freshly allocated buffer.
//   filename : path of the file, opened in binary mode
//   filesize : optional; when the file could be opened it receives the file
//              size in bytes (0 if the size could not be determined). It is
//              left untouched when the file cannot be opened.
// Returns the buffer (release it with file_unload / free), or NULL when the file
// cannot be opened, is empty, cannot be read completely, or memory runs out.
// One extra zero byte is stored directly after the file contents.
uint32_t *file_load(const char *filename, uint32_t *filesize) {
    FILE *f = fopen(filename, "rb");
    if (f == NULL)
        return NULL;

    uint32_t *memory = NULL;
    long size = -1;

    if (fseek(f, 0, SEEK_END) == 0)
        size = ftell(f);

    if (size > 0 && fseek(f, 0, SEEK_SET) == 0) {
        // The cast keeps the code valid when the file is built as C++.
        memory = (uint32_t*)malloc((size_t)size + 1);
        if (memory != NULL) {
            // The terminator is a single byte at byte offset `size`; indexing the
            // uint32_t pointer with `size` would write far beyond the allocation.
            ((unsigned char*)memory)[size] = 0;
            if (fread(memory, (size_t)size, 1, f) != 1) {
                free(memory);
                memory = NULL;
            }
        }
    }

    fclose(f);
    if (filesize)
        *filesize = (size > 0) ? (uint32_t)size : 0;
    return memory;
}

// Release a buffer obtained from file_load(). Passing a null pointer is harmless
// because free() accepts it.
void file_unload(uint32_t *data) {
    void *block = data;

    free(block);
}

// Load the named file and print its disassembly to stdout. The byte size reported
// by file_load() is divided by four to obtain the number of 32-bit words handed
// to show_qpu_fragment(). A load failure is reported on stdout and nothing else
// is printed.
void qpu_dis_file(const char *filename) {
    uint32_t size;
    uint32_t *fragment;

    printf("Disassembling %s\n", filename);

    fragment = file_load(filename, &size);
    if (fragment != 0) {
        printf("Fragment %s, size %d\n", filename, size/4);
        show_qpu_fragment(fragment, (size/4));
        file_unload(fragment);
    } else {
        printf("Couldn't read fragment %s\n", filename);
    }
}

// Entry point of the disassembler: the first command-line argument names the
// file to disassemble. Without it a usage message goes to stderr and the
// process exits with status 1.
int main(int argc, char * argv[]) {
    if (argc >= 2) {
        qpu_dis_file(argv[1]);
        return 0;
    }

    fprintf(stderr, "qpu-disassemble: Pass in a file name to disassemble as the first argument\n");
    exit(1);
}


Makefile

Codice: [Seleziona]
# Build both tools: the assembler (qpu-asm) and the disassembler (qpu-dis).
# `all` is not a file, so it is declared phony.
# Recipe lines must start with a TAB character, otherwise make stops with
# "missing separator".
.PHONY: all
all: qpu-asm qpu-dis

qpu-asm: qpu-asm.cpp
	g++ -g -o qpu-asm qpu-asm.cpp

qpu-dis: qpu-dis.cpp
	g++ -g -o qpu-dis qpu-dis.cpp
Consulente in Informatica dal 1984

Software automazione, progettazione elettronica, computer vision, intelligenza artificiale, IoT, sicurezza informatica, tecnologie di sicurezza militare, SIGINT. 

Facebook:https://www.facebook.com/flaviobernardotti58
Twitter : https://www.twitter.com/Flavio58

Cell:  +39 366 3416556

f.bernardotti@deeplearningitalia.eu

#deeplearning #computervision #embeddedboard #iot #ai

Offline Flavio58

Re:An assembler/disassembler for the QPU processors on the Raspberry Pi
« Risposta #4 il: Marzo 14, 2018, 03:32:37 am »
Codice: [Seleziona]
# VPM_BLOCK_READ_SETUP
# ~~~~~~~~~~~~~~~~~~~~
# Controls how values are read from the VPM data cache into the QPU.
# Arguments:
#  NUM: 0-16 - How many elements to read at a time.
#  STRIDE: 0-64 - The amount to increment the address by after each read.
#  HORIZ: 0 or 1 - Whether the layout is horizontal (1) or vertical (0).
#  LANED: 0 or 1 - Whether the layout is laned (1) or packed (0).
#  SIZE: 0, 1, 2 - The data unit size, 8-bit (0), 16-bit(1), or 32-bit (2).
#  ADDR: 0-255 - Packed address, meaning depends on exact unit size and mode.
# See http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf page 58
#
# Each *_SHIFT macro is the bit position of the lowest bit of its field in the
# setup word. The *_VALUE macro ORs the shifted arguments together (the ID field
# is fixed at 0); arguments are not range-checked or masked.
# VPM_BLOCK_READ_SETUP itself expands to an `ldi` of that value into ra49.
define(`VPM_BLOCK_READ_SETUP_ID_SHIFT', 30)
define(`VPM_BLOCK_READ_SETUP_NUM_SHIFT', 20)
define(`VPM_BLOCK_READ_SETUP_STRIDE_SHIFT', 12)
define(`VPM_BLOCK_READ_SETUP_HORIZ_SHIFT', 11)
define(`VPM_BLOCK_READ_SETUP_LANED_SHIFT', 10)
define(`VPM_BLOCK_READ_SETUP_SIZE_SHIFT', 8)
define(`VPM_BLOCK_READ_SETUP_ADDR_SHIFT', 0)
define(`VPM_BLOCK_READ_SETUP_VALUE', `eval(
(0<<VPM_BLOCK_READ_SETUP_ID_SHIFT)|
($1<<VPM_BLOCK_READ_SETUP_NUM_SHIFT)|
($2<<VPM_BLOCK_READ_SETUP_STRIDE_SHIFT)|
($3<<VPM_BLOCK_READ_SETUP_HORIZ_SHIFT)|
($4<<VPM_BLOCK_READ_SETUP_LANED_SHIFT)|
($5<<VPM_BLOCK_READ_SETUP_SIZE_SHIFT)|
($6<<VPM_BLOCK_READ_SETUP_ADDR_SHIFT))')
define(`VPM_BLOCK_READ_SETUP', `ldi ra49, VPM_BLOCK_READ_SETUP_VALUE($1, $2, $3, $4, $5, $6)')

# VPM_DMA_STORE_SETUP
# ~~~~~~~~~~~~~~~~~~~
# Configures the DMA controller to transfer data from the VPM cache to main memory.
# Once the setup's been done, you then need to call VPM_DMA_STORE_START to kick
# off the transfer.
# Arguments:
#  UNITS: 0-128 - Number of rows of 2D block in memory.
#  DEPTH: 0-128 - How long each row is (in bytes?).
#  HORIZ: 0 or 1 - Whether the layout is horizontal (1) or vertical (0).
#  ADDRY: The Y coordinate of the address in the VPM space to start from.
#  ADDRX: The X coordinate of the address in the VPM space to start from.
#  MODEW: 0-7 : 0 is 32-bit, 2-3 is 16-bit with offset, 4-7 is 8-bit with offset.
# See http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf page 58
#
# The *_VALUE macro ORs the shifted arguments together with the ID field fixed
# at 2; arguments are not range-checked or masked.
# VPM_DMA_STORE_SETUP itself expands to an `ldi` of that value into rb49.
define(`VPM_DMA_STORE_SETUP_ID_SHIFT', 30)
define(`VPM_DMA_STORE_SETUP_UNITS_SHIFT', 23)
define(`VPM_DMA_STORE_SETUP_DEPTH_SHIFT', 16)
define(`VPM_DMA_STORE_SETUP_HORIZ_SHIFT', 14)
define(`VPM_DMA_STORE_SETUP_ADDRY_SHIFT', 7)
define(`VPM_DMA_STORE_SETUP_ADDRX_SHIFT', 3)
define(`VPM_DMA_STORE_SETUP_MODEW_SHIFT', 0)
define(`VPM_DMA_STORE_SETUP_VALUE', `eval(
(2<<VPM_DMA_STORE_SETUP_ID_SHIFT)|
($1<<VPM_DMA_STORE_SETUP_UNITS_SHIFT)|
($2<<VPM_DMA_STORE_SETUP_DEPTH_SHIFT)|
($3<<VPM_DMA_STORE_SETUP_HORIZ_SHIFT)|
($4<<VPM_DMA_STORE_SETUP_ADDRY_SHIFT)|
($5<<VPM_DMA_STORE_SETUP_ADDRX_SHIFT)|
($6<<VPM_DMA_STORE_SETUP_MODEW_SHIFT))')
define(`VPM_DMA_STORE_SETUP', `ldi rb49, VPM_DMA_STORE_SETUP_VALUE($1, $2, $3, $4, $5, $6)')

# VPM_DMA_STORE_START
# ~~~~~~~~~~~~~~~~~~~
# Kicks off the transfer of data from the local VPM data cache to main memory.
# It will use the settings from VPM_DMA_STORE_SETUP to control the copy process.
# Arguments:
#  address: A register name that holds the address in main memory to write to.
# Expands to an `or` of the address register with 0 written to rb50, paired with
# a nop on the other pipe.
define(`VPM_DMA_STORE_START', `or rb50, $1, 0;          nop')

# VPM_DMA_STORE_WAIT_FOR_COMPLETION
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pause until the previous DMA store operation has finished.
# Expands to a read of rb50 whose result is written to rb39.
define(`VPM_DMA_STORE_WAIT_FOR_COMPLETION', `or rb39, rb50, rb50;       nop')

# VPM_DMA_LOAD_SETUP
# ~~~~~~~~~~~~~~~~~~
# Initializes the settings for transferring data from main memory into the VPM cache.
# Arguments:
#  MODEW: 0-7 : 0 is 32-bit, 2-3 is 16-bit with offset, 4-7 is 8-bit with offset.
#  MPITCH: 0-15: The amount to increment the memory pointer between rows, calculated as 8*2^MPITCH bytes.
#  ROWLEN: 0-15: The number of elements in each row in main memory.
#  NROWS: 0-15: How many rows to read from memory.
#  VPITCH: 0-15: How much to increment the VPM address by after each row is loaded.
#  VERT: 0 or 1 - Whether the layout is vertical (1) or horizontal (0). Be careful, this is inverted compared to normal.
#  ADDRY: 0-64 - The Y coordinate of the address in the VPM space to start loading into.
#  ADDRX: 0-16 - The X coordinate of the address in the VPM space to start loading into.
#
# The *_VALUE macro ORs the eight shifted arguments together with the ID field
# (a single bit at position 31) set to 1; arguments are not range-checked or masked.
# VPM_DMA_LOAD_SETUP itself expands to an `ldi` of that value into ra49.
define(`VPM_DMA_LOAD_SETUP_ID_SHIFT', 31)
define(`VPM_DMA_LOAD_SETUP_MODEW_SHIFT', 28)
define(`VPM_DMA_LOAD_SETUP_MPITCH_SHIFT', 24)
define(`VPM_DMA_LOAD_SETUP_ROWLEN_SHIFT', 20)
define(`VPM_DMA_LOAD_SETUP_NROWS_SHIFT', 16)
define(`VPM_DMA_LOAD_SETUP_VPITCH_SHIFT', 12)
define(`VPM_DMA_LOAD_SETUP_VERT_SHIFT', 11)
define(`VPM_DMA_LOAD_SETUP_ADDRY_SHIFT', 4)
define(`VPM_DMA_LOAD_SETUP_ADDRX_SHIFT', 0)
define(`VPM_DMA_LOAD_SETUP_VALUE', `eval(
(1<<VPM_DMA_LOAD_SETUP_ID_SHIFT)|
($1<<VPM_DMA_LOAD_SETUP_MODEW_SHIFT)|
($2<<VPM_DMA_LOAD_SETUP_MPITCH_SHIFT)|
($3<<VPM_DMA_LOAD_SETUP_ROWLEN_SHIFT)|
($4<<VPM_DMA_LOAD_SETUP_NROWS_SHIFT)|
($5<<VPM_DMA_LOAD_SETUP_VPITCH_SHIFT)|
($6<<VPM_DMA_LOAD_SETUP_VERT_SHIFT)|
($7<<VPM_DMA_LOAD_SETUP_ADDRY_SHIFT)|
($8<<VPM_DMA_LOAD_SETUP_ADDRX_SHIFT))')
define(`VPM_DMA_LOAD_SETUP', `ldi ra49, VPM_DMA_LOAD_SETUP_VALUE($1, $2, $3, $4, $5, $6, $7, $8)')

# VPM_DMA_LOAD_START
# ~~~~~~~~~~~~~~~~~~
# Kicks off the transfer of data from main memory to the local VPM data cache.
# It will use the settings from VPM_DMA_LOAD_SETUP to control the copy process.
# Arguments:
#  address: A register name that holds the address in main memory to read from.
# Expands to an `or` of the address register with 0 written to ra50, paired with
# a nop on the other pipe.
define(`VPM_DMA_LOAD_START', `or ra50, $1, 0;          nop')

# VPM_DMA_LOAD_WAIT_FOR_COMPLETION
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pause until the previous DMA load operation has finished.
# Expands to a read of ra50 whose result is written to rb39.
define(`VPM_DMA_LOAD_WAIT_FOR_COMPLETION', `or rb39, ra50, ra50;       nop')

# END_PROGRAM
# ~~~~~~~~~~~
# Triggers a host interrupt to transfer control back to the main CPU.
# Two variants are provided:
#  END_PROGRAM_HARD: writes to rb38 first, then issues an instruction carrying
#                    the .tend flag followed by two further nop instructions.
#  END_PROGRAM_SOFT: issues only the .tend instruction followed by two NOP macro
#                    invocations; it does not write rb38.
# NOTE(review): the two instructions after .tend are presumably required
# delay slots - confirm against the architecture guide.
define(`END_PROGRAM_HARD', `
or rb38, r0, 1;       nop
nop.tend ra39, ra39, ra39;       nop rb39, rb39, rb39
nop ra39, ra39, ra39;       nop rb39, rb39, rb39
nop ra39, ra39, ra39;       nop rb39, rb39, rb39')

define(`END_PROGRAM_SOFT', `
nop.tend ra39, ra39, ra39;      nop rb39, rb39, rb39
NOP
NOP
')

# NOP
# ~~~
# Do nothing on both pipes for a cycle.
# Expands to one instruction line with a nop on each side of the `;`, using
# ra39 and rb39 for every operand.
define(`NOP', `nop ra39, ra39, ra39;       nop rb39, rb39, rb39')


Codice: [Seleziona]
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

// Forward declarations for the two disassembly entry points defined further down.
void show_qpu_inst(uint32_t *inst);
void show_qpu_fragment(uint32_t *inst, int length);


// Byte offset of the instruction currently being printed; set by
// show_qpu_fragment() and read by show_qpu_branch() to compute branch targets.
int base;
// When non-zero, every decoder prints a raw dump of the instruction fields
// before the disassembled text.
int showfields = 1;

// Names for input mux values 0-5 (the accumulators). Mux values 6 and 7 are
// handled by the callers and must not be used as an index here.
const char *acc_names[] = {
"r0", "r1", "r2", "r3", "r4", "r5"
};

// Names printed for a 6-bit read address when the input mux selects value 6.
// Entries ending in '?' are addresses this tool has no name for.
const char *banka_r[64] = {
"ra0", "ra1", "ra2", "ra3", "ra4", "ra5", "ra6", "ra7",
"ra8", "ra9", "ra10", "ra11", "ra12", "ra13", "ra14", "ra15", //ra15 is w in shaders
"ra16", "ra17", "ra18", "ra19", "ra20", "ra21", "ra22", "ra23",
"ra24", "ra25", "ra26", "ra27", "ra28", "ra29", "ra30", "ra31",
"unif", "ra33?", "ra34?", "vary", "ra36?", "ra37?", "elem_num", "nop",
"ra40", "x_coord", "ms_mask", "ra43?", "ra44?", "ra45?", "ra46?", "ra47?",
"vpm", "vr_busy", "vr_wait", "mutex", "ra52?", "ra53?", "ra54?", "ra55?",
"ra56?", "ra57?", "ra58?", "ra59?", "ra60?", "ra61?", "ra62?", "ra63?",
};

// Names printed for a 6-bit read address when the input mux selects value 7
// (and the rb field is not being used as an index into imm[]).
const char *bankb_r[64] = {
"rb0", "rb1", "rb2", "rb3", "rb4", "rb5", "rb6", "rb7",
"rb8", "rb9", "rb10", "rb11", "rb12", "rb13", "rb14", "rb15", //rb15 is z in shaders
"rb16", "rb17", "rb18", "rb19", "rb20", "rb21", "rb22", "rb23",
"rb24", "rb25", "rb26", "rb27", "rb28", "rb29", "rb30", "rb31",
"unif", "rb33?", "rb34?", "vary", "rb36?", "rb37?", "qpu_num", "nop",
"rb40?", "y_coord", "rev_flag", "rb43?", "rb44?", "rb45?", "rb46?", "rb47?",
"vpm", "vw_busy", "vw_wait", "mutex", "rb52?", "rb53?", "rb54?", "rb55?",
"rb56?", "rb57?", "rb58?", "rb59?", "rb60?", "rb61?", "rb62?", "rb63?",
};

// Names printed for a 6-bit write address: used by qpu_w_add() when X is 0 and
// by qpu_w_mul() when X is 1.
const char *banka_w[64] = {
"ra0", "ra1", "ra2", "ra3", "ra4", "ra5", "ra6", "ra7",
"ra8", "ra9", "ra10", "ra11", "ra12", "ra13", "ra14", "ra15", //ra15 is w in shaders
"ra16", "ra17", "ra18", "ra19", "ra20", "ra21", "ra22", "ra23",
"ra24", "ra25", "ra26", "ra27", "ra28", "ra29", "ra30", "ra31",
"r0", "r1", "r2", "r3", "tmurs", "r5quad", "irq", "-",
"unif_addr", "x_coord", "ms_mask", "stencil", "tlbz", "tlbm", "tlbc", "tlbam",
"vpm", "vr_setup", "vr_addr", "mutex", "recip", "recipsqrt", "exp", "log",
"t0s", "t0t", "t0r", "t0b", "t1s", "t1t", "t1r", "t1b",
};

// Names printed for a 6-bit write address: used by qpu_w_add() when X is 1 and
// by qpu_w_mul() when X is 0.
const char *bankb_w[64] = {
"rb0", "rb1", "rb2", "rb3", "rb4", "rb5", "rb6", "rb7",
"rb8", "rb9", "rb10", "rb11", "rb12", "rb13", "rb14", "rb15", //rb15 is z in shaders
"rb16", "rb17", "rb18", "rb19", "rb20", "rb21", "rb22", "rb23",
"rb24", "rb25", "rb26", "rb27", "rb28", "rb29", "rb30", "rb31",
"r0", "r1", "r2", "r3", "tmurs", "r5rep", "irq", "-",
"unif_addr_rel", "y_coord", "rev_flag", "stencil", "tlbz", "tlbm", "tlbc", "tlbam",
"vpm", "vw_setup", "vw_addr", "mutex", "recip", "recipsqrt", "exp", "log",
"t0s", "t0t", "t0r", "t0b", "t1s", "t1t", "t1r", "t1b",
};

// Mnemonics for the 4-bit control field taken from the top bits of the second
// instruction word. Values 14 and 15 route to the immediate-load and branch
// decoders; entries 1 and 13 both print as "nop".
const char *ops[] = {
"bkpt", "nop", "thrsw", "thrend", "sbwait", "sbdone", "lthrsw", "loadcv",
"loadc", "ldcend", "ldtmu0", "ldtmu1", "loadam", "nop", "ldi", "bra",
};

// Mnemonics for the 5-bit add opcode (indices 0-31). Names of the form
// "addopN" are opcodes this tool has no mnemonic for. Index 32 ("mov") is not an
// encodable opcode: show_qpu_add_mul() substitutes it for opcode 21 when both
// inputs are the same.
const char *addops[] = {
"nop", "fadd", "fsub", "fmin", "fmax", "fminabs", "fmaxabs", "ftoi",
"itof", "addop9", "addop10", "addop11", "add", "sub", "shr", "asr",
"ror", "shl", "min", "max", "and", "or", "xor", "not",
"clz", "addop25", "addop26", "addop27", "addop28", "addop29", "v8adds", "v8subs",

"mov"
};

// Mnemonics for the 3-bit mul opcode (indices 0-7). Index 8 ("mov") is the
// alias show_qpu_add_mul() substitutes for opcode 4 when both inputs are the same.
const char *mulops[] = {
"nop", "fmul", "mul24", "v8muld", "v8min", "v8max", "v8adds", "v8subs",

"mov"
};

// Suffixes for the 3-bit add/mul condition codes; index 1 is the empty suffix.
const char *cc[] = {
".never", "", ".zs", ".zc", ".ns", ".nc", ".cs", ".cc"
};

// Destination pack suffixes for the add pipe, indexed by the 4-bit packing field.
const char *dstpackadd[] = {
"", ".16a", ".16b", ".8abcd", ".8a", ".8b", ".8c", ".8d", ".s", ".16as", ".16bs", ".8abcds", ".8as", ".8bs", ".8cs", ".8ds"
};

// Destination pack suffixes for the mul pipe, indexed by the 4-bit packing field.
// ".packmNN" entries are modes this tool has no name for.
const char *dstpackmul[] = {
"", ".packm01", ".packm02", ".8abcd", ".8a", ".8b", ".8c", ".8d", ".packm08", ".packm09", ".packm10", ".packm11", ".packm12", ".packm13", ".packm14", ".packm15"
};

// Source unpack suffixes, indexed by the 3-bit unpacking field. The add and mul
// tables currently hold identical strings.
const char *srcunpackadd[] = {
"", ".16a", ".16b", ".8dr", ".8a", ".8b", ".8c", ".8d"
};

const char *srcunpackmul[] = {
"", ".16a", ".16b", ".8dr", ".8a", ".8b", ".8c", ".8d"
};

// Suffixes for the 4-bit branch condition; index 15 is the empty suffix and
// ".ccNN" entries are conditions this tool has no name for.
const char *bcc[] = {
".allz", ".allnz", ".anyz", ".anynz", ".alln", ".allnn", ".anyn", ".anynn",
".allc", ".allnc", ".anyc", ".anync", ".cc12", ".cc13", ".cc14", ""
};

// Text for the 6-bit rb field when the control field is 13 (see qpu_r()).
// Indices 0-47 are operand values printed in place of a register name; indices
// 48-63 are " >> ..." suffixes that qpu_r() appends to the operand name.
const char *imm[] = {
"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", "10", "11", "12", "13", "14", "15",
"-16", "-15", "-14", "-13", "-12", "-11", "-10", "-9",
"-8", "-7", "-6", "-5", "-4", "-3", "-2", "-1",
"1.0", "2.0", "4.0", "8.0", "16.0", "32.0", "64.0", "128.0",
"1/256", "1/128", "1/64", "1/32", "1/16", "1/8", "1/4", "1/2",
" >> r5", " >> 1", " >> 2", " >> 3", " >> 4", " >> 5", " >> 6", " >> 7",
" >> 8", " >> 9", " >> 10", " >> 11", " >> 12", " >> 13", " >> 14", " >> 15"
};

// Suffix selected by a 0/1 "set flags" value: nothing, or ".setf".
const char *setf[] = {
"", ".setf"
};
Consulente in Informatica dal 1984

Software automazione, progettazione elettronica, computer vision, intelligenza artificiale, IoT, sicurezza informatica, tecnologie di sicurezza militare, SIGINT. 

Facebook:https://www.facebook.com/flaviobernardotti58
Twitter : https://www.twitter.com/Flavio58

Cell:  +39 366 3416556

f.bernardotti@deeplearningitalia.eu

#deeplearning #computervision #embeddedboard #iot #ai

Offline Flavio58

Re:An assembler/disassembler for the QPU processors on the Raspberry Pi
« Risposta #3 il: Marzo 14, 2018, 03:31:25 am »
Codice: [Seleziona]
uint64_t assembleSEMA(context& ctx, string word)
{

    uint64_t ins = (uint64_t)0x74 << 57;

    string token_str;
    token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
    if (tok != WORD) {
        cerr << "semaphore instruction expecting down/up or acquire/release" << endl;
        return -1;
    }

    uint8_t sa = 0;             // up
    if (token_str == "down" || token_str == "acquire")
        sa = 1;

    tok = nextToken(ctx.stream, token_str, &ctx.stream);
    if (tok != COMMA)   return -1;
    tok = nextToken(ctx.stream, token_str, &ctx.stream);
    uint32_t imm = parseSmallImmediate(token_str);
    if (imm < 0) {
        cerr << "semaphore out of range" << endl;
        return -1;
    }
    // cond_add, cond_mul = NEVER, ws, sf = false
    ins |= (uint64_t)39 << 38;          // waddr_add
    ins |= (uint64_t)39 << 32;          // waddr_mul
    ins |= sa << 4;
    ins |= (uint8_t)imm;

    cout << "Assembling SEMAPHORE instruction (" << imm << "), " << (int)sa << endl;

    return ins;
}


int main(int argc, char **argv)
{
    char *outfname = 0;
    int c;

    char* writeCPP = NULL;
    while ((c = getopt(argc, argv, "o:c:")) != -1) {
        switch (c) {
            case 'o':
                outfname = optarg;
                break;
            case 'c':
                writeCPP = optarg;
                break;
        }
    }

    if (!outfname) {
        cerr << "Usage: " << argv[0] << " -o <output>" << endl;
        return -1;
    }

    char line[128];
    string token_string;

    struct context ctx;
    ctx.pc = 0;

    vector<uint64_t> instructions;

    while (cin.getline(line, 128))
    {
        const char *p = line;
        ctx.stream = p;
        token_t tok = nextToken(ctx.stream, token_string, &ctx.stream);

        if (tok == END)
            continue;

        if (tok == WORD)
        {
            // read-ahead to see if the next token is a colon in which case
            // this is a label.
            const char *discard = NULL;
            string nextTokenStr;
            if (nextToken(ctx.stream, nextTokenStr, &discard) == COLON) {
                ctx.labels[token_string] = ctx.pc;
                continue;
            }

            enum { INVALID, ALU, BRANCH, LDI, SEMA } opType = INVALID;
            if (addOpCode(token_string) != 0xFF || mulOpCode(token_string) != 0xFF)
                opType = ALU;
            if (token_string == "ldi") opType = LDI;
            if (token_string == "bra" || token_string == "brr") opType = BRANCH;
            if (token_string == "sema") opType = SEMA;

            if (opType == INVALID) {
                cout << "Unable to assemble line; invalid opcode: " << line << endl;
                return -1;
            }

            uint64_t ins = 0;
            switch (opType) {
                case ALU: ins = assembleALU(ctx, token_string); break;
                case BRANCH: ins = assembleBRANCH(ctx, token_string); break;
                case LDI: ins = assembleLDI(ctx, token_string); break;
                case SEMA: ins = assembleSEMA(ctx, token_string); break;
            }

            if (ins == (uint64_t)-1) {
                cerr << "Error on line: " << line << endl;
                return -1;
            }

            instructions.push_back(ins);
            ctx.pc += 8;            // bytes;
        }
    }

    // Process relocations
    ctx.labels["ZERO"] = 0x0;
    for (int i=0; i < ctx.relocations.size(); i++)
    {
        relocation& r = ctx.relocations[i];
        if (ctx.labels.count(r.label) < 1)
        {
            cerr << "undefined label: " << r.label << endl;
            return -1;
        }
        int offset = ctx.labels[r.label] - (r.pc + 4*8);
        if (r.label == "ZERO")
            offset = 0x0;
        cout << "Processing relocation at " << r.pc << " : " << r.label
                                            << " : " << offset << endl;
        uint64_t ins = instructions[r.pc / 8];
        ins &= (uint64_t)0xFFFFFFFF << 32;   // zero bottom 32-bits for new value
        ins |= (uint32_t)offset;
        instructions[r.pc / 8] = ins;
    }

    FILE *outfile = fopen(outfname, "w");
    if (!outfile)
    {
        cerr << "Unable to open output file " << string(outfname) << endl;
        return -1;
    }

    if (writeCPP) {
      fprintf(outfile, "#include <stdint.h>\n");
      fprintf(outfile, "#include <stddef.h>\n\n");
      fprintf(outfile, "uint32_t %s[%d] = {\n", writeCPP, (instructions.size() * 2));
      uint32_t* instructionsData = (uint32_t*)(&instructions[0]);
      for (int i=0; i < instructions.size(); i++) {
        fprintf(outfile, "  0x%08x, 0x%08x,\n", instructionsData[(i * 2) + 0], instructionsData[(i * 2) + 1]);
      }
      fprintf(outfile, "};\n\n");
      fprintf(outfile, "size_t %sByteCount = %d;\n", writeCPP, (instructions.size() * 8));
    } else {
      for (int i=0; i < instructions.size(); i++)
          fwrite(&instructions[i], sizeof(uint64_t), 1, outfile);
    }

    fclose(outfile);
    cout << "Done.  Num instructions: " << instructions.size() << ", "
         << instructions.size() * 8 << " bytes." << endl;
}


Modulo DMA

Codice: [Seleziona]
# MUTEX_ACQUIRE / MUTEX_RELEASE
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# MUTEX_ACQUIRE reads ra51 (discarding the result into ra39);
# MUTEX_RELEASE writes ra51. Both pair the operation with a nop on the other pipe.
define(`MUTEX_ACQUIRE',     `or ra39, ra51, rb39;           nop')
define(`MUTEX_RELEASE',     `or ra51, ra39, ra39;           nop')

# Hardwired IO registers
# Symbolic names for fixed register addresses, so that code can say e.g.
# rVpmReadFifo instead of ra48.
define(`rVpmWriteFifo', `rb48')
define(`rVpmReadFifo', `ra48')
define(`raReadUniform', `ra32')
define(`rbReadUniform', `rb32')
define(`raZero', `ra39')
define(`rbZero', `rb39')

# Macro argument constants
# Readable names for the numeric arguments of the VPM setup macros:
#  MODEW_*  - values for a MODEW argument
#  SIZE_*   - values for a SIZE argument
#  IS_*/NOT_* - 1/0 values for the HORIZ, VERT and LANED flag arguments
define(`MODEW_32_BIT', 0)
define(`MODEW_16_BIT_OFFSET_0', 2)
define(`MODEW_16_BIT_OFFSET_1', 3)
define(`MODEW_8_BIT_OFFSET_0', 4)
define(`MODEW_8_BIT_OFFSET_1', 5)
define(`MODEW_8_BIT_OFFSET_2', 6)
define(`MODEW_8_BIT_OFFSET_3', 7)
define(`SIZE_8_BIT', 0)
define(`SIZE_16_BIT', 1)
define(`SIZE_32_BIT', 2)
define(`IS_HORIZ', 1)
define(`NOT_HORIZ', 0)
define(`IS_VERT', 1)
define(`NOT_VERT', 0)
define(`IS_LANED', 1)
define(`NOT_LANED', 0)

# VPM_BLOCK_WRITE_SETUP
# ~~~~~~~~~~~~~~~~~~~~~
# Sets up things so writes go into the small VPM data cache.
# Once the data's been written (by outputting repeatedly to the VPM_WRITE_FIFO
# register rb48), you then call VPM_DMA_WRITE_SETUP to configure the main
# memory destination and writing pattern.
# NOTE(review): no macro named VPM_DMA_WRITE_SETUP is visible alongside these
# macros; VPM_DMA_STORE_SETUP is presumably the one meant - confirm.
# Arguments:
#  STRIDE: 0-64 - How much to increment the ADDR after each write.
#  HORIZ: 0 or 1 - Whether the layout is horizontal (1) or vertical (0).
#  LANED: 0 or 1 - Whether the layout is laned (1) or packed (0).
#  SIZE: 0, 1, 2 - The data unit size, 8-bit (0), 16-bit(1), or 32-bit (2).
#  ADDR: 0-255 - Packed address, meaning depends on exact unit size and mode.
# See http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf page 57
#
# The *_VALUE macro ORs the shifted arguments together (the ID field is fixed at
# 0); arguments are not range-checked or masked.
# VPM_BLOCK_WRITE_SETUP itself expands to an `ldi` of that value into rb49.
define(`VPM_BLOCK_WRITE_SETUP_ID_SHIFT', 30)
define(`VPM_BLOCK_WRITE_SETUP_STRIDE_SHIFT', 12)
define(`VPM_BLOCK_WRITE_SETUP_HORIZ_SHIFT', 11)
define(`VPM_BLOCK_WRITE_SETUP_LANED_SHIFT', 10)
define(`VPM_BLOCK_WRITE_SETUP_SIZE_SHIFT', 8)
define(`VPM_BLOCK_WRITE_SETUP_ADDR_SHIFT', 0)
define(`VPM_BLOCK_WRITE_SETUP_VALUE', `eval(
(0<<VPM_BLOCK_WRITE_SETUP_ID_SHIFT)|
($1<<VPM_BLOCK_WRITE_SETUP_STRIDE_SHIFT)|
($2<<VPM_BLOCK_WRITE_SETUP_HORIZ_SHIFT)|
($3<<VPM_BLOCK_WRITE_SETUP_LANED_SHIFT)|
($4<<VPM_BLOCK_WRITE_SETUP_SIZE_SHIFT)|
($5<<VPM_BLOCK_WRITE_SETUP_ADDR_SHIFT))')
define(`VPM_BLOCK_WRITE_SETUP', `ldi rb49, VPM_BLOCK_WRITE_SETUP_VALUE($1, $2, $3, $4, $5)')
Consulente in Informatica dal 1984

Software automazione, progettazione elettronica, computer vision, intelligenza artificiale, IoT, sicurezza informatica, tecnologie di sicurezza militare, SIGINT. 

Facebook:https://www.facebook.com/flaviobernardotti58
Twitter : https://www.twitter.com/Flavio58

Cell:  +39 366 3416556

f.bernardotti@deeplearningitalia.eu

#deeplearning #computervision #embeddedboard #iot #ai

Offline Flavio58

Re:An assembler/disassembler for the QPU processors on the Raspberry Pi
« Risposta #2 il: Marzo 14, 2018, 03:28:26 am »
Codice: [Seleziona]

uint8_t setALUMux(const QPUreg& reg)
{
    switch (reg.file) {
        case QPUreg::A: return 0x6;
        case QPUreg::B: return 0x7;
        case QPUreg::ACCUM:
            if (reg.num > 6 || reg.num < 0) {
                cerr << "Invalid accumulator register; out of range" << endl;
                exit(0);
            }
            return reg.num;
        case QPUreg::SMALL: return 0x7;
    }
}


token_t nextToken(const char *stream, string& out, const char **ptr)
{
    char buffer[128];
    int i = 0;

    *ptr = stream;
    if (!stream || !*stream)
        return END;

    while (*stream == ' ' || *stream == '\t')
        stream++;

    if (isdigit(*stream))
    {
        // read until we don't find a hex digit, x (for hex) or .
        while (isxdigit(*stream) || isdigit(*stream) || *stream == '.' || *stream == 'x') {
            buffer[i++] = *stream++;
            if (*stream == 0 || i > sizeof(buffer) - 1)
                break;
        }
        buffer[i++] = '\0';
        out = buffer;
        *ptr = stream;

        return WORD;
    }


    if (*stream == '.') { *ptr = stream+1; return DOT; }
    if (*stream == ',') { *ptr = stream+1; return COMMA; }
    if (*stream == ';') { *ptr = stream+1; return SEMI; }
    if (*stream == '#') { *ptr = stream+1; return END; }
    if (*stream == ':') { *ptr = stream+1; return COLON; }

    while (*stream != '.' && *stream != ',' && *stream != ';'
                          && *stream != ' ' && *stream != '\t'
                          && *stream != ':')
    {
        buffer[i++] = *stream++;
        if (*stream == 0 || i > sizeof(buffer)-1)
            break;
    }

    buffer[i++] = '\0';
    out = buffer;
    *ptr = stream;
    return WORD;
}


bool aluHelper(const char *stream, QPUreg& dest, QPUreg& r1, QPUreg& r2, uint8_t& sig, uint32_t& unpack, uint32_t& pm, uint32_t& pack, const char **ptr)
{
    string token_str;
    token_t tok = nextToken(stream, token_str, &stream);

    if (tok == DOT) {
        // conditional
        nextToken(stream, token_str, &stream);
        cout << "flag/conditional = " << token_str << endl;
        if (token_str == "ldtmu0") {
            sig = 10;
        } else if (token_str == "ldtmu1") {
            sig = 11;
        } else if (token_str == "tend") {
            sig = 3;
        } else if (parsePacking(token_str, &unpack, &pm, &pack)) {
          // Do nothing, the parse function has filled in the values
        } else {
          cout << "Conditional couldn't be understood: " << token_str << endl;
          return false;
        }
        tok = nextToken(stream, token_str, &stream);
    }

    // this is supposed to be the destination register
    if (tok != WORD) {
        cout << "Expecting word.  Got: " << token_str << endl;
        return false;
    }

    if (!parseRegister(token_str, dest)) {
      return false;
    }
    tok = nextToken(stream, token_str, &stream);
    if (tok != COMMA) return false;
    tok = nextToken(stream, token_str, &stream);
    if (!parseRegister(token_str, r1)) {
      return false;
    }

    tok = nextToken(stream, token_str, &stream);
    if (tok != COMMA) return false;
    tok = nextToken(stream, token_str, &stream);
    if (!parseRegister(token_str, r2)) {
        r2.file = QPUreg::SMALL;
        int32_t imm = parseSmallImmediate(token_str);
        if (imm < 0) {
          return false;
        }
        r2.num = imm;
    }

    /*
    cout << "dest: " << printRegister(dest) << ", r1: "
                     << printRegister(r1) << ", r2: "
                     << printRegister(r2) << endl;
                     */

    *ptr = stream;
    return true;
}


// Assemble one ALU instruction ("<addop> d, a, b ; <mulop> d, a, b").
//
//   ctx  - assembler state; ctx.stream must point just after the add-pipe
//          mnemonic and is advanced as operands are consumed.
//   word - the add-pipe mnemonic.
//
// Returns the 64-bit instruction word, or (uint64_t)-1 on any error (unknown
// opcode, unparsable operands, or a violated register-file constraint).
// Progress and errors are printed to stdout/stderr along the way.
uint64_t assembleALU(context& ctx, string word)
{
    string token_str;
    uint8_t add_op = addOpCode(word);
    if (add_op == 0xFF) {
        cout << "FATAL (assert).  Bad ADD opcode: " << word << endl;
        return -1;
    }

    uint32_t unpack = 0;
    uint32_t pm = 0;
    uint32_t pack = 0;

    QPUreg addDest, addR1, addR2;
    QPUreg mulDest, mulR1, mulR2;

    uint8_t sig = 0x1;          // no-signal (TODO: plumb signals through)
    if (!aluHelper(ctx.stream, addDest, addR1, addR2, sig, unpack, pm, pack, &ctx.stream))
        return -1;

    // NOTE(review): the token kind read here is never checked, so a missing
    // ';' goes unnoticed.
    token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
    // this should be a semi-colon
    tok = nextToken(ctx.stream, token_str, &ctx.stream);
    uint8_t mul_op = mulOpCode(token_str);
    if (mul_op == 0xFF) {
        cout << "FATAL (assert).  Bad MUL opcode: " << token_str << endl;
        return -1;
    }

    bool skipParseMul(false);
    if (mul_op == 0) {
        // nop.  If the next token is a semi or END, we'll generate
        // the registers for them
        // The look-ahead writes its position to `discard`, so ctx.stream is
        // not advanced by it.
        const char *discard;
        tok = nextToken(ctx.stream, token_str, &discard);
        if (tok == END || tok == SEMI) {
            // Destination 39 in the file opposite to the add destination;
            // sources are copied from the add half.
            mulDest.num = 39;
            mulDest.file = (addDest.file == QPUreg::A) ? QPUreg::B : QPUreg::A;
            mulR1 = addR1;
            mulR2 = addR2;
            skipParseMul = true;
        }
    }

    if (!skipParseMul) {
        // Signal and packing modifiers given on the mul half are parsed but
        // thrown away.
        uint8_t junk;
        uint32_t junk32;
        if (!aluHelper(ctx.stream, mulDest, mulR1, mulR2, junk, junk32, junk32, junk32, &ctx.stream))
            return -1;
    }

    uint64_t ins = 0x0;
    uint8_t cond_add = 0x1;
    uint8_t cond_mul = 0x1;
    uint8_t sf = 0x1;
    if (add_op == 0)
        sf = 0x0;           // no set flags on nop

    // TODO: constraints.  We can only read from file A and file B once (dual-port)

    uint8_t ws = 0x0;
    // If the add pipe specifies file b for output, ws = 1
    if ((addDest.file == QPUreg::B) ||
        ((addDest.file == QPUreg::ACCUM) && (mulDest.file == QPUreg::A))) {
        ws = 0x1;
    }
    // if ws == 1, mul pipe must specify file a or accumulator for output
    if (ws == 0x1 && (mulDest.file != QPUreg::A) && (mulDest.file != QPUreg::ACCUM)) {
        cout << "constraint check failed.  mul pipe must specify register file A when write-swap set, but found " << printRegister(mulDest) << endl;
        return -1;
    }
    // if ws == 0, mul pipe must specify file b or accumulator for output
    if (ws == 0x0 && (mulDest.file != QPUreg::B) && (mulDest.file != QPUreg::ACCUM)) {
        cout << "constraint check failed.  mul pipe must specify register file B when write-swap clear, but found " << printRegister(mulDest) << endl;
        return -1;
    }

    // TODO: handle the accumulators and the small immediate
    uint8_t read_a = 0x0;
    uint8_t read_b = 0x0;
    bool isReadASet = false;
    bool isReadBSet = false;
    // Collect the single file-A and single file-B read address used by all
    // four sources; two different registers from the same file is an error.
    QPUreg candidates[] = {addR1, addR2, mulR1, mulR2};
    for (int index = 0; index < (sizeof(candidates)/sizeof(candidates[0])); index += 1) {
      QPUreg reg = candidates[index];
      if (reg.file == QPUreg::A) {
        if (isReadASet && (read_a != reg.num)) {
          fprintf(stderr, "Error: Can't set multiple different general registers as sources in a single ALU instruction\n");
          return -1;
        }
        isReadASet = true;
        read_a = reg.num;
      }
      if (reg.file == QPUreg::B) {
        if (isReadBSet && (read_b != reg.num)) {
          fprintf(stderr, "Error: Can't set multiple different general registers as sources in a single ALU instruction\n");
          return -1;
        }
        isReadBSet = true;
        read_b = reg.num;
      }
    }

    // checks:
    //   read_a not set and one of the muxes specifies file A ...
    //   same for read_b
    //   read_b set and there is a small immediate value

    // we could have immediates in the first register slot but not sure it makes sense
    // As above, we should check that read_b is not already set
    // A small immediate travels in the read_b field and switches the signal
    // to 13.
    if (addR2.file == QPUreg::SMALL)    {
      if (isReadBSet && (read_b != addR2.num)) {
        fprintf(stderr, "Error: Can't set an immediate and general registers as sources in a single ALU instruction\n");
        return -1;
      }
      isReadBSet = true;
      read_b = addR2.num;
      sig = 13;
    }
    if (mulR2.file == QPUreg::SMALL)    {
      if (isReadBSet && (read_b != mulR2.num)) {
        fprintf(stderr, "Error: Can't set an immediate and general registers as sources in a single ALU instruction\n");
        return -1;
      }
      isReadBSet = true;
      read_b = mulR2.num;
      sig = 13;
    }

    // The accumulators are mapped to r32-35 when writing to them as destinations
    if (addDest.file == QPUreg::ACCUM) {
      addDest.num += 32;
    }
    if (mulDest.file == QPUreg::ACCUM) {
      mulDest.num += 32;
    }

    // Clamp every field to its bit width before packing.
    uint8_t add_a = setALUMux(addR1) & 0x7;
    uint8_t add_b = setALUMux(addR2) & 0x7;
    uint8_t mul_a = setALUMux(mulR1) & 0x7;
    uint8_t mul_b = setALUMux(mulR2) & 0x7;
    read_a &= 0x3f;
    read_b &= 0x3f;
    mul_op &= 0x7;
    add_op &= 0x1f;
    addDest.num &= 0x3f;
    mulDest.num &= 0x3f;
    cond_add &= 0x7;
    cond_mul &= 0x7;
    sf &= 0x1;
    ws &= 0x1;

//    printf("Assembling ALU instruction: %s, %d, %d\n", printRegister(addDest).c_str(), ws, sig);

    printf("ALU: %s %s, %s, %s; %s %s, %s, %s\n",
      printAddOpCode(add_op).c_str(),
      printRegister(addDest).c_str(),
      printRegister(addR1).c_str(),
      printRegister(addR2).c_str(),
      printMulOpCode(mul_op).c_str(),
      printRegister(mulDest).c_str(),
      printRegister(mulR1).c_str(),
      printRegister(mulR2).c_str()
    );

    // Bit positions, high to low: sig 60, unpack 57, pm 56, pack 52,
    // cond_add 49, cond_mul 46, sf 45, ws 44, add dest 38, mul dest 32,
    // mul_op 29, add_op 24, read_a 18, read_b 12, add_a 9, add_b 6,
    // mul_a 3, mul_b 0.
    ins = ((uint64_t)sig << 60) |
      ((uint64_t)unpack << 57) |
      ((uint64_t)pm << 56) |
      ((uint64_t)pack << 52) |
      ((uint64_t)cond_add << 49) |
      ((uint64_t)cond_mul << 46) |
      ((uint64_t)sf << 45) |
      ((uint64_t)ws << 44);
    ins |= ((uint64_t)addDest.num << 38) | ((uint64_t)mulDest.num << 32) | ((uint64_t)mul_op << 29) | ((uint64_t)add_op << 24);
    ins |= ((uint64_t)read_a << 18) | ((uint64_t)read_b << 12) | ((uint64_t)add_a << 9) | ((uint64_t)add_b << 6) | ((uint64_t)mul_a << 3) | mul_b;

    return ins;
}

Codice: [Seleziona]
uint64_t assembleLDI(context& ctx, string word)
{
    cout << "Assembling LDI instruction ... " << endl;

    string token_str;
    token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);

    if (tok == DOT) {
        // conditional ... conditionals should be on each register ?
        cout << "conditional ... ";
        // chew the conditional
        nextToken(ctx.stream, token_str, &ctx.stream);

        tok = nextToken(ctx.stream, token_str, &ctx.stream);
    }

    // this is supposed to be the register
    if (tok != WORD) return -1;

    QPUreg register1, register2;
    // check errors here
    if (!parseRegister(token_str, register1)) {
      return false;
    }
    tok = nextToken(ctx.stream, token_str, &ctx.stream);
    if (tok != COMMA) return -1;
    tok = nextToken(ctx.stream, token_str, &ctx.stream);

    // this can either be another register
    // (in which case we'll use both ALUs to set)
    // or an immediate value (in which case we'll use rX39)
    register2.num = 39;
    register2.file = (register1.file == QPUreg::A) ? QPUreg::B : QPUreg::A;
    if (isRegisterWord(token_str)) {
        if (!parseRegister(token_str, register2)) {
          return -1;
        }
        tok = nextToken(ctx.stream, token_str, &ctx.stream);
        // check that this is a comma ...
    }

    uint32_t immediateType = 0x00; // A full 32-bit immediate
    unsigned int immediate;
    string restOfLine(ctx.stream);
    restOfLine = (token_str + restOfLine);
    if (!parseFullImmediate(restOfLine, &immediate, &immediateType)) {
      cerr << "Immediate couldn't be parsed: " << restOfLine << endl;
      return -1;
    }

    cout << "r1: " << printRegister(register1) << ", r2: "
                   << printRegister(register2) << ", immed: 0x"
                   << hex << immediate << dec << endl;

    // The accumulators are mapped to r32-35 in this context
    if (register1.file == QPUreg::ACCUM) {
      register1.num += 32;
    }
    if (register2.file == QPUreg::ACCUM) {
      register2.num += 32;
    }

    uint32_t high = (uint32_t)0xE << 28;
    high |= immediateType << 24;
    high |= (uint32_t)0x1 << 17;      // cond_add
    high |= (uint32_t)0x1 << 14;      // cond_mul
    high |= (uint32_t)0x0 << 13;      // sf
    high |= (uint32_t)0x0 << 12;      // ws
    uint8_t addreg = (register1.file != QPUreg::B) ? register1.num : register2.num;
    uint8_t mulreg = (register1.file == QPUreg::B) ? register1.num : register2.num;
    high |= (uint32_t)addreg << 6;
    high |= mulreg;
    uint64_t ins = ((uint64_t)high << 32) | immediate;

    return ins;
}

uint64_t assembleBRANCH(context& ctx, string word)
{
    cout << "Assembing BRANCH instruction" << endl;

    QPUreg dest;
    string token_str;
    token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);

    // relative or absolute branch?
    uint8_t relative = 1;
    if (word == "bra")
        relative = 0;

    uint8_t branchCondition = 0xf;          // by default: always (unconditional branch)
    if (tok == DOT) {
        // conditional
        nextToken(ctx.stream, token_str, &ctx.stream);
        branchCondition = parseBranchCond(token_str);
        tok = nextToken(ctx.stream, token_str, &ctx.stream);
    }

    // this is the destination register
    if (tok != WORD) {
        cerr << "branch expecting destination register." << endl;
        return -1;
    }
    if (!parseRegister(token_str, dest)) {
      return false;
    }
    tok = nextToken(ctx.stream, token_str, &ctx.stream);
    if (tok != COMMA) return false;
    tok = nextToken(ctx.stream, token_str, &ctx.stream);
    if (tok != WORD) {
        cerr << "branch expecting label/target" << endl;
        return -1;
    }

    // look it up in the labels map
    int target = 0xFFFFFFFF;
    if (ctx.labels.count(token_str) < 1) {
        relocation r;
        r.label = token_str;
        r.pc = ctx.pc;
        ctx.relocations.push_back(r);
    } else
        target = ctx.labels[token_str];
    int offset = target - (ctx.pc+4*8);

    uint8_t raddr_a = 0;           // raddr_a is only 5-bits?
    uint8_t use_reg = 0;
    // if there's a third argument, it is a register offset
    const char *discard;
    tok = nextToken(ctx.stream, token_str, &discard);
    if (tok == COMMA) {
        QPUreg offsetReg;
        // chew the comma we just read
        ctx.stream = discard;
        tok = nextToken(ctx.stream, token_str, &ctx.stream);
        if (!parseRegister(token_str, offsetReg)) {
          return -1;
        }
        if (offsetReg.file != QPUreg::A) {
            cerr << "branch target offset register must be file A" << endl;
            return -1;
        }
        if (offsetReg.num > 31) {
            cerr << "branch target offset register must be < 32" << endl;
            return -1;
        }
        raddr_a = offsetReg.num;
        use_reg = 1;
    }

    uint8_t waddr_add = 39;         // link address appears at ALU outputs
    uint8_t waddr_mul = 39;
    if (dest.file == QPUreg::A) waddr_add = dest.num;
    if (dest.file == QPUreg::B) waddr_mul = dest.num;

    // TODO: generate absolute branches too

    uint64_t ins = (uint64_t)0xF << 60;
    ins |= (uint64_t)branchCondition << 52;
    ins |= (uint64_t)relative << 51;
    ins |= (uint64_t)use_reg << 50;
    ins |= (uint64_t)raddr_a << 45;
    ins |= (uint64_t)0x0 << 44;                       // write-swap
    ins |= (uint64_t)waddr_add << 38;
    ins |= (uint64_t)waddr_mul << 32;
    ins |= (uint32_t)offset;

    return ins;
}

Consulente in Informatica dal 1984

Software automazione, progettazione elettronica, computer vision, intelligenza artificiale, IoT, sicurezza informatica, tecnologie di sicurezza militare, SIGINT. 

Facebook:https://www.facebook.com/flaviobernardotti58
Twitter : https://www.twitter.com/Flavio58

Cell:  +39 366 3416556

f.bernardotti@deeplearningitalia.eu

#deeplearning #computervision #embeddedboard #iot #ai

Offline Flavio58

An assembler/disassembler for the QPU processors on the Raspberry Pi
« Risposta #1 il: Marzo 14, 2018, 03:26:54 am »
QPU assembly per raspberry

Codice: [Seleziona]
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h> // for getopt()

#include <algorithm>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>

using namespace std;

// Token kinds produced by nextToken().
enum token_t {
    END=-1,     // end of input, or a '#' comment marker
    WORD,       // number, or any run of characters up to the next delimiter
    DOT,        // '.'
    COMMA,      // ','
    SEMI,       // ';'
    COLON,      // ':'
};

// One instruction operand: the kind of storage it refers to plus an index.
struct QPUreg {
    // A / B: register files (printed as raN / rbN); ACCUM: accumulator
    // (printed as rN); SMALL: small-immediate operand.
    enum { A, B, ACCUM, SMALL } file;
    // Register index, or for SMALL the encoded immediate value.
    int num;
};

// A branch whose target label was not yet known when it was assembled.
struct relocation {
    string label;   // name of the label the branch refers to
    int pc;         // value of context::pc when the branch was assembled
};

// Assembler state threaded through the assemble* functions.
struct context {
    const char *stream;             // unread remainder of the text being tokenized
    map<string, int> labels;        // label name -> value used as a branch target
    int pc;                         // position of the instruction being assembled (presumably a byte offset - TODO confirm)
    vector<relocation> relocations; // branches waiting for a label to be defined
};


// Add-pipe mnemonics; the array index is the value addOpCode() returns for
// the mnemonic. "XXX" fills the indices that have no mnemonic (presumably
// unassigned opcodes - TODO confirm against the hardware documentation).
static string addOps[] = {
    "nop", "fadd", "fsub", "fmin", "fmax", "fminabs", "fmaxabs",
    "ftoi", "itof", "XXX", "XXX", "XXX", "add", "sub", "shr",
    "asr", "ror", "shl", "min", "max", "and", "or", "xor", "not",
    "clz", "XXX", "XXX", "XXX", "XXX", "XXX", "v8adds", "v8subs" };

// Mul-pipe mnemonics; the array index is the value mulOpCode() returns.
static string mulOps[] = {
    "nop", "fmul", "mul24", "v8muld", "v8min", "v8max", "v8adds",
    "v8subs" };

static uint8_t addOpCode(const string& word)
{
    for (int i=0; i < 32; i++) {
        if (word == addOps[i])
            return i;
    }

    return 0xFF;
}

string printAddOpCode(uint8_t opcode) {
  assert((opcode >= 0) && (opcode < 32));
  return addOps[opcode];
}

static uint8_t mulOpCode(const string& word)
{
    for (int i=0; i < 8; i++) {
        if (word == mulOps[i])
            return i;
    }

    return 0xFF;
}

string printMulOpCode(uint8_t opcode) {
  assert((opcode >= 0) && (opcode < 8));
  return mulOps[opcode];
}

// True when `word` starts with 'r', i.e. it looks like a register name.
bool isRegisterWord(const string& word)
{
    const char first = word[0];
    return first == 'r';
}

string printRegister(const QPUreg& reg)
{
    char buffer[32];
    if (reg.file == QPUreg::A || reg.file == QPUreg::B) {
        snprintf(buffer, 32, "r%c%d", (reg.file == QPUreg::A) ? 'a' : 'b',
                                      reg.num);
    }
    else if (reg.file == QPUreg::ACCUM) {
        snprintf(buffer, 32, "r%d", reg.num);
    }
    else {
        snprintf(buffer, 32, ".0x%x.", reg.num);
    }

    return buffer;
}

// Try to read the whole of `possibleNumber` as an integer in `base`.
//
//   possibleNumber - NUL-terminated text to convert.
//   base           - numeric base handed to strtol().
//   outNumber      - receives the converted value (truncated to int).
//   outIsNumber    - receives true only when at least one digit was read,
//                    nothing follows the number, and strtol() did not report
//                    a range error.
void parsePossibleNumber(const char* possibleNumber, int base, int* outNumber, bool* outIsNumber) {
    char *endOfNumber = NULL;
    // strtol() only ever sets errno, it never clears it; reset it so an
    // ERANGE left over from an earlier call cannot reject a valid number.
    errno = 0;
    const long parsed = strtol(possibleNumber, &endOfNumber, base);
    *outNumber = (int)parsed;
    *outIsNumber = (endOfNumber != possibleNumber) &&
                   (*endOfNumber == '\0') &&
                   (errno != ERANGE);
}

bool parseRegister(const string& word, QPUreg& reg)
{
    if (word[0] != 'r')
        return false;

    int offset = 0;
    switch (word[1]) {
        case 'a': reg.file = QPUreg::A; offset = 2; break;
        case 'b': reg.file = QPUreg::B; offset = 2; break;
        default:
            reg.file = QPUreg::ACCUM;
            offset = 1;
    }

    const char* possibleNumber = (word.c_str() + offset);
    bool isNumber;
    int number;
    parsePossibleNumber(possibleNumber, 10, &number, &isNumber);
    if (!isNumber) {
      cerr << "Warning - couldn't interpret '" << word << "' as a register" << endl;
      return false;
    }
    reg.num = number;

    if ((reg.file == QPUreg::ACCUM) && (reg.num >= 6)) {
      fprintf(stderr, "Warning - accumulator out of range\n");
      return false;
    }

    return true;
}

bool parseFullImmediate(const string& str, uint32_t* outResult, uint32_t* outType)
{
    bool isNumber;
    if (str[0] == '[') {
      bool areAnyNegative = false;
      std:string cleanedString(str);
      cleanedString.erase(std::remove(cleanedString.begin(), cleanedString.end(), '['), cleanedString.end());
      cleanedString.erase(std::remove(cleanedString.begin(), cleanedString.end(), ']'), cleanedString.end());
      std::stringstream ss(cleanedString);
      std::string item;
      int itemCount = 0;
      int itemValues[16];
      while (std::getline(ss, item, ',')) {
        if (itemCount >= 16) {
          break;
        }
        bool isItemNumber;
        int itemValue;
        parsePossibleNumber(item.c_str(), 10, &itemValues[itemCount], &isItemNumber);
        if (!isItemNumber) {
          cerr << "Couldn't understand '" << item << "' as an entry in an immediate list" << endl;
          return false;
        }
        if (itemValues[itemCount] < 0) {
          areAnyNegative = true;
        }
        itemCount += 1;
      }

      if (itemCount < 16) {
          cerr << "Found too few items in the immediate array - expected 16 but had " << itemCount << endl;
          return false;
      }

      if (areAnyNegative) {
        *outType = 0x02;
      } else {
        *outType = 0x06;
      }

      uint32_t result = 0;
      for (int index = 0; index < 16; index += 1) {
        int value = itemValues[index];
        if (areAnyNegative) {
          if ((value < -1) || (value > 1)) {
            cerr << "Found an out-of-range signed value in the immediate array - expected -1, 0, or 1 but found " << value << endl;
            return false;
          }
        } else {
          if (value > 3) {
            cerr << "Found an out-of-range unsigned value in the immediate array - expected 0, 1, 2, or 3 but found " << value << endl;
            return false;
          }
        }
        uint32_t msb;
        uint32_t lsb;
        if (areAnyNegative) {
          msb = ((value & 0x80000000) >> 31);
          lsb = (value & 0x1);
        } else {
          msb = ((value & 0x2) >> 1);
          lsb = (value & 0x1);
        }
        result = (result | (lsb << (index + 0)));
        result = (result | (msb << (index + 16)));
      }

      *outResult = result;
      isNumber = true;
    } else {
      *outType = 0x00; // A full 32-bit immediate
      // if there is an 'x' we assume it's hex.
      if (str.find_first_of("x") != string::npos) {
          int signedResult;
          parsePossibleNumber(str.c_str(), 16, &signedResult, &isNumber);
          *outResult = signedResult;
      } else if (str.find_first_of(".f") != string::npos) {
          float f = strtof(str.c_str(), NULL);
          *outResult = *(uint32_t*)&f;
          isNumber = true;
      } else {
          int signedResult;
          parsePossibleNumber(str.c_str(), 10, &signedResult, &isNumber);
          *outResult = signedResult;
      }
    }
    return isNumber;
}

// Convert a small-immediate operand into its 6-bit encoding.
//
// Accepted forms:
//   "0x0".."0xf", "0".."15"  -> 0..15
//   "-1".."-16"              -> 31..16   (encodings 16..31 stand for -16..-1)
//   "<<0".."<<15"            -> 48..63   (vector rotate)
//
// Returns the encoded value, or -1 (after a message on stderr) when the
// value cannot be represented.
int32_t parseSmallImmediate(const string& str)
{
    int32_t result;
    if (str.find_first_of("x") != string::npos) {
        // Compare as unsigned long so a huge value cannot wrap into range
        const unsigned long value = strtoul(str.c_str(), NULL, 16);
        if (value >= 16) {
          cerr << "Immediate out of range: " << str << endl;
          result = -1;
        } else {
          result = (int32_t)value;
        }
    } else if (str.find_first_of("<<") != string::npos) {
        // The amount is read from offset 2, so the text has to start with
        // "<<"; a lone "<" would otherwise read past the end of the string.
        if (str.compare(0, 2, "<<") != 0) {
          cerr << "Rotation immediate malformed: " << str << endl;
          result = -1;
        } else {
          const unsigned long shift = strtoul(str.c_str() + 2, NULL, 10);
          if (shift > 15) {
            // 48 + shift has to fit the 6-bit field
            cerr << "Rotation immediate out of range: " << str << endl;
            result = -1;
          } else {
            result = (int32_t)(48 + shift);
          }
        }
    } else if (str.find_first_of("-") != string::npos) {
        const unsigned long value = strtoul(str.c_str() + 1, NULL, 10);
        if ((value < 1) || (value > 16)) {
          cerr << "Negative immediate out of range: " << str << endl;
          result = -1;
        } else {
          // -16..-1 occupy encodings 16..31; 32 and above are the float
          // constants, so the magnitude is subtracted, not added.
          result = (int32_t)(32 - value);
        }
    } else {
        const unsigned long value = strtoul(str.c_str(), NULL, 10);
        if (value >= 16) {
          cerr << "Immediate out of range: " << str << endl;
          result = -1;
        } else {
          result = (int32_t)value;
        }
    }
    return result;
}

// Translate a branch-condition suffix into its 4-bit condition code.
//
// The first letter names the flag (z, n, c) and the second the test:
//   f = all flags set ("full"), e = all flags clear ("empty"),
//   s = any flag set,           c = any flag clear.
// "*" means always.
//
// An unknown suffix is reported on stderr and terminates the process with a
// failure status.
uint8_t parseBranchCond(const string& str)
{
    static const struct { const char *name; uint8_t code; } conditions[] = {
        { "zf", 0x0 }, { "ze", 0x1 }, { "zs", 0x2 }, { "zc", 0x3 },
        { "nf", 0x4 }, { "ne", 0x5 }, { "ns", 0x6 }, { "nc", 0x7 },
        { "cf", 0x8 }, { "ce", 0x9 }, { "cs", 0xa }, { "cc", 0xb },
        { "*",  0xf },
    };

    for (size_t i = 0; i < sizeof(conditions) / sizeof(conditions[0]); ++i) {
        if (str == conditions[i].name)
            return conditions[i].code;
    }

    // Exit with a failure status so callers and build scripts can tell the
    // assembly did not succeed.
    cerr << "Invalid branch condition: " << str << endl;
    exit(EXIT_FAILURE);
}

// Translate a pack/unpack modifier name into its mode number.
//
//   str       - modifier text, e.g. "unpack16a" or "pack8bclamp".
//   outUnpack - receives the unpack mode (0..7) for "unpack*" names, else 0.
//   outPM     - always set to 0.
//   outPack   - receives the pack mode (0..15) for "pack*" names, else 0.
//
// Returns true for a known name. For an unknown name all three outputs are
// zeroed, a message is written to stderr and false is returned.
bool parsePacking(const string& str, uint32_t* outUnpack, uint32_t* outPM, uint32_t* outPack)
{
    // The position in each table is the mode number.
    static const char* const unpackNames[] = {
        "unpack32", "unpack16a", "unpack16b", "unpack8ddupe",
        "unpack8a", "unpack8b", "unpack8c", "unpack8d",
    };
    static const char* const packNames[] = {
        "pack32", "pack16a", "pack16b", "pack8ddupe",
        "pack8a", "pack8b", "pack8c", "pack8d",
        "pack32clamp", "pack16aclamp", "pack16bclamp", "pack8ddupeclamp",
        "pack8aclamp", "pack8bclamp", "pack8cclamp", "pack8dclamp",
    };
    const uint32_t unpackCount = sizeof(unpackNames) / sizeof(unpackNames[0]);
    const uint32_t packCount = sizeof(packNames) / sizeof(packNames[0]);

    *outUnpack = 0;
    *outPM = 0;
    *outPack = 0;

    for (uint32_t mode = 0; mode < unpackCount; ++mode) {
        if (str == unpackNames[mode]) {
            *outUnpack = mode;
            return true;
        }
    }

    for (uint32_t mode = 0; mode < packCount; ++mode) {
        if (str == packNames[mode]) {
            *outPack = mode;
            return true;
        }
    }

    cerr << "Unknown pack condition: " << str << endl;
    return false;
}


Consulente in Informatica dal 1984

Software automazione, progettazione elettronica, computer vision, intelligenza artificiale, IoT, sicurezza informatica, tecnologie di sicurezza militare, SIGINT. 

Facebook:https://www.facebook.com/flaviobernardotti58
Twitter : https://www.twitter.com/Flavio58

Cell:  +39 366 3416556

f.bernardotti@deeplearningitalia.eu

#deeplearning #computervision #embeddedboard #iot #ai

 

Related Topics

  Oggetto / Aperto da Risposte Ultimo post
0 Risposte
203 Visite
Ultimo post Marzo 14, 2018, 02:29:49 pm
da Ruggero Respigo
0 Risposte
90 Visite
Ultimo post Marzo 21, 2018, 02:04:40 pm
da Ruggero Respigo
0 Risposte
97 Visite
Ultimo post Marzo 27, 2018, 12:31:59 am
da Marco1971
0 Risposte
63 Visite
Ultimo post Marzo 30, 2018, 08:04:40 pm
da Marco1971
0 Risposte
71 Visite
Ultimo post Maggio 15, 2018, 07:15:06 pm
da Flavio58

Sitemap 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326