URL
https://opencores.org/ocsvn/forwardcom/forwardcom/trunk
Subversion Repositories forwardcom
Compare Revisions
- This comparison shows the changes necessary to convert path
/forwardcom/bintools
- from Rev 55 to Rev 56
- ↔ Reverse comparison
Rev 55 → Rev 56
/emulator5.cpp
0,0 → 1,1972
/**************************** emulator5.cpp ******************************** |
* Author: Agner Fog |
* date created: 2018-02-18 |
* Last modified: 2021-06-30 |
* Version: 1.11 |
* Project: Binary tools for ForwardCom instruction set |
* Description: |
* Emulator: Execution functions for single format instructions, continued |
* |
* Copyright 2018-2021 GNU General Public License http://www.gnu.org/licenses |
*****************************************************************************/ |
|
#include "stdafx.h" |
|
// Format 1.3 B. Two vector registers and a broadcast 8-bit immediate operand. |
|
static uint64_t gp2vec (CThread * t) { |
// Move value of general purpose register RS to scalar in vector register RD. |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint64_t result = t->registers[rs]; // read general purpose register |
t->vectorLength[rd] = dataSizeTable[t->operandType]; // set length of destination |
t->vect = 4; // stop vector loop |
return result; |
} |
|
static uint64_t vec2gp (CThread * t) { |
// Move value of first element of vector register RS to general purpose register RD. |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint8_t size = dataSizeTable[t->operandType]; |
if (size > t->vectorLength[rs]) size = t->vectorLength[rs]; // limit size to vector length |
uint64_t result = *(uint64_t*)(t->vectors.buf() + t->MaxVectorLength*rs); // read directly from vector |
if (size < 8) result &= ((uint64_t)1 << size*8) - 1; // mask off to size |
t->registers[rd] = result; // write to general purpose register |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
t->returnType &= ~ 0x100; // debug return type not vector |
return result; |
} |
|
static uint64_t make_sequence (CThread * t) { |
// Make a vector with RS sequential numbers. First value is IM1. |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
int32_t val = int8_t(t->pInstr->b[0]); // immediate operand, sign extended integer |
uint64_t num = t->registers[rs]; // number of elements |
uint32_t elementSize = dataSizeTable[t->operandType]; |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
SNum temp; |
// limit length |
uint64_t length = num << dsizelog; |
if (length > t->MaxVectorLength) { |
length = t->MaxVectorLength; num = length >> dsizelog; |
} |
// set length of rd |
t->vectorLength[rd] = (uint32_t)length; |
// loop through destination vector |
for (uint32_t pos = 0; pos < length; pos += elementSize) { |
switch (t->operandType) { |
case 0: case 1: case 2: case 3: |
t->writeVectorElement(rd, (uint64_t)(int64_t)val, pos); break; |
case 4: |
t->writeVectorElement(rd, (uint64_t)(int64_t)val, pos); // int128 |
t->writeVectorElement(rd, (uint64_t)((int64_t)val >> 63), pos+8); break; |
case 5: // float |
temp.f = float(val); // convert to float |
t->writeVectorElement(rd, temp.q, pos); |
break; |
case 6: // double |
temp.d = double(val); // convert to double |
t->writeVectorElement(rd, temp.q, pos); |
break; |
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
val++; // increment value |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t compress(CThread * t) { |
// Compress vector RT of length RS to a vector of half the length and half the element size. |
// Double precision -> single precision, 64-bit integer -> 32-bit integer, etc. |
|
// operands: |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint8_t IM1 = t->parm[4].b; |
if (IM1 & 0xC0) t->interrupt(INT_WRONG_PARAMETERS); |
//uint32_t initLength = t->vectorLength[rt]; |
uint32_t oldLength = t->vectorLength[rs]; // (uint32_t)t->registers[rs]; |
uint32_t newLength = oldLength / 2; |
uint32_t pos; // position in destination vector |
uint8_t overflowU = 0; // unsigned overflow in current element |
uint8_t overflowS = 0; // signed overflow in current element |
uint8_t overflowU2 = 0; // unsigned overflow in any element |
uint8_t overflowS2 = 0; // signed overflow in any element |
uint8_t overflowF2 = 0; // floating point overflow in any element |
SNum mask = t->parm[3]; // options mask |
int8_t * source = t->vectors.buf() + (uint64_t)rs * t->MaxVectorLength; // address of RS data |
int8_t * destination = t->vectors.buf() + (uint64_t)rd * t->MaxVectorLength; // address of RD data |
|
uint8_t roundingMode = (IM1 >> 3) & 7; // floating point rounding mode |
if (roundingMode == 0) roundingMode = ((t->parm[3].i >> MSKI_ROUNDING) & 7) | 4; |
uint8_t exceptionControl = IM1 & 7; // floating point exception enable bits: |
// 1: overflow, 2: underflow, 4: inexact |
if (exceptionControl == 0) { // floating point exception control |
exceptionControl = mask.i >> (MSKI_EXCEPTIONS + 1) & 7; // exceptions from NUMCONTR |
} |
else if (exceptionControl == 7) { |
exceptionControl = 0; // 7 means none (5 means all) |
} |
|
switch (t->operandType) { // source operand type |
case 0: // int8 -> int4 |
for (pos = 0; pos < newLength; pos += 1) { |
union { |
uint16_t s; |
uint8_t b[2]; |
} u; |
u.s = *(uint16_t*)(source + 2*pos); // two values to convert to one byte |
for (int i = 0; i < 2; i++) { // loop for two bytes to convert |
uint8_t val = u.b[i]; |
overflowU = val > 0x0F; // unsigned overflow |
overflowS = val - 0xF8 > 0x0F; // signed overflow |
overflowU2 |= overflowU; overflowS2 |= overflowS; |
switch (IM1 & 7) { |
case 0: default: // wrap around |
break; |
case 4: // signed integer overflow gives zero |
if (overflowS) val = 0; |
break; |
case 5: // signed integer overflow gives signed saturation |
if (overflowS) val = 0x7 + (val >> 7); |
break; |
case 6: // unsigned integer overflow gives zero |
if (overflowU) val = 0; |
break; |
case 7: // unsigned integer overflow gives unsigned saturation |
if (overflowU) val = 0xF; |
break; |
} |
u.b[i] = val; |
} |
uint8_t val2 = (u.b[0] & 0xF) | u.b[1] << 4; |
*(uint8_t*)(destination + pos) = val2; // store two values |
} |
t->returnType = 0x110; |
break; |
case 1: // int16 -> int8 |
for (pos = 0; pos < newLength; pos += 1) { |
uint16_t val = *(uint16_t*)(source + 2*pos); // value to convert |
overflowU = val > 0xFF; // unsigned overflow |
overflowS = val - 0xFF80 > 0xFF; // signed overflow |
overflowU2 |= overflowU; overflowS2 |= overflowS; |
switch (IM1 & 7) { |
case 0: default: // wrap around |
break; |
case 4: // signed integer overflow gives zero |
if (overflowS) val = 0; |
break; |
case 5: // signed integer overflow gives signed saturation |
if (overflowS) val = 0x7F + (val >> 15); |
break; |
case 6: // unsigned integer overflow gives zero |
if (overflowU) val = 0; |
break; |
case 7: // unsigned integer overflow gives unsigned saturation |
if (overflowU) val = 0xFF; |
break; |
} |
*(uint8_t*)(destination + pos) = (uint8_t)val; // store value |
} |
t->returnType = 0x110; |
break; |
case 2: // int32 -> int16 |
for (pos = 0; pos < newLength; pos += 2) { |
uint32_t val = *(uint32_t*)(source + 2*pos); // value to convert |
overflowU = val > 0xFFFF; // unsigned overflow |
overflowS = val - 0xFFFF8000 > 0xFFFF; // signed overflow |
switch (IM1 & 7) { |
case 0: default: // wrap around |
break; |
case 4: // signed integer overflow gives zero |
if (overflowS) val = 0; |
break; |
case 5: // signed integer overflow gives signed saturation |
if (overflowS) val = 0x7FFF + (val >> 31); |
break; |
case 6: // unsigned integer overflow gives zero |
if (overflowU) val = 0; |
break; |
case 7: // unsigned integer overflow gives unsigned saturation |
if (overflowU) val = 0xFFFF; |
break; |
} |
*(uint16_t*)(destination + pos) = (uint16_t)val; // store value |
} |
t->returnType = 0x111; |
break; |
case 3: // int64 -> int32 |
for (pos = 0; pos < newLength; pos += 4) { |
uint64_t val = *(uint64_t*)(source + 2*pos); // value to convert |
overflowU = val > 0xFFFFFFFFU; // unsigned overflow |
overflowS = val - 0xFFFFFFFF80000000 > 0xFFFFFFFFU; // signed overflow |
switch (IM1 & 7) { |
case 0: default: // wrap around |
break; |
case 4: // signed integer overflow gives zero |
if (overflowS) val = 0; |
break; |
case 5: // signed integer overflow gives signed saturation |
if (overflowS) val = 0x7FFFFFFF + (val >> 63); |
break; |
case 6: // unsigned integer overflow gives zero |
if (overflowU) val = 0; |
break; |
case 7: // unsigned integer overflow gives unsigned saturation |
if (overflowU) val = 0xFFFFFFFF; |
break; |
} |
*(uint32_t*)(destination + pos) = (uint32_t)val; // store value |
} |
t->returnType = 0x112; |
break; |
case 4: // int128 -> int64 |
for (pos = 0; pos < newLength; pos += 8) { |
uint64_t valLo = *(uint64_t*)(source + 2*pos); // value to convert, low part |
uint64_t valHi = *(uint64_t*)(source + 2*pos + 8); // value to convert, high part |
overflowU = valHi != 0; // unsigned overflow |
if ((int64_t)valLo < 0) overflowS = valHi+1 != 0; // signed overflow |
else overflowS = valHi != 0; |
overflowU2 |= overflowU; overflowS2 |= overflowS; |
switch (IM1 & 7) { |
case 0: default: // wrap around |
break; |
case 4: // signed integer overflow gives zero |
if (overflowS) valLo = 0; |
break; |
case 5: // signed integer overflow gives signed saturation |
if (overflowS) valLo = nsign_d + (valHi >> 63); |
break; |
case 6: // unsigned integer overflow gives zero |
if (overflowU) valHi = valLo = 0; |
break; |
case 7: // unsigned integer overflow gives unsigned saturation |
if (overflowU) valLo = 0xFFFFFFFFFFFFFFFF; |
break; |
} |
} |
t->returnType = 0x113; |
break; |
case 5: // float -> float16 |
for (pos = 0; pos < newLength; pos += 2) { |
SNum val; |
val.i = *(uint32_t*)(source + 2 * pos); // value to convert |
uint16_t val2 = float2half(val.f); // convert to half precision |
if (!isnan_or_inf_f(val.i)) { |
// check rounding mode |
switch (roundingMode) { |
case 1: // odd if not exact |
if (half2float(val2) != val.f) val2 |= 1; |
break; |
case 4: default: // nearest or even |
break; |
case 5: // down |
if (half2float(val2) > val.f) { |
if (val2 << 1 == 0) val2 = 0x8001; // 0 -> subnormal negative |
else if (int16_t(val2) > 0) val2--; |
else val2++; |
} |
break; |
case 6: // up |
if (half2float(val2) < val.f) { |
if (val2 << 1 == 0) val2 = 0x0001; // 0 -> subnormal positive |
else if (int16_t(val2) > 0) val2++; |
else val2--; |
} |
break; |
case 7: // towards zero |
if (half2float(val2) != val.f && (val2 << 1 != 0)) { |
val2--; |
} |
break; |
} |
// check overflow |
overflowS = (val2 & 0x7FFF) == 0x7C00 && !isinf_f(val.i);// detect overflow |
overflowF2 |= overflowS; |
if (overflowS) { // check for overflow |
if (exceptionControl & 1) { // overflow exception -> NAN |
val2 = (uint16_t)t->makeNan(nan_overflow_conv, 1); // overflow |
} |
} |
else if ((exceptionControl & 6) && val2 << 1 == 0 && val.f != 0.f) { |
val2 = (uint16_t)t->makeNan(nan_underflow, 1); // underflow exception (inexact implies underflow) |
} |
else if ((exceptionControl & 4) && half2float(val2) != val.f) { |
val2 = (uint16_t)t->makeNan(nan_inexact, 1); // inexact exception |
} |
} |
*(uint16_t*)(destination + pos) = val2; // store value |
} |
t->returnType = 0x118; |
break; |
case 6: // double -> float |
for (pos = 0; pos < newLength; pos += 4) { |
SNum val1, val2; |
val1.q = *(uint64_t*)(source + 2 * pos); // value to convert |
// check NAN and INF |
if (isnan_or_inf_d(val1.q)) { |
union { // single precision float |
float f; |
struct { // structure of a NAN |
uint32_t payload : 22; |
uint32_t quiet : 1; |
uint32_t expo : 8; |
uint32_t sign : 1; |
}; |
} u; |
u.payload = val1.i & ((1 << 22) - 1); // ForwardCom has right-justified NAN payload, unlike other binary systems |
u.quiet = val1.q >> 51 & 1; |
u.expo = 0xFF; |
u.sign = val1.q >> 63 & 1; |
val2.f = u.f; |
} |
else { |
val2.f = float(val1.d); // convert to single precision |
// check rounding mode |
uint8_t roundingMode = (IM1 >> 3) & 7; |
if (roundingMode == 0) roundingMode = ((t->parm[3].i >> MSKI_ROUNDING) & 7) | 4; |
switch (roundingMode) { |
case 1: // odd if not exact |
if (val2.f != val1.d) { |
val2.i |= 1; |
} |
break; |
case 4: default: // nearest or even |
break; |
case 5: // down |
if (val2.f > val1.d) { |
if (val2.f == 0.f) val2.i = 0x80000001; // 0 -> subnormal negative |
else if (val2.i > 0) val2.i--; |
else val2.i++; |
} |
break; |
case 6: // up |
if (val2.f < val1.d) { |
if (val2.f == 0.f) val2.i = 0x00000001; // 0 -> subnormal positive |
else if (val2.i > 0) val2.i++; |
else val2.i--; |
} |
break; |
case 7: // towards zero |
if (val2.f != val1.d && val2.f != 0.f) { |
val2.i--; |
} |
break; |
} |
// check overflow |
overflowS = isinf_f(val2.i) && !isinf_d(val1.q); // detect overflow |
overflowF2 |= overflowS; |
if (overflowS) { // check for overflow |
if (exceptionControl & 1) { // overflow exception -> NAN |
val2.q = t->makeNan(nan_overflow_conv, 5); // overflow |
} |
} |
else if ((exceptionControl & 6) && val2.f == 0.f && val1.d != 0.) { |
val2.q = t->makeNan(nan_underflow, 5); // underflow exception |
} |
else if ((exceptionControl & 4) && val2.f != val1.d) { |
val2.q = t->makeNan(nan_inexact, 5); // inexact exception |
} |
} |
*(uint32_t*)(destination + pos) = val2.i; // store value |
} |
t->returnType = 0x115; |
break; |
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
// check overflow traps |
/* |
if (mask.i & MSK_OVERFL_ALL) { |
if ((mask.i & MSK_OVERFL_SIGN) && overflowS2) t->interrupt(INT_OVERFL_SIGN); // signed overflow |
else if ((mask.i & MSK_OVERFL_UNSIGN) && overflowU2) t->interrupt(INT_OVERFL_UNSIGN); // unsigned overflow |
else if ((mask.i & MSK_OVERFL_FLOAT) && overflowF2) t->interrupt(INT_OVERFL_FLOAT); // float overflow |
} */ |
t->vectorLength[rd] = newLength; // save new vector length |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save. result has already been saved |
return 0; |
} |
|
static uint64_t expand(CThread * t) { |
// Expand vector RS to a vector of the double length and the double element size. |
// OT specifies the element size or precision of the destination. |
// Half precision -> single precision, 32-bit integer -> 64-bit integer, etc. |
|
// Operands: |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint8_t IM1 = t->parm[4].b; |
if (IM1 & 0xFC) t->interrupt(INT_WRONG_PARAMETERS); |
bool signExtend = (IM1 & 2) == 0; |
|
uint32_t initLength = t->vectorLength[rs]; |
uint32_t newLength = 2 * initLength; |
if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength; |
// uint32_t oldLength = newLength / 2; |
uint32_t pos; // position in source vector |
int8_t * source = t->vectors.buf() + (uint32_t)rs * t->MaxVectorLength; // address of RT data |
int8_t * destination = t->vectors.buf() + (uint32_t)rd * t->MaxVectorLength; // address of RD data |
if (rd == rs) { |
// source and destination are the same. Make a temporary copy of source to avoid overwriting |
memcpy(t->tempBuffer, source, initLength); |
source = t->tempBuffer; |
} |
switch (t->operandType) { |
case 0: // int4 -> int8 |
for (pos = 0; pos < newLength; pos += 1) { |
uint8_t val1 = *(uint8_t*)(source + pos); // values to convert |
union { |
uint16_t s; |
uint8_t b[2]; |
int8_t bs[2]; |
} val2; |
if (signExtend) { |
val2.bs[0] = (int8_t)val1 << 4 >> 4; // sign extend |
val2.bs[1] = (int8_t)val1 >> 4; // sign extend |
} |
else { |
val2.b[0] = val1 & 0xF; // zero extend |
val2.b[1] = val1 >> 4; // zero extend |
} |
*(uint16_t*)(destination + pos*2) = val2.s; // store value |
} |
break; |
case 1: // int8 -> int16 |
for (pos = 0; pos < newLength; pos += 1) { |
uint16_t val = *(uint8_t*)(source + pos); // value to convert |
if (signExtend) val = uint16_t((int16_t)(val << 8) >> 8); // sign extend |
*(uint16_t*)(destination + pos*2) = val; // store value |
} |
break; |
case 2: // int16 -> int32 |
for (pos = 0; pos < newLength; pos += 2) { |
uint32_t val = *(uint16_t*)(source + pos); // value to convert |
if (signExtend) val = uint32_t((int32_t)(val << 16) >> 16); // sign extend |
*(uint32_t*)(destination + pos*2) = val; // store value |
} |
break; |
case 3: // int32 -> int64 |
for (pos = 0; pos < newLength; pos += 4) { |
uint64_t val = *(uint32_t*)(source + pos); // value to convert |
if (signExtend) val = uint64_t((int64_t)(val << 32) >> 32); // sign extend |
*(uint64_t*)(destination + pos*2) = val; // store value |
} |
break; |
case 4: // int64 -> int128 |
for (pos = 0; pos < newLength; pos += 8) { |
uint64_t valLo = *(uint64_t*)(source + pos); // value to convert |
uint64_t valHi = 0; |
if (signExtend) valHi = uint64_t((int64_t)valLo >> 63); // sign extend |
*(uint64_t*)(destination + pos*2) = valLo; // store low part |
*(uint64_t*)(destination + pos*2 + 8) = valHi; // store high part |
} |
break; |
case 5: // float16 -> float |
for (pos = 0; pos < newLength; pos += 2) { |
uint16_t val1 = *(uint16_t*)(source + pos); // value to convert |
float val2 = half2float(val1); // convert half precision to float |
*(float*)(destination + pos*2) = val2; // store value |
} |
break; |
case 6: // float -> double |
for (pos = 0; pos < newLength; pos += 4) { |
SNum val1; |
val1.i = *(uint32_t*)(source + pos); // value to convert |
double val2 = val1.f; // convert to double precision |
// check NAN |
// ForwardCom has right-justified NAN payload, unlike other binary systems |
if (isnan_f(val1.i)) { |
union { // single precision float |
double d; |
struct { // structure of a NAN |
uint64_t payload : 51; |
uint64_t quiet : 1; |
uint64_t expo : 11; |
uint64_t sign : 1; |
}; |
} u; |
u.payload = val1.q & ((1 << 22) - 1); |
u.quiet = val1.i >> 22 & 1; |
u.expo = 0x7FF; |
u.sign = val1.q >> 63 & 1; |
val2 = u.d; |
} |
*(double*)(destination + pos*2) = val2; // store value |
} |
break; |
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
t->vectorLength[rd] = newLength; // save new vector length |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save. result has already been saved |
return 0; |
} |
|
static uint64_t float2int (CThread * t) { |
// Conversion of floating point to signed or unsigned integer with the same operand size. |
// The rounding mode and overflow control is specified in IM1. |
SNum a = t->parm[1]; |
SNum b = t->parm[4]; |
int64_t result = 0; |
uint32_t dataSize = dataSizeTable[t->operandType]; |
uint8_t roundingMode = b.b >> 3 & 3; |
uint8_t signMode = roundingMode | (b.b & 2) << 1; // bit 0-1: rounding mode, bit 2: usigned |
bool overflow = false; |
bool invalid = false; |
|
if (dataSize == 2) { // float16 -> int16 |
const float max = (float)(int32_t)0x7FFF; |
const float min = -max - 1.0f; |
const float umax = (float)(uint32_t)0xFFFFu; |
if (isnan_h(a.s)) { |
invalid = true; |
} |
else { |
float f = half2float(a.s); |
switch (signMode) { // rounding mode: |
case 0: // nearest or even |
if (f >= max + 0.5f || f < min - 0.5f) overflow = true; |
result = (int)(nearbyint(f)); |
break; |
case 1: // down |
if (f >= max + 1.0f || f <= min) overflow = true; |
result = (int)(floor(f)); |
break; |
case 2: // up |
if (f > max || f <= min - 1.0f) overflow = true; |
result = (int)(ceil(f)); |
break; |
case 3: // towards zero |
if (f >= max + 1.0f || f <= min - 1.0f) overflow = true; |
result = (int)(f); |
break; |
case 4: // unsigned nearest or even |
if (f >= umax + 0.5f || f < - 0.5f) overflow = true; |
result = (int)(nearbyint(f)); |
break; |
case 5: case 7: // unsigned down |
if (f >= umax + 1.0f || f < 0.0f) overflow = true; |
result = (int)(floor(f)); |
break; |
case 6: // unsigned up |
if (f > umax || f <= -1.0f) overflow = true; |
else result = (int)(ceil(f)); |
} |
if (overflow) { |
switch (b.b & 7) { // overflow options |
case 0: default: // wrap around |
result &= 0xFFFFu; |
break; |
case 4: case 6: |
result = 0; |
break; |
case 5: // signed saturation |
result = 0x7FFF + int(f < 0); |
break; |
case 7: // unsigned saturation |
result = 0xFFFFu; |
break; |
} |
} |
if (invalid) { |
result = (b.b & 0x20) ? 0x8000u : 0; |
} |
} |
} |
else if (dataSize == 4) { // float -> int32 |
const float max = (float)(int32_t)nsign_f; |
const float min = -max - 1.0f; |
const float umax = (float)(uint32_t)0xFFFFFFFFu; |
if (isnan_f(a.i)) { |
invalid = true; |
} |
else { |
switch (signMode) { // rounding mode: |
case 0: // nearest or even |
if (a.f >= max + 0.5f || a.f < min - 0.5f) overflow = true; |
result = (int64_t)(nearbyint(a.f)); |
break; |
case 1: // down |
if (a.f >= max + 1.0f || a.f <= min) overflow = true; |
result = (int64_t)(floor(a.f)); |
break; |
case 2: // up |
if (a.f > max || a.f <= min - 1.0f) overflow = true; |
result = (int64_t)(ceil(a.f)); |
break; |
case 3: // towards zero |
if (a.f >= max + 1.0f || a.f <= min - 1.0f) overflow = true; |
result = (int64_t)(a.f); |
break; |
case 4: // unsigned nearest or even |
if (a.f >= umax + 0.5f || a.f < - 0.5f) overflow = true; |
result = (int64_t)(nearbyint(a.f)); |
break; |
case 5: case 7: // unsigned down |
if (a.f >= umax + 1.0f || a.f < 0.0f) overflow = true; |
result = (int64_t)(floor(a.f)); |
break; |
case 6: // unsigned up |
if (a.f > umax || a.f <= -1.0f) overflow = true; |
else result = (int64_t)(ceil(a.f)); |
} |
if (overflow) { |
switch (b.b & 7) { // overflow options |
case 0: // wrap around |
result &= 0xFFFFFFFFu; |
break; |
case 4: case 6: |
result = 0; |
break; |
case 5: // signed saturation |
result = 0x7FFFFFFF + int(a.f < 0); |
break; |
case 7: // unsigned saturation |
result = 0xFFFFFFFFu; |
break; |
} |
} |
if (invalid) { |
result = (b.b & 0x20) ? sign_f : 0; |
} |
} |
} |
else if (dataSize == 8) { // double -> int64 |
const double max = (double)(int64_t)nsign_d; |
const double min = -max - 1.0f; |
const double umax = (double)0xFFFFFFFFFFFFFFFFu; |
if (isnan_d(a.q)) { |
invalid = true; |
} |
else { |
switch (signMode) { // rounding mode: |
case 0: // nearest or even |
if (a.d >= max + 0.5 || a.d < min - 0.5) overflow = true; |
result = (int64_t)(nearbyint(a.d)); |
break; |
case 1: // down |
if (a.d >= max + 1.0 || a.d <= min) overflow = true; |
result = (int64_t)(floor(a.d)); |
break; |
case 2: // up |
if (a.d > max || a.d <= min - 1.0) overflow = true; |
result = (int64_t)(ceil(a.d)); |
break; |
case 3: // towards zero |
if (a.d >= max + 1.0 || a.d <= min - 1.0) overflow = true; |
result = (int64_t)(a.d); |
break; |
case 4: // unsigned nearest or even |
if (a.d >= umax + 0.5 || a.d < - 0.5) overflow = true; |
result = (uint64_t)(nearbyint(a.d)); |
break; |
case 5: case 7: // unsigned down |
if (a.d >= umax + 1.0 || a.d < 0.0) overflow = true; |
result = (uint64_t)(floor(a.d)); |
break; |
case 6: // unsigned up |
if (a.d > umax || a.d <= -1.0) overflow = true; |
result = (uint64_t)(ceil(a.d)); |
} |
} |
if (overflow) { |
switch (b.b & 7) { // overflow options |
case 0: // wrap around |
break; |
case 4: case 6: |
result = 0; |
break; |
case 5: // signed saturation |
result = nsign_d + int(a.d < 0); |
break; |
case 7: // unsigned saturation |
result = 0xFFFFFFFFFFFFFFFFu; |
break; |
} |
} |
if (invalid) { |
result = (b.b & 0x20) ? sign_d : 0; |
} |
} |
else t->interrupt(INT_WRONG_PARAMETERS); |
/* Traps not supported |
if (overflow && (mask.i & MSK_OVERFL_SIGN)) { |
t->interrupt(INT_OVERFL_SIGN); // signed overflow |
result = dataSizeMask[t->operandType] >> 1; // INT_MAX |
} |
if (invalid && (mask.i & MSK_FLOAT_NAN_LOSS)) { |
t->interrupt(INT_FLOAT_NAN_LOSS); // nan converted to integer |
result = dataSizeMask[t->operandType] >> 1; // INT_MAX |
} */ |
if ((t->operandType & 7) >= 5) t->operandType -= 3; // debug return type is integer |
return result; |
} |
|
static uint64_t int2float (CThread * t) { |
// Conversion of signed or unsigned integer to floating point with same operand size. |
SNum a = t->parm[1]; |
SNum IM1 = t->parm[4]; |
bool isSigned = (IM1.b & 1) == 0; // signed integer |
bool inexactX = (IM1.b & 4) != 0; // make NAN exception if inexact |
|
SNum result; |
uint32_t dataSize = dataSizeTable[t->operandType]; |
switch (dataSize) { |
case 2: // int16 -> float16 |
if (isSigned) { |
result.s = float2half(float(a.ss)); |
if (inexactX && int32_t(half2float(result.s)) != a.ss) { |
result.q = t->makeNan(nan_inexact, 1); |
} |
} |
else { // unsigned |
result.s = float2half(float(a.s)); |
if (inexactX && uint32_t(half2float(result.s)) != a.s) { |
result.q = t->makeNan(nan_inexact, 1); |
} |
} |
t->returnType = 0x118; // debug return type is float16 |
break; |
|
case 4: // int32 -> float |
if (isSigned) { |
result.f = (float)a.is; |
if (inexactX && int32_t(result.f) != a.is) { |
result.q = t->makeNan(nan_inexact, 5); |
} |
} |
else { |
result.f = (float)a.i; |
if (inexactX && uint32_t(result.f) != a.i) { |
result.q = t->makeNan(nan_inexact, 5); |
} |
} |
t->returnType = 0x115; // debug return type is float |
break; |
|
case 8: // int64 -> double |
if (isSigned) { |
result.d = (double)a.qs; |
if (inexactX && int64_t(result.d) != a.qs) { |
result.q = t->makeNan(nan_inexact, 6); |
} |
} |
else { |
result.d = (double)a.q; |
if (inexactX && uint64_t(result.d) != a.q) { |
result.q = t->makeNan(nan_inexact, 6); |
} |
} |
t->returnType = 0x116; // debug return type is double |
break; |
|
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
result.q = 0; |
} |
return result.q; |
} |
|
static uint64_t round_ (CThread * t) { |
// Round floating point to integer in floating point representation. |
// The rounding mode is specified in IM1. |
// Conversion of floating point to signed integer with the same operand size. |
// The rounding mode is specified in IM1. |
SNum a = t->parm[1]; |
SNum b = t->parm[4]; |
SNum result; |
uint32_t dataSize = dataSizeTable[t->operandType]; |
if (dataSize == 4) { // float -> int32 |
switch (b.b) { // rounding mode: |
case 0: // nearest or even |
result.f = nearbyintf(a.f); |
break; |
case 1: // down |
result.f = floorf(a.f); |
break; |
case 2: // up |
result.f = ceilf(a.f); |
break; |
case 3: // towards zero |
result.f = truncf(a.f); |
break; |
default: t->interrupt(INT_WRONG_PARAMETERS); |
} |
} |
else if (dataSize == 8) { // double -> int64 |
switch (b.b) { // rounding mode: |
case 0: // nearest or even |
result.d = nearbyint(a.d); |
break; |
case 1: // down |
result.d = floor(a.d); |
break; |
case 2: // up |
result.d = ceil(a.d); |
break; |
case 3: // towards zero |
result.d = trunc(a.d); |
break; |
default: t->interrupt(INT_WRONG_PARAMETERS); |
} |
} |
return result.q; |
} |
|
static uint64_t round2n (CThread * t) { |
// Round to nearest multiple of 2n. |
// RD = 2^n * round(2^(−n)*RS). |
// n is a signed integer constant in IM1 |
SNum b = t->parm[4]; // n |
//SNum mask = t->parm[3]; |
uint32_t exponent1; |
uint64_t result = 0; |
if (t->operandType == 5) { // float |
union { |
uint32_t i; |
float f; |
struct { |
uint32_t mantissa : 23; |
uint32_t exponent : 8; |
uint32_t sign : 1; |
}; |
} u; |
u.i = t->parm[1].i; // input a |
if (isnan_f(u.i)) return u.i; // a is nan |
exponent1 = u.exponent; |
if (exponent1 == 0) { |
u.mantissa = 0; // a is zero or subnormal. return zero |
return u.i; |
} |
exponent1 -= b.i; // subtract b from exponent |
if ((int32_t)exponent1 <= 0) { // underflow |
//if (mask.i & MSK_FLOAT_UNDERFL) t->interrupt(INT_FLOAT_UNDERFL); |
return 0; |
} |
else if ((int32_t)exponent1 >= 0xFF) { // overflow |
//if (mask.i & MSK_OVERFL_FLOAT) t->interrupt(INT_OVERFL_FLOAT); |
return inf_f; |
} |
u.exponent = exponent1; |
u.f = nearbyintf(u.f); // round |
if (u.f != 0) u.exponent += b.i; // add b to exponent |
result = u.i; |
} |
else if (t->operandType == 6) { // double |
union { |
uint64_t q; |
double d; |
struct { |
uint64_t mantissa : 52; |
uint64_t exponent : 11; |
uint64_t sign : 1; |
}; |
} u; |
u.q = t->parm[1].q; // input a |
if (isnan_d(u.q)) return u.q; // a is nan |
exponent1 = u.exponent; |
if (exponent1 == 0) { |
u.mantissa = 0; // a is zero or subnormal. return zero |
return u.q; |
} |
exponent1 -= b.i; // subtract b from exponent |
if ((int32_t)exponent1 <= 0) { // underflow |
//if (mask.i & MSK_FLOAT_UNDERFL) t->interrupt(INT_FLOAT_UNDERFL); |
return 0; |
} |
else if ((int32_t)exponent1 >= 0x7FF) { // overflow |
//if (mask.i & MSK_OVERFL_FLOAT) t->interrupt(INT_OVERFL_FLOAT); |
return inf_d; |
} |
u.exponent = exponent1; |
u.d = nearbyint(u.d); // round |
if (u.d != 0) u.exponent += b.i; // add b to exponent |
result = u.q; |
} |
else t->interrupt(INT_WRONG_PARAMETERS); |
return result; |
} |
|
static uint64_t abs_ (CThread * t) { |
// Absolute value of integer. |
// IM1 determines handling of overflow: 0: wrap around, 1: saturate, 2: zero, 3: trap |
SNum a = t->parm[1]; // x |
SNum b = t->parm[4]; // option |
uint64_t sizemask = dataSizeMask[t->operandType]; // mask for operand size |
uint64_t signbit = (sizemask >> 1) + 1; // just the sign bit |
if (a.q & signbit) { |
// a is negative |
if (t->operandType > 4) { // floating point types |
return a.q & ~signbit; // just remove sign bit |
} |
if ((a.q & sizemask) == signbit) { |
// overflow |
switch (b.b & ~4) { |
case 0: // wrap around |
break; |
case 1: // saturate |
return a.q - 1; |
case 2: // zero |
return 0; |
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
if ((b.b & 4) /* && (t->parm[3].i & MSK_OVERFL_SIGN)*/) { // trap |
t->interrupt(INT_OVERFL_SIGN); // signed overflow |
} |
} |
a.qs = - a.qs; // change sign |
} |
return a.q; |
} |
|
static uint64_t fp_category (CThread * t) { |
// Check if floating point numbers belong to the categories indicated by constant |
// 0 ± NAN, 1 ± Zero, 2 −Subnormal, 3 +Subnormal, 4 −Normal, 5 +Normal, 6 −Infinite, 7 +Infinite |
SNum a = t->parm[1]; // x |
SNum b = t->parm[4]; // option |
uint32_t exponent; |
uint8_t category = 0; // detected category bits |
switch (t->operandType) { |
case 2: case 5: // float |
exponent = a.i >> 23 & 0xFF; // isolate exponent |
if (exponent == 0xFF) { // nan or inf |
if (a.i << 9) category = 1; // nan |
else if (a.i >> 31) category = 0x40; // -inf |
else category = 0x80; // + inf |
} |
else if (exponent == 0) { |
if ((a.i << 9) == 0) category = 2; // zero |
else if (a.i >> 31) category = 4; // - subnormal |
else category = 8; // + subnormal |
} |
else if (a.i >> 31) category = 0x10; // - normal |
else category = 0x20; // + normal |
break; |
case 3: case 6: // double |
exponent = a.q >> 52 & 0x7FF; // isolate exponent |
if (exponent == 0x7FF) { // nan or inf |
if (a.q << 12) category = 1; // nan |
else if (a.q >> 63) category = 0x40; // -inf |
else category = 0x80; // + inf |
} |
else if (exponent == 0) { |
if ((a.q << 12) == 0) category = 2; // zero |
else if (a.q >> 63) category = 4; // - subnormal |
else category = 8; // + subnormal |
} |
else if (a.q >> 63) category = 0x10; // - normal |
else category = 0x20; // + normal |
break; |
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
uint8_t result = (category & b.b) != 0; // test if a belongs to any of the indicated categories |
if ((t->operandType & 7) >= 5) t->operandType -= 3; // debug return type is integer |
return (t->numContr & ~(uint64_t)1) | result; // get remaining bits from NUMCONTR |
} |
|
static uint64_t broad_ (CThread * t) { |
// 18: Broadcast 8-bit signed constant into all elements of RD with length RS (31 in RS field gives scalar output). |
// 19: broadcast_max. Broadcast 8-bit constant into all elements of RD with maximum vector length. |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint8_t rm = t->operands[1]; // mask register |
SNum b = t->parm[2]; // constant |
uint64_t length; // length of destination vector |
if (t->op == 18) { // length given by RS |
length = t->registers[rs]; |
if (length > t->MaxVectorLength) length = t->MaxVectorLength; |
} |
else { // length is maximum |
length = t->MaxVectorLength; |
} |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
length = length >> dsizelog << dsizelog; // round down to nearest multiple of operand size |
// set length of destination vector |
t->vectorLength[rd] = (uint32_t)length; |
// loop to set all elements |
for (uint32_t pos = 0; pos < (uint32_t)length; pos += 1 << dsizelog) { |
if ((rm & 0x1F) != 0x1F && !(t->readVectorElement(rm, pos) & 1)) { // mask is zero. get fallback |
if (t->op == 18 || rs >= 31) b.q = 0; // threre is no fallback. write zero |
else b.q = t->readVectorElement(rs, pos); // rs is fallback |
} |
t->writeVectorElement(rd, b.q, pos); // write vector element |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint32_t byteSwap(uint32_t x) { // swap bytes, used by byte_reverse function |
union { |
uint32_t i; |
uint8_t b[4]; |
} a, b; |
a.i = x; |
b.b[0] = a.b[3]; b.b[1] = a.b[2]; b.b[2] = a.b[1]; b.b[3] = a.b[0]; |
return b.i; |
} |
|
static uint8_t bitSwap(uint8_t x) { // swap bits, used by bit_reverse function |
x = x >> 4 | x << 4; // swap 4-bit nipples |
x = (x >> 2 & 0x33) | (x << 2 & 0xCC); // swap 2-bit groups |
x = (x >> 1 & 0x55) | (x << 1 & 0xAA); // swap single bits |
return x; |
} |
|
static uint64_t byte_reverse (CThread * t) { |
// Reverse the order of bits or bytes in each element of vector |
SNum a = t->parm[1]; // value |
uint8_t IM1 = t->parm[2].b; // immediate operand |
if (IM1 & 1) { |
// bit reverse: Reverse the order of bits in each element of vector |
union { |
uint64_t q; |
uint32_t i[2]; |
uint8_t b[8]; |
} u; |
u.q = a.q; |
uint8_t t1; uint32_t t2; |
switch (dataSizeTableLog[t->operandType]) { |
case 0: // 8 bit |
u.b[0] = bitSwap(u.b[0]); break; |
case 1: // 16 bit |
t1 = bitSwap(u.b[0]); u.b[0] = bitSwap(u.b[1]); u.b[1] = t1; break; |
case 2: // 32 bit |
u.i[0] = byteSwap(u.i[0]); |
for (t1 = 0; t1 < 4; t1++) u.b[t1] = bitSwap(u.b[t1]); |
break; |
case 3: // 64 bit |
t2 = byteSwap(u.i[0]); u.i[0] = byteSwap(u.i[1]); u.i[1] = t2; |
for (t1 = 0; t1 < 8; t1++) u.b[t1] = bitSwap(u.b[t1]); |
break; |
case 4: // 128 bit |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
return u.q; |
} |
else { |
// byte reverse: Reverse the order of bytes in each element of a vector |
uint8_t rs = t->operands[4]; |
uint32_t tmp; |
switch (dataSizeTableLog[t->operandType]) { |
case 0: // 8 bit |
break; |
case 1: // 16 bit |
a.s = a.s >> 8 | a.b << 8; break; // swap bytes |
case 2: // 32 bit |
a.i = byteSwap(a.i); break; |
case 3: // 64 bit |
tmp = byteSwap(a.i); a.q = byteSwap(a.q >> 32) | (uint64_t)tmp << 32; |
break; |
case 4: // 128 bit |
tmp = byteSwap(a.i); t->parm[5].q = byteSwap(a.q >> 32) | (uint64_t)tmp << 32; |
a.q = t->readVectorElement(rs, t->vectorOffset + 8); // high part of input |
tmp = byteSwap(a.i); a.q = byteSwap(a.q >> 32) | (uint64_t)tmp << 32; |
break; |
} |
return a.q; |
} |
} |
|
/* |
static uint64_t truth_tab2 (CThread * t) { |
// Boolean function of two inputs, given by a truth table |
SNum a = t->parm[0]; // value |
SNum b = t->parm[1]; // value |
SNum c = t->parm[4]; // truth table |
return ((c.b >> ((a.b & 1) | (b.b & 1) << 1)) & 1) | (a.q & ~uint64_t(1)); |
} */ |
|
static uint64_t bool2bits(CThread * t) { |
// The boolean vector RT is packed into the lower n bits of RD, |
// taking bit 0 of each element |
// The length of RD will be at least sufficient to contain n bits. |
|
uint8_t rd = t->operands[0]; // destination vector |
uint8_t rt = t->operands[4]; // RT = source vector |
//uint8_t rs = t->operands[4]; // RS indicates length |
uint8_t * destination = (uint8_t*)t->vectors.buf() + (int64_t)rd * t->MaxVectorLength; // address of RD data |
//uint64_t length = t->registers[rs]; // value of RS = length of destination |
uint32_t length = t->vectorLength[rt]; // length of source |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
//if (length > t->MaxVectorLength) length = t->MaxVectorLength; // limit length |
uint32_t num = length >> dsizelog; // number of elements |
length = num << dsizelog; // round down length to nearest multiple of element size |
// collect bits into blocks of 32 bits |
uint32_t bitblock = 0; |
// loop through elements of source vector |
for (uint32_t i = 0; i < num; i++) { |
uint8_t bit = t->readVectorElement(rt, i << dsizelog) & 1; |
uint8_t bitindex = i & 31; // bit position with 32 bit block of destination |
bitblock |= bit << bitindex; // add bit to bitblock |
if (bitindex == 31 || i == num - 1) { // last bit in this block |
*(uint32_t*)(destination + (i/8 & -4)) = bitblock; // write 32 bit block to destination |
bitblock = 0; // start next block |
} |
} |
// round up length of destination to multiple of 4 bytes |
uint32_t destinationLength = ((num+7)/8 + 3) & -4; |
if (destinationLength == 0) { |
destinationLength = 4; *(uint32_t*)destination = 0; |
} |
// set length of destination vector (must be done after reading source because source and destination may be the same) |
t->vectorLength[rd] = destinationLength; |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
if ((t->returnType & 7) >= 5) t->returnType -= 3; // make return type integer |
return 0; |
} |
|
static uint64_t bool_reduce(CThread * t) { |
// integer vector: bool_reduce. The boolean vector RT is reduced by combining bit 0 of all elements. |
// The output is a scalar integer where bit 0 is the AND combination of all the bits, |
// and bit 1 is the OR combination of all the bits. |
// The remaining bits are reserved for future use |
// float vector: category_reduce: Each bit in RD indicates that at least one element in RT belongs |
// to a certain category: |
// bit 0: NAN, bit 1: zero, bit 2: - subnormal, bitt 3: + subnormal, |
// bit 4: - normal, bit 5: + normal, bit 6: - INF, bit 7: + INF |
uint8_t rd = t->operands[0]; // destination vector |
uint8_t rt = t->operands[4]; // RT = source vector |
//uint8_t rs = t->operands[4]; // RS indicates length |
uint8_t bitOR = 0; // OR combination of all bits |
uint8_t bitAND = 1; // AND combination of all bits |
uint64_t result = 0; // result value |
uint8_t * source = (uint8_t*)t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data |
//if (length > t->MaxVectorLength) length = t->MaxVectorLength; // limit length |
uint32_t elementSize = dataSizeTable[t->operandType]; // vector element size |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
uint32_t sourceLength = t->vectorLength[rt]; // length of source vector |
//uint64_t length = t->registers[rs]; // value of RS = length of destination |
uint32_t length = sourceLength; // length of source vector |
length = length >> dsizelog << dsizelog; // round down to nearest multiple of element size |
/*if (length > sourceLength) { |
length = sourceLength; // limit to length of source vector |
bitAND = 0; // bits beyond vector are 0 |
} */ |
switch (t->operandType) { |
case 0: case 1: case 2: case 3: case 4: // integer types: bool_reduce |
for (uint32_t pos = 0; pos < length; pos += elementSize) { // loop through elements of source vector |
uint8_t bit = *(source + pos) & 1; // get bit from source vector element |
bitOR |= bit; bitAND &= bit; |
} |
result = bitAND | bitOR << 1; |
break; |
case 5: // float type: category_reduce |
for (uint32_t pos = 0; pos < length; pos += elementSize) { // loop through elements of source vector |
uint32_t val = *(int32_t*)(source + pos); |
uint8_t exponent = val >> 23 & 0xFF; // isolate exponent |
uint8_t category; |
if (exponent == 0xFF) { // nan or inf |
if (val << 9) category = 1; // nan |
else if (val >> 31) category = 0x40; // -inf |
else category = 0x80; // + inf |
} |
else if (exponent == 0) { |
if ((val << 9) == 0) category = 2; // zero |
else if (val >> 31) category = 4; // - subnormal |
else category = 8; // + subnormal |
} |
else if (val >> 31) category = 0x10; // - normal |
else category = 0x20; // + normal |
result |= category; // combine categories |
} |
break; |
case 6: // double type: category_reduce |
for (uint32_t pos = 0; pos < length; pos += elementSize) { // loop through elements of source vector |
uint64_t val = *(int64_t*)(source + pos); |
uint32_t exponent = val >> 52 & 0x7FF; // isolate exponent |
uint8_t category; |
if (exponent == 0x7FF) { // nan or inf |
if (val << 12) category = 1; // nan |
else if (val >> 63) category = 0x40; // -inf |
else category = 0x80; // + inf |
} |
else if (exponent == 0) { |
if ((val << 12) == 0) category = 2; // zero |
else if (val >> 63) category = 4; // - subnormal |
else category = 8; // + subnormal |
} |
else if (val >> 63) category = 0x10; // - normal |
else category = 0x20; // + normal |
result |= category; // combine categories |
} |
break; |
default: |
t->interrupt(INT_WRONG_PARAMETERS); |
} |
t->vectorLength[rd] = 8; // set length of destination vector to 64 bits |
uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data |
*(uint64_t*)destination = result; // write 64 bits to destination |
// (using writeVectorElement would possibly write less than 64 bits, leaving some of the destination vector unchanged) |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD. It has already been saved |
if ((t->returnType & 7) >= 5) t->returnType -= 3; // make return type integer |
return result; |
} |
|
|
static uint64_t push_v(CThread * t) { |
// push one or more vector registers on a stack pointed to by rd |
if (t->parm[2].i & 0xE0) { |
t->interrupt(INT_WRONG_PARAMETERS); return 0; // forward-growing stack not supported for vector registers |
} |
uint8_t reg0 = t->operands[0] & 0x1F; // pointer register |
uint8_t reg1 = t->operands[4] & 0x1F; // first push register |
uint8_t reglast = t->parm[2].i & 0x1F; // last push register |
uint8_t reg; // current regiser |
uint32_t length; // length of current register |
uint32_t length2; // length rounded up to nearest multiple of stack word size |
uint64_t pointer = t->registers[reg0]; |
const int stack_word_size = 8; |
t->operandType = 3; // must be 64 bits. |
// loop through registers to push |
for (reg = reg1; reg <= reglast; reg++) { |
length = t->vectorLength[reg]; |
length2 = (length + stack_word_size - 1) & -stack_word_size; // round up to multiple of 8 |
if (length != 0) { |
pointer -= length2; |
for (uint32_t j = 0; j < length2; j += 8) { |
uint64_t value = t->readVectorElement(reg, j); |
t->writeMemoryOperand(value, pointer + j); // write vector |
} |
t->returnType = 0x113; |
t->operands[0] = reg; |
t->listResult(0); |
} |
pointer -= stack_word_size; |
t->writeMemoryOperand(length, pointer); // write length |
t->returnType = 0x13; |
t->listResult(length); |
} |
t->registers[reg0] = pointer; |
t->returnType = 0x13; |
t->operands[0] = reg0; |
t->vect = 4; // stop vector loop |
t->running = 2; // don't store result register |
return pointer; |
} |
|
static uint64_t pop_v(CThread * t) { |
// pop one or more vector registers from a stack pointed to by rd |
if (t->parm[2].i & 0xE0) { |
t->interrupt(INT_WRONG_PARAMETERS); return 0; // forward-growing stack not supported for vector registers |
} |
uint8_t reg0 = t->operands[0] & 0x1F; // pointer register |
uint8_t reg1 = t->operands[4] & 0x1F; // first pop register |
uint8_t reglast = t->parm[2].i & 0x1F; // last pop register |
uint8_t reg; // current regiser |
uint32_t length; // length of current register |
uint32_t length2; // length rounded up to nearest multiple of stack word size |
uint64_t pointer = t->registers[reg0]; // value of stack pointer or pointer register |
const int stack_word_size = 8; |
t->operandType = 3; // must be 64 bits. |
// reverse loop through registers to pop |
for (reg = reglast; reg >= reg1; reg--) { |
length = (uint32_t)t->readMemoryOperand(pointer); // read length |
length2 = (length + stack_word_size - 1) & -stack_word_size; // round up to multiple of 8 |
t->vectorLength[reg] = length; // set vector length |
pointer += stack_word_size; // pop length |
if (length != 0) { |
for (uint32_t j = 0; j < length2; j += 8) { // read vector |
uint64_t value = t->readMemoryOperand(pointer + j); // read from memory |
t->writeVectorElement(reg, value, j); |
} |
pointer += length2; |
t->returnType = 0x113; |
t->operands[0] = reg; |
t->listResult(0); |
} |
t->returnType = 0x13; |
t->listResult(length); |
} |
t->registers[reg0] = pointer; |
t->returnType = 0x13; |
t->operands[0] = reg0; |
t->vect = 4; // stop vector loop |
t->running = 2; // don't store result register |
return pointer; |
} |
|
static uint64_t clear_(CThread * t) { |
// clear one or more vector registers |
uint8_t reg1 = t->operands[4] & 0x1F; // first register |
uint8_t reglast = t->parm[2].i & 0x1F; // last register |
uint8_t reg; // current regiser |
for (reg = reg1; reg <= reglast; reg++) { |
t->vectorLength[reg] = 0; |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't store result register |
t->returnType = 0; |
return 0; |
} |
|
|
// Format 1.4 C. One vector register and a broadcast 16-bit immediate operand. |
|
static uint64_t move_i16 (CThread * t) { |
// Move 16 bit integer constant to 16-bit scalar |
uint8_t rd = t->operands[0]; // destination vector |
t->vectorLength[rd] = 2; // set length of destination |
t->vect = 4; // stop vector loop |
return t->parm[2].q; |
} |
|
//static uint64_t add_i16 (CThread * t) {return f_add(t);} // Add broadcasted 16 bit constant to 16-bit vector elements |
|
static uint64_t and_i16 (CThread * t) { |
// AND broadcasted 16 bit constant |
return t->parm[1].q & t->parm[2].q; |
} |
|
static uint64_t or_i16 (CThread * t) { |
// OR broadcasted 16 bit constant |
return t->parm[1].q | t->parm[2].q; |
} |
|
static uint64_t xor_i16 (CThread * t) { |
// XOR broadcasted 16 bit constant |
return t->parm[1].q ^ t->parm[2].q; |
} |
|
static uint64_t add_h16 (CThread * t) { |
// add constant to half precision vector |
return f_add_h(t); |
} |
|
static uint64_t mul_h16 (CThread * t) { |
// multiply half precision vector with constant |
return f_mul_h(t); |
} |
|
static uint64_t move_8shift8 (CThread * t) { |
// RD = IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1 to make 32/64 bit scalar |
// 40: 32 bit, 41: 64 bit |
uint8_t rd = t->operands[0]; // destination vector |
t->vectorLength[rd] = (t->op & 1) ? 8 : 4; // set length of destination |
t->vect = 4; // stop vector loop |
return (uint64_t)(int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs); // shift and sign extend |
} |
|
static uint64_t add_8shift8 (CThread * t) { |
// RD += IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, add to 32/64 bit vector |
// 42: 32 bit, 43: 64 bit |
int64_t save2 = t->parm[2].qs; |
t->parm[2].qs = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs; // shift and sign extend |
int64_t result = f_add(t); // use f_add for getting overflow traps |
t->parm[2].qs = save2; // restore constant |
return result; |
} |
|
static uint64_t and_8shift8 (CThread * t) { |
// RD &= IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, AND with 32/64 bit vector |
// 44: 32 bit, 45: 64 bit |
int64_t a = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs; // shift and sign extend |
return t->parm[1].q & a; |
} |
|
static uint64_t or_8shift8 (CThread * t) { |
// RD |= IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, OR with 32/64 bit vector |
// 46: 32 bit, 47: 64 bit |
int64_t a = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs; // shift and sign extend |
return t->parm[1].q | a; |
} |
|
static uint64_t xor_8shift8 (CThread * t) { |
// RD |= IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, XOR with 32/64 bit vector |
// 48: 32 bit, 49: 64 bit |
int64_t a = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs; // shift and sign extend |
return t->parm[1].q ^ a; |
} |
|
static uint64_t move_half2float (CThread * t) { |
// Move converted half precision floating point constant to single precision scalar |
t->vectorLength[t->operands[0]] = 4; // set length of destination |
t->vectorLengthR = 4; |
t->vect = 4; // stop vector loop |
return t->parm[2].q; |
} |
|
static uint64_t move_half2double (CThread * t) { |
// Move converted half precision floating point constant to double precision scalar |
t->vectorLength[t->operands[0]] = 8; // set length of destination |
t->vect = 4; // stop vector loop |
return t->parm[2].q; |
} |
|
static uint64_t add_half2float (CThread * t) { |
// Add broadcast half precision floating point constant to single precision vector |
return f_add(t); |
} |
|
static uint64_t add_half2double (CThread * t) { |
// Add broadcast half precision floating point constant to double precision vector |
return f_add(t); |
} |
|
static uint64_t mul_half2float (CThread * t) { |
// multiply broadcast half precision floating point constant with single precision vector |
return f_mul(t); |
} |
|
static uint64_t mul_half2double (CThread * t) { |
// multiply broadcast half precision floating point constant with double precision vector |
return f_mul(t); |
} |
|
// Format 2.6 A. Three vector registers and a 32-bit immediate operand. |
|
static uint64_t load_hi (CThread * t) { |
// Make vector of two elements. dest[0] = 0, dest[1] = IM2. |
uint8_t rd = t->operands[0]; |
uint8_t dsize = dataSizeTable[t->operandType]; |
t->vectorLength[rd] = dsize * 2; // set length of destination |
t->writeVectorElement(rd, 0, 0); // write 0 |
t->writeVectorElement(rd, t->parm[2].q, dsize);// write IM2 |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t insert_hi (CThread * t) { |
// Make vector of two elements. dest[0] = src1[0], dest[1] = IM2. |
uint8_t rd = t->operands[0]; |
uint8_t dsize = dataSizeTable[t->operandType]; |
t->vectorLength[rd] = dsize * 2; // set length of destination |
t->writeVectorElement(rd, t->parm[1].q, 0); // write src1 |
t->writeVectorElement(rd, t->parm[2].q, dsize);// write IM2 |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t make_mask (CThread * t) { |
// Make vector where bit 0 of each element comes from bits in IM2, the remaining bits come from RT. |
SNum m = t->parm[3]; // mask or numcontr |
SNum b = t->parm[2]; // constant operand |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
uint32_t elementNum = t->vectorOffset >> dsizelog; // index to vector element |
if ((t->operandType & 7) >= 5) t->operandType -= 3; // debug return type is integer |
return (m.q & ~(uint64_t)1) | (b.i >> (elementNum & 31) & 1); |
} |
|
static uint64_t replace_ (CThread * t) { |
// Replace elements in RT by constant IM2 |
// format 2.6: 32 bits, format 3.1: 64 bits |
return t->parm[2].q; |
} |
|
static uint64_t replace_even (CThread * t) { |
// Replace even-numbered elements in RT by constant IM2 |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
uint32_t elementNum = t->vectorOffset >> dsizelog; // index to vector element |
return (elementNum & 1) ? t->parm[1].q : t->parm[2].q; |
} |
|
static uint64_t replace_odd (CThread * t) { |
// Replace odd-numbered elements in RT by constant IM2 |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
uint32_t elementNum = t->vectorOffset >> dsizelog; // index to vector element |
return (elementNum & 1) ? t->parm[2].q : t->parm[1].q; |
} |
|
static uint64_t broadcast_32 (CThread * t) { |
// Broadcast 32-bit or 64 -bit constant into all elements of RD with length RS (31 in RS field gives scalar output). |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint8_t rm = t->operands[1]; // mask register |
uint32_t elementSize = dataSizeTable[t->operandType]; |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
uint64_t length; // length of destination |
int64_t value; |
if (rs == 31) length = elementSize; |
else length = t->registers[rs] << dsizelog >> dsizelog; // round length to multiple of elementSize |
if (length > t->MaxVectorLength) length = t->MaxVectorLength; |
t->vectorLength[rd] = (uint32_t)length; // set length of destination |
for (uint32_t pos = 0; pos < length; pos += elementSize) { // loop through vector |
if (rm >= 7 || (t->readVectorElement(rm, pos) & 1)) value = t->parm[2].qs; // check mask |
else value = 0; |
t->writeVectorElement(rd, value, pos); // write to destination |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t permute (CThread * t) { |
// The vector elements of RS are permuted within each block of size RT bytes. |
// The number of elements in each block, n = RT / OS |
// format 2.2.6 op 1.1: index vector is last operand |
// format 2.6 op 8: index vector is constant IM2, 4 bits for each element |
uint8_t rd = t->operands[0]; // destination |
uint8_t rm = t->operands[1]; // mask register |
uint8_t vin; // input data register |
uint8_t vpat = 0; // pattern register |
uint8_t bs; // block size, g.p. register |
uint32_t pattern = 0; // IM2 = pattern, if constant |
bool constPat = false; // pattern is a constant |
if (t->fInstr->format2 == 0x226) { |
vin = t->operands[3]; // ru = input data |
vpat = t->operands[4]; // rs = pattern |
bs = t->operands[5]; // block size, g.p. register |
} |
else { // format 2.6 |
vin = t->operands[3]; // rs = input data |
bs = t->operands[4]; // block size, g.p. register |
pattern = t->parm[4].i; // IM2 = pattern, if constant |
constPat = true; |
} |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
//uint32_t elementSize = 1 << dsizelog; |
uint32_t length = t->vectorLength[vin]; // vector length |
t->vectorLength[rd] = length; // set length of destination |
int8_t * source = t->vectors.buf() + (uint32_t)(vin & 0x1F) * t->MaxVectorLength; // address of source data vector |
if (vin == rd) { |
// source and destination are the same. Make a temporary copy of source to avoid overwriting |
memcpy(t->tempBuffer, source, length); |
source = t->tempBuffer; |
} |
uint64_t blocksize = t->registers[bs]; // bytes per block |
uint64_t value; // value of element |
uint64_t index; // index to source element |
if (blocksize == 0 || (blocksize & (blocksize-1)) || blocksize > t->MaxVectorLength) { |
t->interrupt(INT_WRONG_PARAMETERS); // RS must be a power of 2 |
} |
else { |
uint32_t num = (uint32_t)blocksize >> dsizelog; // elements per block |
for (uint32_t block = 0; block < length; block += (uint32_t)blocksize) { // loop through blocks |
for (uint32_t element = 0; element < num; element++) { // loop through elements within block |
if (constPat) { // get index from constant |
index = (pattern >> (element&7)*4) & 0xF; // index to select block element |
} |
else { // get index from vector |
index = t->readVectorElement(vpat, block + (element << dsizelog)); |
} |
if (index < num && (rm == 7 || t->readVectorElement(rm, block + (element << dsizelog)) & 1)) { // check mask |
value = *(uint64_t*)(source + block + ((uint32_t)index << dsizelog)); // pick indexed element from source vector |
} |
else value = 0; // index out of range or mask = 0 |
t->writeVectorElement(rd, value, block + (element << dsizelog)); // write destination |
} |
} |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
/* |
static uint64_t replace_bits (CThread * t) { |
// Replace a group of contiguous bits in RT by a specified constant |
SNum a = t->parm[1]; // input operand |
SNum b = t->parm[2]; // input constant |
uint64_t val = b.s; // value of replacement bits |
uint8_t pos = uint8_t(b.i >> 16); // position of replacement |
uint8_t num = uint8_t(b.i >> 24); // number of consecutive bits to replace |
uint64_t mask = ((uint64_t)1 << num) - 1; // mask with num 1-bits |
return (a.q & ~(mask<<pos)) | ((val & mask) << pos); |
}*/ |
|
// Format 2.5 A. Single format instructions with memory operands or mixed register types |
|
static uint64_t store_i32 (CThread * t) { |
// Store 32-bit constant IM2 to memory operand [RS+IM1] |
uint64_t value = t->parm[2].q; |
if ((t->parm[3].b & 1) == 0) value = 0; // check mask |
t->writeMemoryOperand(value, t->memAddress); |
t->running = 2; // don't save RD |
t->returnType = (t->returnType & 7) | 0x20; |
return 0; |
} |
|
//static uint64_t fence_ (CThread * t) {return f_nop(t);} |
|
static uint64_t compare_swap (CThread * t) { |
// Atomic compare and exchange with address [RS+IM2] |
uint64_t val1 = t->parm[0].q; |
uint64_t val2 = t->parm[1].q; |
// to do: use intrinsic compareandexchange or mutex or pause all threads if multiple threads |
uint64_t address = t->memAddress; |
uint64_t sizemask = dataSizeMask[t->operandType]; // mask for operand size |
uint64_t val3 = t->readMemoryOperand(address); // read value from memory |
if (((val3 ^ val1) & sizemask) == 0) { // value match |
t->writeMemoryOperand(val2, address); // write new value to memory |
} |
t->vect = 4; // stop vector loop |
return val3; // return old value |
} |
|
static uint64_t read_insert (CThread * t) { |
// Replace one element in vector RD, starting at offset RT*OS, with scalar memory operand [RS+IM2] |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint32_t elementSize = dataSizeTable[t->operandType]; |
uint64_t value = t->readMemoryOperand(t->memAddress); |
uint64_t pos = t->registers[rs] * elementSize; |
if (pos < t->vectorLength[rd]) { |
t->writeVectorElement(rd, value, (uint32_t)pos); |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t extract_store (CThread * t) { |
// Extract one element from vector RD, starting at offset RT*OS, with size OS into memory operand [RS+IM2] |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[4]; |
uint32_t elementSize = dataSizeTable[t->operandType]; |
uint64_t pos = t->registers[rs] * elementSize; |
uint64_t value = t->readVectorElement(rd, (uint32_t)pos); |
t->writeMemoryOperand(value, t->memAddress); |
t->returnType = (t->returnType & 7) | 0x20; // debug return type is memory |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
t->vectorLengthR = elementSize; // size of memory destination |
return 0; |
} |
|
|
// Format 2.2.6 E. Four vector registers |
|
static uint64_t concatenate (CThread * t) { |
// A vector RU of length RT and a vector RS of length RT are concatenated into a vector RD of length 2*RT. |
uint8_t rd = t->operands[0]; |
uint8_t ru = t->operands[3]; |
uint8_t rs = t->operands[4]; |
uint8_t rt = t->operands[5]; |
uint64_t length1 = t->registers[rt]; |
if (length1 > t->MaxVectorLength) length1 = t->MaxVectorLength; |
uint32_t length2 = 2 * (uint32_t)length1; |
if (length2 > t->MaxVectorLength) length2 = t->MaxVectorLength; |
t->vectorLength[rd] = length2; // set length of destination vector |
int8_t * source1 = t->vectors.buf() + ru*t->MaxVectorLength; // address of RU data |
int8_t * source2 = t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data |
int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data |
memcpy(destination, source1, (uint32_t)length1); // copy from RU |
memcpy(destination + (uint32_t)length1, source2, length2 - (uint32_t)length1); // copy from RS |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t interleave (CThread * t) { |
// Interleave elements of vectors RU and RS of length RT/2 to produce vector RD of length RT. |
// Even-numbered elements of the destination come from RU and odd-numbered elements from RS. |
uint8_t rd = t->operands[0]; // destination |
uint8_t ru = t->operands[3]; // first input vector |
uint8_t rs = t->operands[4]; // second input vector |
uint8_t rt = t->operands[5]; // length |
uint8_t rm = t->operands[1]; // mask |
uint64_t length = t->registers[rt]; |
if (length > t->MaxVectorLength) length = t->MaxVectorLength; |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
length = length >> dsizelog << dsizelog; // round down to nearest multiple of element size |
uint32_t elementSize = 1 << dsizelog; // size of each element |
t->vectorLength[rd] = (uint32_t)length; // set length of destination |
uint8_t even = 1; |
uint32_t pos1 = 0; |
uint64_t value; |
for (uint32_t pos2 = 0; pos2 < length; pos2 += elementSize) { |
if (even) { |
value = t->readVectorElement(ru, pos1); |
} |
else { |
value = t->readVectorElement(rs, pos1); |
pos1 += elementSize; |
} |
even ^= 1; // toggle between even and odd |
if (rm < 7 && (t->readVectorElement(rm, pos2) & 1) == 0) value = 0; // mask is 0 |
t->writeVectorElement(rd, value, pos2); |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
|
// Format 2.2.7 E. Three vector registers and a 16 bit immediate |
|
static uint64_t move_bits (CThread * t) { |
// Replace one or more contiguous bits at one position of RS with contiguous bits from another position of RT |
// Format 2.0.7 E: general purpose registers |
// Format 2.2.7 E: vector registers |
// The position in src2 is the lower 8 bits of IM2. a = IM2 & 0xFF. |
// The position in src1 is the upper 8 bits of IM2. b = IM2 >> 0xFF. |
// The number of bits to move is c = IM3. |
SNum s1 = t->parm[0]; // input operand src1 |
SNum s2 = t->parm[1]; // input operand src2 |
SNum im = t->parm[4]; // input operand IM2 |
SNum mask = t->parm[3]; // |
uint8_t c = t->pInstr->a.im3; // input operand IM3 = number of bits |
uint8_t pos1 = im.s >> 8; // bit position in src1. (can overflow, not handled) |
uint8_t pos2 = im.b; // bit position in src2. (can overflow, not handled) |
uint64_t bitmask = ((uint64_t)1 << c) - 1; // mask of c bits. (cannot overflow because c is max 63) |
uint64_t result = (s1.q & ~(bitmask << pos1)) | ((s2.q >> pos2) & bitmask) << pos1; |
if ((mask.b & 1) == 0) { // single format instructions with template E must handle mask here |
result = s1.q; // fallback |
if (t->operands[2] == 31) result = 0; // fallback = 0 |
} |
return result; |
} |
|
static uint64_t mask_length (CThread * t) { |
// Make a boolean vector to mask the first n bytes of a vector. |
// The output vector RD will have the same length as the input vector RS. |
// RT indicates the length of the part that is enabled by the mask (n). |
// IM3 contains the following option bits: |
// bit 0 = 0: bit 0 will be 1 in the first n bytes in the output and 0 in the rest. |
// bit 0 = 1: bit 0 will be 0 in the first n bytes in the output and 1 in the rest. |
// bit 1 = 1: copy remaining bits from input vector RT into each vector element. |
// bit 2 = 1: copy remaining bits from the numeric control register. |
// bit 4 = 1: broadcast remaining bits from IM2 into all 32-bit words of RD: |
// Bit 1-7 of IM2 go to bit 1-7 of RD. Bit 8-11 of IM2 go to bit 20-23 of RD. Bit 12-15 of IM2 go to bit 26-29 of RD. |
// Output bits that are not set by any of these options will be zero. If multiple options are specified, the results will be OR’ed. |
uint8_t rd = t->operands[0]; // destination |
uint8_t rs = t->operands[3]; // src2 |
uint8_t rt = t->operands[4]; // length |
SNum s2 = t->parm[0]; // input operand src2 |
SNum im2 = t->parm[4]; // input operand IM2 |
uint8_t im3 = t->pInstr->a.im3; // input operand IM3 = options |
t->vectorLengthR = t->vectorLength[rd] = t->vectorLength[rs]; // set length of destination |
uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize) |
uint64_t n = t->registers[rt]; // number of masked elements |
uint32_t i = t->vectorOffset >> dsizelog; // current element index |
uint8_t bit = i < n; // element is within the first n |
bit ^= im3 & 1; // invert option |
uint64_t result = 0; |
if (im3 & 2) result |= s2.q; // copy remaining bits from src1 |
if (im3 & 4) result |= t->numContr; // copy remaining bits from NUMCONTR |
if (im3 & 0x10) { // copy bits from IM2 |
uint32_t rr = (im2.b & ~1) | bit; // bit 1-7 -> bit 1-7 |
rr |= (im2.s & 0xF00) << 12; // bit 8-11 -> bit 20-23 |
rr |= (im2.s & 0xF000) << 14; // bit 12-15 -> bit 26-29 |
result |= rr | ((uint64_t)rr << 32); // copy these bits twice |
} |
result = (result & ~(uint64_t)1) | bit; // combine |
return result; |
} |
|
static uint64_t truth_tab3 (CThread * t) { |
// Bitwise boolean function of three inputs, given by a truth table |
SNum a = t->parm[0]; // first operand |
SNum b = t->parm[1]; // second operand |
SNum c = t->parm[2]; // third operand |
SNum mask = t->parm[3]; // mask register |
uint32_t table = t->pInstr->a.im2; // truth table |
uint8_t options = t->pInstr->a.im3; // option bits |
|
uint32_t dataSize = dataSizeTableBits[t->operandType]; // number of bits |
if (options & 3) dataSize = 1; // only a single bit |
uint64_t result = 0; // calculate result |
|
for (int i = dataSize - 1; i >= 0; i--) { // loop through bits |
uint64_t bit_pointer = uint64_t(1) << i; // selected bit |
uint8_t index = 0; // index into truth table |
if (a.q & bit_pointer) index = 1; |
if (b.q & bit_pointer) index |= 2; |
if (c.q & bit_pointer) index |= 4; |
uint64_t bit = table >> index & 1; // lookup in truth table |
result = result << 1 | bit; // insert bit into result |
} |
if (options & 2) { // take remaining bits from mask or numcontr |
result |= mask.q & ~(uint64_t)1; |
} |
return result; |
} |
|
static uint64_t repeat_block (CThread * t) { |
// Repeat a block of data to make a longer vector. |
// RS is input vector containing data block to repeat. |
// IM2 is length in bytes of the block to repeat (must be a multiple of 4). |
// RT is the length of destination vector RD. |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[3]; |
uint8_t rt = t->operands[4]; |
uint32_t blen = t->parm[4].i; // block length |
uint64_t length = t->registers[rt]; // length of destination |
if (length > t->MaxVectorLength) length = t->MaxVectorLength; |
if (blen > t->MaxVectorLength) blen = t->MaxVectorLength; |
t->vectorLength[rd] = (uint32_t)length; // set length of destination |
if (blen & 3) t->interrupt(INT_WRONG_PARAMETERS); // must be a multiple of 4 |
int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data |
int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data |
if (length > t->vectorLength[rs]) { // reading beyond the end of the source vector. make sure the rest is zero |
memset(source + t->vectorLength[rs], 0, size_t(length - t->vectorLength[rs])); |
} |
for (uint32_t pos = 0; pos < length; pos += blen) { // loop through blocks |
uint32_t blen2 = blen; |
if (pos + blen2 > length) blen2 = (uint32_t)length - pos; // avoid last block going too far |
memcpy(destination + pos, source, blen2); // copy block |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
static uint64_t repeat_within_blocks (CThread * t) { |
// Broadcast the first element of each block of data in a vector to the entire block. |
// RS is input vector containing data blocks. |
// IM2 is length in bytes of each block (must be a multiple of the operand size). |
// RT is length of destination vector RD. |
// The operand size must be at least 4 bytes. |
uint8_t rd = t->operands[0]; |
uint8_t rs = t->operands[3]; |
uint8_t rt = t->operands[4]; |
uint32_t blen = t->parm[4].i; // block length |
uint64_t length = t->registers[rt]; // length of destination |
if (length > t->MaxVectorLength) length = t->MaxVectorLength; |
if (blen > t->MaxVectorLength) blen = t->MaxVectorLength; |
t->vectorLength[rd] = (uint32_t)length; // set length of destination |
uint32_t elementSize = dataSizeTable[t->operandType]; |
if (elementSize < 4 || (blen & (elementSize - 1))) t->interrupt(INT_WRONG_PARAMETERS); // must be a multiple of elementsize |
int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data |
int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data |
if (length > t->vectorLength[rs]) { // reading beyond the end of the source vector. make sure the rest is zero |
memset(source + t->vectorLength[rs], 0, size_t(length - t->vectorLength[rs])); |
} |
for (uint32_t pos = 0; pos < length; pos += blen) { // loop through blocks |
uint32_t blen2 = blen; |
if (pos + blen2 > length) blen2 = (uint32_t)length - pos; // avoid last block going too far |
for (uint32_t i = 0; i < blen2; i += elementSize) { // loop within block |
memcpy(destination + pos + i, source + pos, elementSize); // copy first element |
} |
} |
t->vect = 4; // stop vector loop |
t->running = 2; // don't save RD |
return 0; |
} |
|
|
// tables of single format instructions |
|
// Format 1.3 B. Two vector registers and a broadcast 8-bit immediate operand. |
PFunc funcTab7[64] = { |
gp2vec, vec2gp, 0, make_sequence, insert_, extract_, compress, expand, // 0 - 7 |
0, 0, 0, 0, float2int, int2float, round_, round2n, // 8 - 15 |
abs_, fp_category, broad_, broad_, byte_reverse, bitscan_, popcount_, 0, // 16 - 23 |
0, bool2bits, bool_reduce, 0, 0, 0, 0, 0, // 24 - 31 |
0, 0, 0, 0, 0, 0, 0, 0, // 32 - 39 |
0, 0, 0, 0, 0, 0, 0, 0, // 40 - 47 |
0, 0, 0, 0, 0, 0, 0, 0, // 48 - 55 |
push_v, pop_v, clear_, 0, 0, 0, 0, 0, // 56 - 63 |
}; |
|
// Format 1.4 C. One vector register and a broadcast 16-bit immediate operand. |
PFunc funcTab8[64] = { |
move_i16, f_add, and_i16, or_i16, xor_i16, 0, 0, 0, // 0 - 7 |
move_8shift8, move_8shift8, add_8shift8, add_8shift8, and_8shift8, and_8shift8, or_8shift8, or_8shift8, // 8 - 15 |
xor_8shift8, xor_8shift8, 0, 0, 0, 0, 0, 0, // 16 - 23 |
0, 0, 0, 0, 0, 0, 0, 0, // 24 - 31 |
move_half2float, move_half2double, add_half2float, add_half2double, mul_half2float, mul_half2double, 0, 0, // 32 - 39 |
add_h16, mul_h16, 0, 0, 0, 0, 0, 0, // 40 - 47 |
0, 0, 0, 0, 0, 0, 0, 0, // 48 - 55 |
0, 0, 0, 0, 0, 0, 0, 0, // 56 - 63 |
}; |
|
// Format 2.5 A. Single format instructions with memory operands or mixed register types |
PFunc funcTab10[64] = { |
0, 0, 0, 0, 0, 0, 0, 0, // 0 - 7 |
store_i32, 0, 0, 0, 0, 0, 0, 0, // 8 - 15 |
f_nop, 0, compare_swap, 0, 0, 0, 0, 0, // 16 - 23 |
0, 0, 0, 0, 0, 0, 0, 0, // 24 - 31 |
read_insert, 0, 0, 0, 0, 0, 0, 0, // 32 - 39 |
extract_store, 0, 0, 0, 0, 0, 0, 0, // 40 - 47 |
}; |
|
|
// Format 2.6 A. Three vector registers and a 32-bit immediate operand. |
PFunc funcTab11[64] = { |
load_hi, insert_hi, make_mask, replace_, replace_even, replace_odd, broadcast_32, 0, // 0 - 7 |
permute, 0, 0, 0, 0, 0, 0, 0 // 8 - 15 |
}; |
|
// Format 3.1 A. Three vector registers and a 64-bit immediate operand. |
PFunc funcTab13[64] = { |
0, 0, 0, 0, 0, 0, 0, 0, // 0 - 7 |
0, 0, 0, 0, 0, 0, 0, 0, // 8 - 15 |
0, 0, 0, 0, 0, 0, 0, 0, // 16 - 23 |
0, 0, 0, 0, 0, 0, 0, 0, // 34 - 31 |
replace_, broadcast_32, 0, 0, 0, 0, 0, 0, // 32 - 39 |
}; |
|
|
// Dispatch functions for single format instruction with E template. |
// (full tables of all possible single format instruction with E template would |
// be too large with most places unused). |
|
// Format 2.0.6 E. Four general purpose registers |
static uint64_t dispatch206_1 (CThread * t) { |
switch (t->op) { |
case 8: return truth_tab3(t); |
default: |
t->interrupt(INT_UNKNOWN_INST); |
} |
return 0; |
} |
|
|
// Format 2.0.7 E. Three general purpose registers and a 16-bit immediate constant |
static uint64_t dispatch207_1 (CThread * t) { |
switch (t->op) { |
case 0: return move_bits(t); |
default: |
t->interrupt(INT_UNKNOWN_INST); |
} |
return 0; |
} |
|
// Format 2.2.6 E. Four vector registers |
static uint64_t dispatch226_1 (CThread * t) { |
switch (t->op) { |
case 0: return concatenate(t); |
case 1: return permute(t); |
case 2: return interleave(t); |
case 8: return truth_tab3(t); |
default: |
t->interrupt(INT_UNKNOWN_INST); |
} |
return 0; |
} |
|
// Format 2.2.7 E. Three vector registers and a 16-bit immediate constant |
static uint64_t dispatch227_1 (CThread * t) { |
switch (t->op) { |
case 0: return move_bits(t); |
case 1: return mask_length(t); |
case 8: return repeat_block(t); |
case 9: return repeat_within_blocks(t); |
default: |
t->interrupt(INT_UNKNOWN_INST); |
} |
return 0; |
} |
|
// Table of dispatch functions for all possible single format instructions with E template |
PFunc EDispatchTable[96] = { |
0, 0, 0, 0, 0, 0, dispatch206_1, dispatch207_1, // 2.0.x i.1 |
0, 0, 0, 0, 0, 0, dispatch226_1, dispatch227_1, // 2.2.x i.1 |
0, 0, 0, 0, 0, 0, 0, 0, // 3.0.x i.1 |
0, 0, 0, 0, 0, 0, 0, 0, // 3.2.x i.1 |
|
0, 0, 0, 0, 0, 0, 0, 0, // 2.0.x i.2 |
0, 0, 0, 0, 0, 0, 0, 0, // 2.2.x i.2 |
0, 0, 0, 0, 0, 0, 0, 0, // 3.0.x i.2 |
0, 0, 0, 0, 0, 0, 0, 0, // 3.2.x i.2 |
|
0, 0, 0, 0, 0, 0, 0, 0, // 2.0.x i.3 |
0, 0, 0, 0, 0, 0, 0, 0, // 2.2.x i.3 |
0, 0, 0, 0, 0, 0, 0, 0, // 3.0.x i.3 |
0, 0, 0, 0, 0, 0, 0, 0 // 3.2.x i.3 |
}; |