/****************************  emulator5.cpp  ********************************
* Author:        Agner Fog
* date created:  2018-02-18
* Last modified: 2021-06-30
* Version:       1.11
* Project:       Binary tools for ForwardCom instruction set
* Description:
* Emulator: Execution functions for single format instructions, continued
*
* Copyright 2018-2021 GNU General Public License http://www.gnu.org/licenses
*****************************************************************************/
|
| 12 |
|
|
|
| 13 |
|
|
#include "stdafx.h"
|
| 14 |
|
|
|
| 15 |
|
|
// Format 1.3 B. Two vector registers and a broadcast 8-bit immediate operand.
|
| 16 |
|
|
|
| 17 |
|
|
static uint64_t gp2vec (CThread * t) {
    // Copy the value of general purpose register RS into vector register RD as a scalar.
    // The value is handed back as the return value; the length of RD is set to
    // the size of one element of the current operand type.
    const uint8_t destReg = t->operands[0];
    const uint8_t srcReg  = t->operands[4];
    const uint64_t scalar = t->registers[srcReg];               // value of the general purpose register
    t->vectorLength[destReg] = dataSizeTable[t->operandType];   // destination holds exactly one element
    t->vect = 4;                                                // stop vector loop
    return scalar;
}
|
| 26 |
|
|
|
| 27 |
|
|
static uint64_t vec2gp (CThread * t) {
    // Copy the first element of vector register RS to general purpose register RD.
    // The register is written here directly, and the same value is returned.
    const uint8_t destReg = t->operands[0];
    const uint8_t srcReg  = t->operands[4];

    // number of bytes to keep: the element size, but never more than the vector holds
    uint8_t numBytes = dataSizeTable[t->operandType];
    if (t->vectorLength[srcReg] < numBytes) {
        numBytes = t->vectorLength[srcReg];
    }

    // fetch 8 bytes straight from the start of the source vector's storage
    uint64_t value = *(uint64_t*)(t->vectors.buf() + t->MaxVectorLength*srcReg);
    if (numBytes < 8) {
        value &= ((uint64_t)1 << numBytes*8) - 1;                // discard bytes above the element
    }

    t->registers[destReg] = value;                               // write to general purpose register
    t->vect = 4;                                                 // stop vector loop
    t->running = 2;                                              // don't save RD
    t->returnType &= ~ 0x100;                                    // debug return type not vector
    return value;
}
|
| 41 |
|
|
|
| 42 |
|
|
static uint64_t make_sequence (CThread * t) {
    // Make a vector with RS sequential numbers. First value is IM1.
    // RS is a general purpose register holding the number of elements; IM1 is the
    // sign-extended 8-bit immediate. The vector is written directly into RD, its
    // length is limited to MaxVectorLength, and the function returns 0.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    int32_t  val = int8_t(t->pInstr->b[0]);                // immediate operand, sign extended integer
    uint64_t num = t->registers[rs];                       // number of elements
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    SNum temp;

    // Limit length. The comparison is made on the element count rather than on
    // num << dsizelog, because the shift wraps around in 64 bits for a very large
    // num and the wrapped value would slip past the limit.
    uint64_t length;
    if (num > (t->MaxVectorLength >> dsizelog)) {
        length = t->MaxVectorLength;
    }
    else {
        length = num << dsizelog;
    }
    // set length of rd
    t->vectorLength[rd] = (uint32_t)length;

    // loop through destination vector
    for (uint32_t pos = 0; pos < length; pos += elementSize) {
        switch (t->operandType) {
        case 0: case 1: case 2: case 3:                    // integer types up to 64 bits
            t->writeVectorElement(rd, (uint64_t)(int64_t)val, pos);
            break;
        case 4:                                            // int128: low part, then sign extension as high part
            t->writeVectorElement(rd, (uint64_t)(int64_t)val, pos);
            t->writeVectorElement(rd, (uint64_t)((int64_t)val >> 63), pos+8);
            break;
        case 5:                                            // float
            temp.f = float(val);                           // convert to float
            t->writeVectorElement(rd, temp.q, pos);
            break;
        case 6:                                            // double
            temp.d = double(val);                          // convert to double
            t->writeVectorElement(rd, temp.q, pos);
            break;
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
        }
        val++;                                             // increment value
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 83 |
|
|
|
| 84 |
|
|
static uint64_t compress(CThread * t) {
    // Compress vector RS to a vector of half the length and half the element size.
    // Double precision -> single precision, 64-bit integer -> 32-bit integer, etc.
    // The operand type is the element type of the source. The result is written
    // directly into vector register RD, and the function returns 0.
    //
    // IM1 bits 0-2, integer types: overflow handling
    //     0: wrap around, 4: signed overflow gives zero, 5: signed saturation,
    //     6: unsigned overflow gives zero, 7: unsigned saturation
    // IM1 bits 0-2, floating point types: exception enable bits
    //     1: overflow, 2: underflow, 4: inexact. 0: use the options mask, 7: none
    // IM1 bits 3-5: floating point rounding mode. 0: use the options mask
    //     1: odd if not exact, 4: nearest or even, 5: down, 6: up, 7: towards zero
    // IM1 bits 6-7: must be zero

    // operands:
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t  IM1 = t->parm[4].b;
    if (IM1 & 0xC0) t->interrupt(INT_WRONG_PARAMETERS);
    uint32_t oldLength = t->vectorLength[rs];
    uint32_t newLength = oldLength / 2;
    uint32_t pos;                                          // position in destination vector
    uint8_t  overflowU = 0;                                // unsigned overflow in current element
    uint8_t  overflowS = 0;                                // signed overflow in current element
    uint8_t  overflowU2 = 0;                               // unsigned overflow in any element
    uint8_t  overflowS2 = 0;                               // signed overflow in any element
    uint8_t  overflowF2 = 0;                               // floating point overflow in any element
    SNum     mask = t->parm[3];                            // options mask
    int8_t * source = t->vectors.buf() + (uint64_t)rs * t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + (uint64_t)rd * t->MaxVectorLength; // address of RD data

    uint8_t roundingMode = (IM1 >> 3) & 7;                 // floating point rounding mode
    if (roundingMode == 0) roundingMode = ((t->parm[3].i >> MSKI_ROUNDING) & 7) | 4;
    uint8_t exceptionControl = IM1 & 7;                    // floating point exception enable bits:
                                                           // 1: overflow, 2: underflow, 4: inexact
    if (exceptionControl == 0) {                           // floating point exception control
        exceptionControl = mask.i >> (MSKI_EXCEPTIONS + 1) & 7; // exceptions from NUMCONTR
    }
    else if (exceptionControl == 7) {
        exceptionControl = 0;                              // 7 means none
    }

    switch (t->operandType) {                              // source operand type
    case 0:   // int8 -> int4
        for (pos = 0; pos < newLength; pos += 1) {
            union {
                uint16_t s;
                uint8_t b[2];
            } u;
            u.s = *(uint16_t*)(source + 2*pos);            // two values to convert to one byte
            for (int i = 0; i < 2; i++) {                  // loop for two bytes to convert
                uint8_t val = u.b[i];
                overflowU = val > 0x0F;                    // unsigned overflow
                // The difference must be reduced to 8 bits again: integer promotion makes
                // val - 0xF8 a negative int, which would never compare greater than 0x0F
                overflowS = (uint8_t)(val - 0xF8) > 0x0F;  // signed overflow
                overflowU2 |= overflowU;  overflowS2 |= overflowS;
                switch (IM1 & 7) {
                case 0: default: // wrap around
                    break;
                case 4:   // signed integer overflow gives zero
                    if (overflowS) val = 0;
                    break;
                case 5:   // signed integer overflow gives signed saturation
                    if (overflowS) val = 0x7 + (val >> 7);
                    break;
                case 6:   // unsigned integer overflow gives zero
                    if (overflowU) val = 0;
                    break;
                case 7:   // unsigned integer overflow gives unsigned saturation
                    if (overflowU) val = 0xF;
                    break;
                }
                u.b[i] = val;
            }
            uint8_t val2 = (u.b[0] & 0xF) | u.b[1] << 4;   // pack two 4-bit values
            *(uint8_t*)(destination + pos) = val2;         // store two values
        }
        t->returnType = 0x110;
        break;
    case 1:   // int16 -> int8
        for (pos = 0; pos < newLength; pos += 1) {
            uint16_t val = *(uint16_t*)(source + 2*pos);   // value to convert
            overflowU = val > 0xFF;                        // unsigned overflow
            // reduce to 16 bits after the promoted subtraction, see int8 case
            overflowS = (uint16_t)(val - 0xFF80) > 0xFF;   // signed overflow
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:   // signed integer overflow gives zero
                if (overflowS) val = 0;
                break;
            case 5:   // signed integer overflow gives signed saturation
                if (overflowS) val = 0x7F + (val >> 15);
                break;
            case 6:   // unsigned integer overflow gives zero
                if (overflowU) val = 0;
                break;
            case 7:   // unsigned integer overflow gives unsigned saturation
                if (overflowU) val = 0xFF;
                break;
            }
            *(uint8_t*)(destination + pos) = (uint8_t)val; // store value
        }
        t->returnType = 0x110;
        break;
    case 2:   // int32 -> int16
        for (pos = 0; pos < newLength; pos += 2) {
            uint32_t val = *(uint32_t*)(source + 2*pos);   // value to convert
            overflowU = val > 0xFFFF;                      // unsigned overflow
            overflowS = val - 0xFFFF8000 > 0xFFFF;         // signed overflow
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:   // signed integer overflow gives zero
                if (overflowS) val = 0;
                break;
            case 5:   // signed integer overflow gives signed saturation
                if (overflowS) val = 0x7FFF + (val >> 31);
                break;
            case 6:   // unsigned integer overflow gives zero
                if (overflowU) val = 0;
                break;
            case 7:   // unsigned integer overflow gives unsigned saturation
                if (overflowU) val = 0xFFFF;
                break;
            }
            *(uint16_t*)(destination + pos) = (uint16_t)val; // store value
        }
        t->returnType = 0x111;
        break;
    case 3:   // int64 -> int32
        for (pos = 0; pos < newLength; pos += 4) {
            uint64_t val = *(uint64_t*)(source + 2*pos);   // value to convert
            overflowU = val > 0xFFFFFFFFU;                 // unsigned overflow
            overflowS = val - 0xFFFFFFFF80000000 > 0xFFFFFFFFU; // signed overflow
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:   // signed integer overflow gives zero
                if (overflowS) val = 0;
                break;
            case 5:   // signed integer overflow gives signed saturation
                if (overflowS) val = 0x7FFFFFFF + (val >> 63);
                break;
            case 6:   // unsigned integer overflow gives zero
                if (overflowU) val = 0;
                break;
            case 7:   // unsigned integer overflow gives unsigned saturation
                if (overflowU) val = 0xFFFFFFFF;
                break;
            }
            *(uint32_t*)(destination + pos) = (uint32_t)val; // store value
        }
        t->returnType = 0x112;
        break;
    case 4:   // int128 -> int64
        for (pos = 0; pos < newLength; pos += 8) {
            uint64_t valLo = *(uint64_t*)(source + 2*pos);     // value to convert, low part
            uint64_t valHi = *(uint64_t*)(source + 2*pos + 8); // value to convert, high part
            overflowU = valHi != 0;                            // unsigned overflow
            if ((int64_t)valLo < 0) overflowS = valHi+1 != 0;  // signed overflow
            else overflowS = valHi != 0;
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:   // signed integer overflow gives zero
                if (overflowS) valLo = 0;
                break;
            case 5:   // signed integer overflow gives signed saturation
                if (overflowS) valLo = nsign_d + (valHi >> 63);
                break;
            case 6:   // unsigned integer overflow gives zero
                if (overflowU) valLo = 0;
                break;
            case 7:   // unsigned integer overflow gives unsigned saturation
                if (overflowU) valLo = 0xFFFFFFFFFFFFFFFF;
                break;
            }
            // Both halves were read above, so this is safe even when RD == RS.
            // (This store was missing: the converted value was computed and dropped.)
            *(uint64_t*)(destination + pos) = valLo;           // store value
        }
        t->returnType = 0x113;
        break;
    case 5:   // float -> float16
        for (pos = 0; pos < newLength; pos += 2) {
            SNum val;
            val.i = *(uint32_t*)(source + 2 * pos);        // value to convert
            uint16_t val2 = float2half(val.f);             // convert to half precision
            if (!isnan_or_inf_f(val.i)) {
                // Adjust for the rounding mode. The adjustments below assume that
                // float2half rounds to nearest.
                // Zero is tested with (val2 & 0x7FFF) because val2 << 1 is evaluated
                // as int and does not discard the sign bit of a negative zero
                switch (roundingMode) {
                case 1: // odd if not exact
                    if (half2float(val2) != val.f) val2 |= 1;
                    break;
                case 4: default: // nearest or even
                    break;
                case 5: // down
                    if (half2float(val2) > val.f) {
                        if ((val2 & 0x7FFF) == 0) val2 = 0x8001;   // 0 -> subnormal negative
                        else if (int16_t(val2) > 0) val2--;
                        else val2++;
                    }
                    break;
                case 6: // up
                    if (half2float(val2) < val.f) {
                        if ((val2 & 0x7FFF) == 0) val2 = 0x0001;   // 0 -> subnormal positive
                        else if (int16_t(val2) > 0) val2++;
                        else val2--;
                    }
                    break;
                case 7: // towards zero
                    // step back only if the conversion increased the magnitude.
                    // A result that is already closer to zero must be left alone
                    if ((val.f > 0.f && half2float(val2) > val.f)
                    ||  (val.f < 0.f && half2float(val2) < val.f)) {
                        val2--;
                    }
                    break;
                }
                // check overflow
                overflowS = (val2 & 0x7FFF) == 0x7C00 && !isinf_f(val.i); // detect overflow
                overflowF2 |= overflowS;
                if (overflowS) {                           // check for overflow
                    if (exceptionControl & 1) {            // overflow exception -> NAN
                        val2 = (uint16_t)t->makeNan(nan_overflow_conv, 1); // overflow
                    }
                }
                else if ((exceptionControl & 6) && (val2 & 0x7FFF) == 0 && val.f != 0.f) {
                    val2 = (uint16_t)t->makeNan(nan_underflow, 1); // underflow exception (inexact implies underflow)
                }
                else if ((exceptionControl & 4) && half2float(val2) != val.f) {
                    val2 = (uint16_t)t->makeNan(nan_inexact, 1);   // inexact exception
                }
            }
            *(uint16_t*)(destination + pos) = val2;        // store value
        }
        t->returnType = 0x118;                             // debug return type is float16
        break;
    case 6:   // double -> float
        for (pos = 0; pos < newLength; pos += 4) {
            SNum val1, val2;
            val1.q = *(uint64_t*)(source + 2 * pos);       // value to convert
            // check NAN and INF
            if (isnan_or_inf_d(val1.q)) {
                union {                                    // single precision float
                    float f;
                    struct {                               // structure of a NAN
                        uint32_t payload : 22;
                        uint32_t quiet : 1;
                        uint32_t expo : 8;
                        uint32_t sign : 1;
                    };
                } u;
                u.payload = val1.i & ((1 << 22) - 1);      // ForwardCom has right-justified NAN payload, unlike other binary systems
                u.quiet = val1.q >> 51 & 1;
                u.expo = 0xFF;
                u.sign = val1.q >> 63 & 1;
                val2.f = u.f;
            }
            else {
                val2.f = float(val1.d);                    // convert to single precision
                // Adjust for the rounding mode. The adjustments below assume that
                // the conversion above rounds to nearest.
                // The sign of val2 is tested through an explicit int32_t cast so that
                // the test is correct whether or not the i member is unsigned
                switch (roundingMode) {
                case 1: // odd if not exact
                    if (val2.f != val1.d) {
                        val2.i |= 1;
                    }
                    break;
                case 4: default: // nearest or even
                    break;
                case 5: // down
                    if (val2.f > val1.d) {
                        if (val2.f == 0.f) val2.i = 0x80000001;    // 0 -> subnormal negative
                        else if ((int32_t)val2.i > 0) val2.i--;
                        else val2.i++;
                    }
                    break;
                case 6: // up
                    if (val2.f < val1.d) {
                        if (val2.f == 0.f) val2.i = 0x00000001;    // 0 -> subnormal positive
                        else if ((int32_t)val2.i > 0) val2.i++;
                        else val2.i--;
                    }
                    break;
                case 7: // towards zero
                    // step back only if the conversion increased the magnitude
                    if ((val1.d > 0. && val2.f > val1.d)
                    ||  (val1.d < 0. && val2.f < val1.d)) {
                        val2.i--;
                    }
                    break;
                }
                // check overflow
                overflowS = isinf_f(val2.i) && !isinf_d(val1.q); // detect overflow
                overflowF2 |= overflowS;
                if (overflowS) {                           // check for overflow
                    if (exceptionControl & 1) {            // overflow exception -> NAN
                        val2.q = t->makeNan(nan_overflow_conv, 5); // overflow
                    }
                }
                else if ((exceptionControl & 6) && val2.f == 0.f && val1.d != 0.) {
                    val2.q = t->makeNan(nan_underflow, 5); // underflow exception
                }
                else if ((exceptionControl & 4) && val2.f != val1.d) {
                    val2.q = t->makeNan(nan_inexact, 5);   // inexact exception
                }
            }
            *(uint32_t*)(destination + pos) = val2.i;      // store value
        }
        t->returnType = 0x115;                             // debug return type is float
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    // check overflow traps
    /*
    if (mask.i & MSK_OVERFL_ALL) {
        if      ((mask.i & MSK_OVERFL_SIGN)   && overflowS2) t->interrupt(INT_OVERFL_SIGN);   // signed overflow
        else if ((mask.i & MSK_OVERFL_UNSIGN) && overflowU2) t->interrupt(INT_OVERFL_UNSIGN); // unsigned overflow
        else if ((mask.i & MSK_OVERFL_FLOAT)  && overflowF2) t->interrupt(INT_OVERFL_FLOAT);  // float overflow
    } */
    t->vectorLength[rd] = newLength;                       // save new vector length
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save. result has already been saved
    return 0;
}
|
| 396 |
|
|
|
| 397 |
|
|
static uint64_t expand(CThread * t) {
    // Expand vector RS to a vector of the double length and the double element size.
    // OT specifies the element size or precision of the destination.
    // Half precision -> single precision, 32-bit integer -> 64-bit integer, etc.
    // The result is written directly into vector register RD, limited to
    // MaxVectorLength, and the function returns 0.
    //
    // IM1 bit 1: 0 = sign extend integers, 1 = zero extend integers
    // IM1 bits 2-7: must be zero

    // Operands:
    uint8_t rd = t->operands[0];
    uint8_t rs = t->operands[4];
    uint8_t IM1 = t->parm[4].b;
    if (IM1 & 0xFC) t->interrupt(INT_WRONG_PARAMETERS);
    bool signExtend = (IM1 & 2) == 0;

    uint32_t initLength = t->vectorLength[rs];
    uint32_t newLength = 2 * initLength;
    if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength;
    // Number of source bytes that fit in the destination after doubling.
    // All loops below step through the source, so this is their limit: running
    // them to newLength would read beyond the source data and write 2*newLength
    // bytes, which is past the end of the destination register.
    uint32_t oldLength = newLength / 2;
    uint32_t pos;                                          // position in source vector
    int8_t * source = t->vectors.buf() + (uint32_t)rs * t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + (uint32_t)rd * t->MaxVectorLength; // address of RD data
    if (rd == rs) {
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
        memcpy(t->tempBuffer, source, oldLength);
        source = t->tempBuffer;
    }
    switch (t->operandType) {
    case 0:   // int4 -> int8
        for (pos = 0; pos < oldLength; pos += 1) {
            uint8_t val1 = *(uint8_t*)(source + pos);      // values to convert
            union {
                uint16_t s;
                uint8_t b[2];
                int8_t bs[2];
            } val2;
            if (signExtend) {
                // The low nibble is moved to the top of an 8-bit value before the
                // arithmetic shift. Shifting the promoted int left and right again
                // would leave the nibble unchanged instead of sign extending it
                val2.bs[0] = (int8_t)(uint8_t)(val1 << 4) >> 4; // sign extend low nibble
                val2.bs[1] = (int8_t)val1 >> 4;                 // sign extend high nibble
            }
            else {
                val2.b[0] = val1 & 0xF;                    // zero extend
                val2.b[1] = val1 >> 4;                     // zero extend
            }
            *(uint16_t*)(destination + pos*2) = val2.s;    // store value
        }
        break;
    case 1:   // int8 -> int16
        for (pos = 0; pos < oldLength; pos += 1) {
            uint16_t val = *(uint8_t*)(source + pos);      // value to convert
            if (signExtend) val = uint16_t((int16_t)(val << 8) >> 8);   // sign extend
            *(uint16_t*)(destination + pos*2) = val;       // store value
        }
        break;
    case 2:   // int16 -> int32
        for (pos = 0; pos < oldLength; pos += 2) {
            uint32_t val = *(uint16_t*)(source + pos);     // value to convert
            if (signExtend) val = uint32_t((int32_t)(val << 16) >> 16); // sign extend
            *(uint32_t*)(destination + pos*2) = val;       // store value
        }
        break;
    case 3:   // int32 -> int64
        for (pos = 0; pos < oldLength; pos += 4) {
            uint64_t val = *(uint32_t*)(source + pos);     // value to convert
            if (signExtend) val = uint64_t((int64_t)(val << 32) >> 32); // sign extend
            *(uint64_t*)(destination + pos*2) = val;       // store value
        }
        break;
    case 4:   // int64 -> int128
        for (pos = 0; pos < oldLength; pos += 8) {
            uint64_t valLo = *(uint64_t*)(source + pos);   // value to convert
            uint64_t valHi = 0;
            if (signExtend) valHi = uint64_t((int64_t)valLo >> 63);     // sign extend
            *(uint64_t*)(destination + pos*2) = valLo;     // store low part
            *(uint64_t*)(destination + pos*2 + 8) = valHi; // store high part
        }
        break;
    case 5:   // float16 -> float
        for (pos = 0; pos < oldLength; pos += 2) {
            uint16_t val1 = *(uint16_t*)(source + pos);    // value to convert
            float val2 = half2float(val1);                 // convert half precision to float
            *(float*)(destination + pos*2) = val2;         // store value
        }
        break;
    case 6:   // float -> double
        for (pos = 0; pos < oldLength; pos += 4) {
            SNum val1;
            val1.i = *(uint32_t*)(source + pos);           // value to convert
            double val2 = val1.f;                          // convert to double precision
            // check NAN
            // ForwardCom has right-justified NAN payload, unlike other binary systems
            if (isnan_f(val1.i)) {
                union {                                    // double precision float
                    double d;
                    struct {                               // structure of a NAN
                        uint64_t payload : 51;
                        uint64_t quiet : 1;
                        uint64_t expo : 11;
                        uint64_t sign : 1;
                    };
                } u;
                // All fields are taken from the 32-bit member that was loaded above.
                // The upper half of the 64-bit member was never initialized
                u.payload = val1.i & ((1 << 22) - 1);
                u.quiet = val1.i >> 22 & 1;
                u.expo = 0x7FF;
                u.sign = val1.i >> 31 & 1;
                val2 = u.d;
            }
            *(double*)(destination + pos*2) = val2;        // store value
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    t->vectorLength[rd] = newLength;                       // save new vector length
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save. result has already been saved
    return 0;
}
|
| 512 |
|
|
|
| 513 |
|
|
static uint64_t float2int (CThread * t) {
    // Conversion of floating point to signed or unsigned integer with the same operand size.
    // The rounding mode and overflow control is specified in IM1:
    //   bits 0-2: result on overflow. 4 and 6: zero, 5: signed saturation,
    //             7: unsigned saturation, all other values: wrap around
    //   bit  1  : the result is an unsigned integer
    //   bits 3-4: rounding mode. 0: nearest or even, 1: down, 2: up,
    //             3: towards zero if signed, down if unsigned
    //   bit  5  : a NAN input gives the sign bit pattern rather than zero
    // Returns the integer result. A floating point operand type is changed to the
    // corresponding integer type for the debug output.
    //
    // All three sizes are handled by the same code in double precision. This is
    // exact because every float16 and float value is representable as a double.
    // The range limits are powers of 2, which are exact in floating point
    // (unlike INT_MAX, which is rounded up when converted to float or double).
    SNum a = t->parm[1];
    SNum b = t->parm[4];
    int64_t  result = 0;
    uint32_t dataSize = dataSizeTable[t->operandType];
    uint8_t  roundingMode = b.b >> 3 & 3;
    bool     isUnsigned = (b.b & 2) != 0;
    bool     invalid = false;                    // input is NAN
    bool     supported = true;                   // operand size is 2, 4, or 8 bytes
    double   x = 0.;                             // input value
    double   halfRange = 0.;                     // 2^(bits-1)
    uint64_t sizeMask = 0;                       // all bits of the operand size
    uint64_t signedMax = 0;                      // biggest signed integer
    uint64_t nanResult = 0;                      // result for NAN when IM1 bit 5 is set

    switch (dataSize) {
    case 2:                                      // float16 -> int16
        if (isnan_h(a.s)) invalid = true;
        else x = half2float(a.s);
        halfRange = 32768.;
        sizeMask = 0xFFFFu;  signedMax = 0x7FFF;  nanResult = 0x8000u;
        break;
    case 4:                                      // float -> int32
        if (isnan_f(a.i)) invalid = true;
        else x = a.f;
        halfRange = 2147483648.;
        sizeMask = 0xFFFFFFFFu;  signedMax = 0x7FFFFFFF;  nanResult = sign_f;
        break;
    case 8:                                      // double -> int64
        if (isnan_d(a.q)) invalid = true;
        else x = a.d;
        halfRange = 9223372036854775808.;
        sizeMask = 0xFFFFFFFFFFFFFFFFu;  signedMax = nsign_d;  nanResult = sign_d;
        break;
    default:
        supported = false;
        t->interrupt(INT_WRONG_PARAMETERS);
    }

    if (supported && invalid) {
        // NAN cannot be converted. This test must be outside the conversion path
        result = (int64_t)((b.b & 0x20) ? nanResult : (uint64_t)0);
    }
    else if (supported) {
        // round to an integer value, still in floating point
        double r;
        switch (roundingMode) {
        case 0:                                  // nearest or even
            r = nearbyint(x);
            break;
        case 1:                                  // down
            r = floor(x);
            break;
        case 2:                                  // up
            r = ceil(x);
            break;
        default:                                 // towards zero. Unsigned uses down
            r = (isUnsigned || x >= 0.) ? floor(x) : ceil(x);
            break;
        }
        // overflow if the rounded value is outside the range of the integer type
        const double upperLimit = isUnsigned ? 2. * halfRange : halfRange;  // first value too big
        const double lowerLimit = isUnsigned ? 0. : -halfRange;             // smallest valid value
        const bool overflow = r >= upperLimit || r < lowerLimit;

        // Convert to integer. Values that do not fit the operand size but fit in 64 bits
        // are converted anyway for the sake of the wrap around option. Values beyond
        // 64 bits, including infinity, cannot be converted without undefined behavior
        // and leave result = 0
        const double two63 = 9223372036854775808.;
        if (r >= -two63 && r < two63) {
            result = (int64_t)r;
        }
        else if (r >= two63 && r < 2. * two63) {
            result = (int64_t)(uint64_t)r;
        }

        if (overflow) {
            switch (b.b & 7) {                   // overflow options
            case 0: default:                     // wrap around
                result = (int64_t)((uint64_t)result & sizeMask);
                break;
            case 4: case 6:                      // zero
                result = 0;
                break;
            case 5:                              // signed saturation
                // calculated as unsigned 64 bits to avoid signed integer overflow
                result = (int64_t)(signedMax + (uint64_t)(x < 0));
                break;
            case 7:                              // unsigned saturation
                result = (int64_t)sizeMask;
                break;
            }
        }
    }
    /* Traps not supported
    if (overflow && (mask.i & MSK_OVERFL_SIGN)) {
        t->interrupt(INT_OVERFL_SIGN);                     // signed overflow
        result = dataSizeMask[t->operandType] >> 1;        // INT_MAX
    }
    if (invalid && (mask.i & MSK_FLOAT_NAN_LOSS)) {
        t->interrupt(INT_FLOAT_NAN_LOSS);                  // nan converted to integer
        result = dataSizeMask[t->operandType] >> 1;        // INT_MAX
    } */
    if ((t->operandType & 7) >= 5) t->operandType -= 3;    // debug return type is integer
    return result;
}
|
| 712 |
|
|
|
| 713 |
|
|
static uint64_t int2float (CThread * t) {
    // Conversion of signed or unsigned integer to floating point with same operand size.
    // Input:  parm[1] = integer to convert, parm[4] = IM1 option bits:
    //         bit 0: 0 = input is signed, 1 = input is unsigned
    //         bit 2: return a NAN (nan_inexact) if the integer cannot be represented exactly
    // Output: bit pattern of the float16 / float / double result. Bits above the operand size are zero.
    // Operand sizes other than 2, 4 and 8 bytes raise INT_WRONG_PARAMETERS and give zero.
    SNum a = t->parm[1];
    SNum IM1 = t->parm[4];
    bool isSigned = (IM1.b & 1) == 0;                      // signed integer
    bool inexactX = (IM1.b & 4) != 0;                      // make NAN exception if inexact

    SNum result;
    result.q = 0;                                          // make the bits above the operand size well defined
    uint32_t dataSize = dataSizeTable[t->operandType];
    switch (dataSize) {
    case 2:  // int16 -> float16
        // Exactness is tested in single precision, which holds every 16-bit integer exactly.
        // Converting back to integer would be undefined if the float16 value overflowed to infinity.
        if (isSigned) {
            result.s = float2half(float(a.ss));
            if (inexactX && half2float(result.s) != float(a.ss)) {
                result.q = t->makeNan(nan_inexact, 1);
            }
        }
        else { // unsigned
            result.s = float2half(float(a.s));
            if (inexactX && half2float(result.s) != float(a.s)) {
                result.q = t->makeNan(nan_inexact, 1);
            }
        }
        t->returnType = 0x118;                             // debug return type is float16
        break;

    case 4:  // int32 -> float
        // Exactness is tested in double precision, which holds every 32-bit integer exactly.
        // Converting a float equal to 2^31 or 2^32 back to a 32-bit integer would be undefined.
        if (isSigned) {
            result.f = (float)a.is;
            if (inexactX && (double)result.f != (double)a.is) {
                result.q = t->makeNan(nan_inexact, 5);
            }
        }
        else {
            result.f = (float)a.i;
            if (inexactX && (double)result.f != (double)a.i) {
                result.q = t->makeNan(nan_inexact, 5);
            }
        }
        t->returnType = 0x115;                             // debug return type is float
        break;

    case 8:  // int64 -> double
        // The conversion can round up to 2^63 (signed) or 2^64 (unsigned). These values do not fit
        // the integer type, so they are recognized as inexact before converting back.
        if (isSigned) {
            result.d = (double)a.qs;
            if (inexactX && (result.d >= 9223372036854775808.0 || int64_t(result.d) != a.qs)) {
                result.q = t->makeNan(nan_inexact, 6);
            }
        }
        else {
            result.d = (double)a.q;
            if (inexactX && (result.d >= 18446744073709551616.0 || uint64_t(result.d) != a.q)) {
                result.q = t->makeNan(nan_inexact, 6);
            }
        }
        t->returnType = 0x116;                             // debug return type is double
        break;

    default:
        t->interrupt(INT_WRONG_PARAMETERS);
        result.q = 0;
    }
    return result.q;
}
|
| 777 |
|
|
|
| 778 |
|
|
static uint64_t round_ (CThread * t) {
    // Round floating point to integer in floating point representation.
    // parm[1] = value to round. IM1 (parm[4]) = rounding mode:
    // 0: nearbyint (follows the host rounding mode, normally nearest or even),
    // 1: down, 2: up, 3: towards zero.
    // Only float and double are handled. Any other operand size or rounding mode
    // raises INT_WRONG_PARAMETERS and returns zero.
    SNum a = t->parm[1];
    SNum b = t->parm[4];
    SNum result;
    result.q = 0;                                          // defined return value on every path
    uint32_t dataSize = dataSizeTable[t->operandType];
    if (dataSize == 4) {                                   // float
        switch (b.b) {                                     // rounding mode:
        case 0:                                            // nearest or even
            result.f = nearbyintf(a.f);
            break;
        case 1:                                            // down
            result.f = floorf(a.f);
            break;
        case 2:                                            // up
            result.f = ceilf(a.f);
            break;
        case 3:                                            // towards zero
            result.f = truncf(a.f);
            break;
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
        }
    }
    else if (dataSize == 8) {                              // double
        switch (b.b) {                                     // rounding mode:
        case 0:                                            // nearest or even
            result.d = nearbyint(a.d);
            break;
        case 1:                                            // down
            result.d = floor(a.d);
            break;
        case 2:                                            // up
            result.d = ceil(a.d);
            break;
        case 3:                                            // towards zero
            result.d = trunc(a.d);
            break;
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
        }
    }
    else {
        t->interrupt(INT_WRONG_PARAMETERS);                // unsupported operand size
    }
    return result.q;
}
|
| 823 |
|
|
|
| 824 |
|
|
static uint64_t round2n (CThread * t) {
    // Round to nearest multiple of 2^n:
    // RD = 2^n * round(2^(-n) * RS).
    // n is a signed integer constant in IM1 (parm[4]); RS is parm[1].
    // Method: subtract n from the biased exponent field, round to integer with nearbyint,
    // then add n back to the exponent unless the rounded value is zero.
    // NAN inputs are returned unchanged. Zero and subnormal inputs return zero with the sign kept.
    // Operand types other than float (5) and double (6) raise INT_WRONG_PARAMETERS and return 0.
    // NOTE(review): an input whose scaled exponent reaches the maximum (including infinite inputs
    // with n <= 0) returns inf_f / inf_d as is; presumably that constant carries no sign - confirm.
    const SNum n = t->parm[4];                             // n
    uint32_t scaledExponent;                               // exponent field after subtracting n

    switch (t->operandType) {
    case 5: {                                              // float
        union {
            uint32_t i;
            float f;
            struct {
                uint32_t mantissa : 23;
                uint32_t exponent : 8;
                uint32_t sign     : 1;
            };
        } u;
        u.i = t->parm[1].i;                                // input a
        if (isnan_f(u.i)) return u.i;                      // a is nan
        scaledExponent = u.exponent;
        if (scaledExponent == 0) {                         // a is zero or subnormal. return zero
            u.mantissa = 0;
            return u.i;
        }
        scaledExponent -= n.i;                             // subtract n from exponent
        if ((int32_t)scaledExponent <= 0) return 0;        // underflow
        if ((int32_t)scaledExponent >= 0xFF) return inf_f; // overflow
        u.exponent = scaledExponent;
        u.f = nearbyintf(u.f);                             // round
        if (u.f != 0) u.exponent += n.i;                   // add n to exponent
        return u.i;
    }
    case 6: {                                              // double
        union {
            uint64_t q;
            double d;
            struct {
                uint64_t mantissa : 52;
                uint64_t exponent : 11;
                uint64_t sign     : 1;
            };
        } u;
        u.q = t->parm[1].q;                                // input a
        if (isnan_d(u.q)) return u.q;                      // a is nan
        scaledExponent = u.exponent;
        if (scaledExponent == 0) {                         // a is zero or subnormal. return zero
            u.mantissa = 0;
            return u.q;
        }
        scaledExponent -= n.i;                             // subtract n from exponent
        if ((int32_t)scaledExponent <= 0) return 0;        // underflow
        if ((int32_t)scaledExponent >= 0x7FF) return inf_d;// overflow
        u.exponent = scaledExponent;
        u.d = nearbyint(u.d);                              // round
        if (u.d != 0) u.exponent += n.i;                   // add n to exponent
        return u.q;
    }
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }
}
|
| 897 |
|
|
|
| 898 |
|
|
static uint64_t abs_ (CThread * t) {
    // Absolute value of integer (parm[1]). Floating point types just get the sign bit removed.
    // IM1 (parm[4]) determines handling of overflow, i.e. when the input is the most negative integer:
    // low bits: 0: wrap around, 1: saturate, 2: zero. Other values raise INT_WRONG_PARAMETERS.
    // Bit 2 (value 4) raises INT_OVERFL_SIGN on overflow. It is only reached for
    // the options that do not return early (not for saturate or zero).
    SNum x = t->parm[1];                                   // x
    const SNum option = t->parm[4];                        // option
    const uint64_t sizemask = dataSizeMask[t->operandType];// mask for operand size
    const uint64_t signbit = (sizemask >> 1) + 1;          // just the sign bit

    if ((x.q & signbit) == 0) return x.q;                  // not negative: nothing to do

    if (t->operandType > 4) {                              // floating point types
        return x.q & ~signbit;                             // just remove sign bit
    }
    if ((x.q & sizemask) == signbit) {                     // overflow
        const int mode = option.b & ~4;                    // overflow option without the trap bit
        if (mode == 1) return x.q - 1;                     // saturate
        if (mode == 2) return 0;                           // zero
        if (mode != 0) {                                   // 0 is wrap around
            t->interrupt(INT_WRONG_PARAMETERS);
        }
        if (option.b & 4) {                                // trap
            t->interrupt(INT_OVERFL_SIGN);                 // signed overflow
        }
    }
    x.qs = - x.qs;                                         // change sign
    return x.q;
}
|
| 930 |
|
|
|
| 931 |
|
|
static uint64_t fp_category (CThread * t) {
    // Check if a floating point number (parm[1]) belongs to any of the categories selected by
    // the bits of the constant in parm[4]:
    // bit 0: NAN, 1: zero, 2: -subnormal, 3: +subnormal, 4: -normal, 5: +normal, 6: -infinite, 7: +infinite
    // Operand types 2 and 5 are decoded as float, 3 and 6 as double; other types raise INT_WRONG_PARAMETERS.
    // Returns NUMCONTR with bit 0 replaced by the result of the test.
    const SNum x = t->parm[1];                             // x
    const SNum wanted = t->parm[4];                        // option
    uint32_t exponent;
    bool negative;                                         // sign bit is set
    bool fractionZero;                                     // all mantissa bits are zero
    uint8_t category = 0;                                  // detected category bits

    switch (t->operandType) {
    case 2: case 5:                                        // float
        exponent = x.i >> 23 & 0xFF;                       // isolate exponent
        negative = (x.i >> 31) != 0;
        fractionZero = (x.i << 9) == 0;
        if (exponent == 0xFF) {                            // nan or inf
            category = !fractionZero ? 1 : (negative ? 0x40 : 0x80);
        }
        else if (exponent == 0) {                          // zero or subnormal
            category = fractionZero ? 2 : (negative ? 4 : 8);
        }
        else {                                             // normal
            category = negative ? 0x10 : 0x20;
        }
        break;
    case 3: case 6:                                        // double
        exponent = x.q >> 52 & 0x7FF;                      // isolate exponent
        negative = (x.q >> 63) != 0;
        fractionZero = (x.q << 12) == 0;
        if (exponent == 0x7FF) {                           // nan or inf
            category = !fractionZero ? 1 : (negative ? 0x40 : 0x80);
        }
        else if (exponent == 0) {                          // zero or subnormal
            category = fractionZero ? 2 : (negative ? 4 : 8);
        }
        else {                                             // normal
            category = negative ? 0x10 : 0x20;
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    const uint8_t match = (category & wanted.b) != 0;      // a belongs to at least one indicated category
    if ((t->operandType & 7) >= 5) t->operandType -= 3;    // debug return type is integer
    return (t->numContr & ~(uint64_t)1) | match;           // get remaining bits from NUMCONTR
}
|
| 976 |
|
|
|
| 977 |
|
|
static uint64_t broad_ (CThread * t) {
    // 18: Broadcast 8-bit signed constant into all elements of RD with length RS (31 in RS field gives scalar output).
    // 19: broadcast_max. Broadcast 8-bit constant into all elements of RD with maximum vector length.
    // Elements whose mask bit is zero get the fallback: the corresponding element of RS for op 19
    // when RS < 31, otherwise zero. All elements are written directly to RD here,
    // so the caller is told not to save RD again (running = 2). Returns 0.
    uint8_t rd = t->operands[0];
    uint8_t rs = t->operands[4];
    uint8_t rm = t->operands[1];                           // mask register
    const uint64_t constant = t->parm[2].q;                // constant to broadcast. Never modified in the loop
    uint64_t length;                                       // length of destination vector
    if (t->op == 18) {                                     // length given by RS
        length = t->registers[rs];
        if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    }
    else {                                                 // length is maximum
        length = t->MaxVectorLength;
    }
    uint8_t dsizelog = dataSizeTableLog[t->operandType];   // log2(elementsize)
    length = length >> dsizelog << dsizelog;               // round down to nearest multiple of operand size
    // set length of destination vector
    t->vectorLength[rd] = (uint32_t)length;

    const bool useMask = (rm & 0x1F) != 0x1F;              // a mask register is specified
    const bool hasFallback = t->op != 18 && rs < 31;       // RS is a fallback vector only for broadcast_max
    // loop to set all elements
    for (uint32_t pos = 0; pos < (uint32_t)length; pos += 1 << dsizelog) {
        // Each element starts from the constant. Overwriting the constant itself with a fallback
        // value would leak that value into all following elements that have the mask bit set.
        uint64_t value = constant;
        if (useMask && !(t->readVectorElement(rm, pos) & 1)) { // mask is zero. get fallback
            if (hasFallback) value = t->readVectorElement(rs, pos); // rs is fallback
            else value = 0;                                // there is no fallback. write zero
        }
        t->writeVectorElement(rd, value, pos);             // write vector element
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1008 |
|
|
|
| 1009 |
|
|
static uint32_t byteSwap(uint32_t x) {
    // Reverse the order of the four bytes of x. Used by the byte_reverse function.
    const uint32_t byte0 = x & 0xFFu;                      // least significant byte
    const uint32_t byte1 = (x >> 8) & 0xFFu;
    const uint32_t byte2 = (x >> 16) & 0xFFu;
    const uint32_t byte3 = (x >> 24) & 0xFFu;              // most significant byte
    return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}
|
| 1018 |
|
|
|
| 1019 |
|
|
static uint8_t bitSwap(uint8_t x) {
    // Reverse the order of the eight bits of x. Used by the bit reverse part of byte_reverse.
    uint8_t reversed = 0;
    for (int n = 0; n < 8; n++) {
        reversed = (uint8_t)((reversed << 1) | (x & 1));   // move lowest remaining bit of x to the bottom of reversed
        x = (uint8_t)(x >> 1);
    }
    return reversed;
}
|
| 1025 |
|
|
|
| 1026 |
|
|
static uint64_t byte_reverse (CThread * t) {
    // Reverse the order of bits or bytes in each element of vector.
    // parm[1] = element value. Bit 0 of the immediate operand (parm[2]) selects the operation:
    // 0: byte reverse, 1: bit reverse.
    // Bit reverse of 128-bit elements is not supported and raises INT_WRONG_PARAMETERS.
    SNum a = t->parm[1];                                   // value
    const uint8_t IM1 = t->parm[2].b;                      // immediate operand
    const uint8_t sizeLog = dataSizeTableLog[t->operandType]; // log2(elementsize)

    if ((IM1 & 1) == 0) {
        // byte reverse: Reverse the order of bytes in each element of a vector
        const uint8_t rs = t->operands[4];
        uint32_t low;                                      // byte swapped low 32 bits
        switch (sizeLog) {
        case 0:                                            // 8 bit: nothing to swap
            break;
        case 1:                                            // 16 bit
            a.s = a.s >> 8 | a.b << 8;                     // swap bytes
            break;
        case 2:                                            // 32 bit
            a.i = byteSwap(a.i);
            break;
        case 3:                                            // 64 bit: swap bytes in each half and exchange the halves
            low = byteSwap(a.i);
            a.q = byteSwap(a.q >> 32) | (uint64_t)low << 32;
            break;
        case 4:                                            // 128 bit
            // the reversed low 64 bits are stored in parm[5]
            // NOTE(review): presumably parm[5] is the high part of a 128-bit result - confirm
            low = byteSwap(a.i);
            t->parm[5].q = byteSwap(a.q >> 32) | (uint64_t)low << 32;
            a.q = t->readVectorElement(rs, t->vectorOffset + 8); // high part of input
            low = byteSwap(a.i);
            a.q = byteSwap(a.q >> 32) | (uint64_t)low << 32;
            break;
        }
        return a.q;
    }

    // bit reverse: Reverse the order of bits in each element of vector
    union {
        uint64_t q;
        uint8_t  b[8];
    } in, out;
    in.q = out.q = a.q;
    if (sizeLog <= 3) {
        // reverse the byte order within the element and the bit order within each byte
        const uint32_t numBytes = 1u << sizeLog;
        for (uint32_t k = 0; k < numBytes; k++) {
            out.b[k] = bitSwap(in.b[numBytes - 1 - k]);
        }
    }
    else if (sizeLog == 4) {                               // 128 bit
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return out.q;
}
|
| 1080 |
|
|
|
| 1081 |
|
|
/*
|
| 1082 |
|
|
static uint64_t truth_tab2 (CThread * t) {
|
| 1083 |
|
|
// Boolean function of two inputs, given by a truth table
|
| 1084 |
|
|
SNum a = t->parm[0]; // value
|
| 1085 |
|
|
SNum b = t->parm[1]; // value
|
| 1086 |
|
|
SNum c = t->parm[4]; // truth table
|
| 1087 |
|
|
return ((c.b >> ((a.b & 1) | (b.b & 1) << 1)) & 1) | (a.q & ~uint64_t(1));
|
| 1088 |
|
|
} */
|
| 1089 |
|
|
|
| 1090 |
|
|
static uint64_t bool2bits(CThread * t) {
    // The boolean vector RT is packed into the lower n bits of RD,
    // taking bit 0 of each element. n is the number of whole elements in RT.
    // The length of RD is n bits rounded up to a multiple of 4 bytes, and at least 4 bytes.
    // RD is written directly here, so the caller is told not to save it again (running = 2).
    // Returns 0.

    uint8_t rd = t->operands[0];                           // destination vector
    uint8_t rt = t->operands[4];                           // RT = source vector
    uint8_t * destination = (uint8_t*)t->vectors.buf() + (int64_t)rd * t->MaxVectorLength; // address of RD data
    uint8_t dsizelog = dataSizeTableLog[t->operandType];   // log2(elementsize)
    uint32_t num = t->vectorLength[rt] >> dsizelog;        // number of whole elements in source
    // collect bits into blocks of 32 bits
    uint32_t bitblock = 0;
    // loop through elements of source vector
    for (uint32_t i = 0; i < num; i++) {
        // the bit is kept in an unsigned 32-bit variable so that shifting it to position 31 is well defined
        uint32_t bit = (uint32_t)(t->readVectorElement(rt, i << dsizelog) & 1);
        uint32_t bitindex = i & 31;                        // bit position within 32 bit block of destination
        bitblock |= bit << bitindex;                       // add bit to bitblock
        if (bitindex == 31 || i == num - 1) {              // last bit in this block
            *(uint32_t*)(destination + (i/8 & -4)) = bitblock; // write 32 bit block to destination
            bitblock = 0;                                  // start next block
        }
    }
    // round up length of destination to multiple of 4 bytes
    uint32_t destinationLength = ((num+7)/8 + 3) & -4;
    if (destinationLength == 0) {                          // empty source gives a single zero block
        destinationLength = 4;  *(uint32_t*)destination = 0;
    }
    // set length of destination vector (must be done after reading source because source and destination may be the same)
    t->vectorLength[rd] = destinationLength;
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    if ((t->returnType & 7) >= 5) t->returnType -= 3;      // make return type integer
    return 0;
}
|
| 1129 |
|
|
|
| 1130 |
|
|
static uint64_t bool_reduce(CThread * t) {
    // Integer vector (operand types 0 - 4): bool_reduce.
    // The boolean vector RT is reduced by combining bit 0 of all elements.
    // Bit 0 of the output is the AND combination of all the bits (1 for an empty vector),
    // and bit 1 is the OR combination of all the bits. The remaining bits are zero.
    // Float (5) and double (6) vector: category_reduce.
    // Each bit in the output indicates that at least one element in RT belongs to a certain category:
    // bit 0: NAN, bit 1: zero, bit 2: - subnormal, bit 3: + subnormal,
    // bit 4: - normal, bit 5: + normal, bit 6: - INF, bit 7: + INF
    // Other operand types raise INT_WRONG_PARAMETERS.
    // The result is written to RD as a 64-bit scalar and is also returned.
    const uint8_t rd = t->operands[0];                     // destination vector
    const uint8_t rt = t->operands[4];                     // RT = source vector
    uint64_t result = 0;                                   // result value
    uint8_t * source = (uint8_t*)t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
    const uint32_t elementSize = dataSizeTable[t->operandType];   // vector element size
    const uint8_t dsizelog = dataSizeTableLog[t->operandType];    // log2(elementsize)
    // length of source vector, rounded down to nearest multiple of element size
    const uint32_t length = t->vectorLength[rt] >> dsizelog << dsizelog;
    uint32_t pos = 0;                                      // byte offset of current element

    switch (t->operandType) {
    case 0: case 1: case 2: case 3: case 4: {              // integer types: bool_reduce
        uint8_t allSet = 1;                                // AND combination of all bits
        uint8_t anySet = 0;                                // OR combination of all bits
        for (; pos < length; pos += elementSize) {         // loop through elements of source vector
            const uint8_t bit = source[pos] & 1;           // get bit from source vector element
            anySet |= bit;
            allSet &= bit;
        }
        result = allSet | anySet << 1;
        break;
    }
    case 5:                                                // float type: category_reduce
        for (; pos < length; pos += elementSize) {         // loop through elements of source vector
            const uint32_t val = *(int32_t*)(source + pos);
            const uint8_t exponent = val >> 23 & 0xFF;     // isolate exponent
            const bool negative = (val >> 31) != 0;
            const bool fractionZero = (val << 9) == 0;
            uint8_t category;
            if (exponent == 0xFF) {                        // nan or inf
                category = !fractionZero ? 1 : (negative ? 0x40 : 0x80);
            }
            else if (exponent == 0) {                      // zero or subnormal
                category = fractionZero ? 2 : (negative ? 4 : 8);
            }
            else {                                         // normal
                category = negative ? 0x10 : 0x20;
            }
            result |= category;                            // combine categories
        }
        break;
    case 6:                                                // double type: category_reduce
        for (; pos < length; pos += elementSize) {         // loop through elements of source vector
            const uint64_t val = *(int64_t*)(source + pos);
            const uint32_t exponent = val >> 52 & 0x7FF;   // isolate exponent
            const bool negative = (val >> 63) != 0;
            const bool fractionZero = (val << 12) == 0;
            uint8_t category;
            if (exponent == 0x7FF) {                       // nan or inf
                category = !fractionZero ? 1 : (negative ? 0x40 : 0x80);
            }
            else if (exponent == 0) {                      // zero or subnormal
                category = fractionZero ? 2 : (negative ? 4 : 8);
            }
            else {                                         // normal
                category = negative ? 0x10 : 0x20;
            }
            result |= category;                            // combine categories
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    t->vectorLength[rd] = 8;                               // set length of destination vector to 64 bits
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    // write all 64 bits directly
    // (using writeVectorElement would possibly write less than 64 bits, leaving some of the destination vector unchanged)
    *(uint64_t*)destination = result;
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    if ((t->returnType & 7) >= 5) t->returnType -= 3;      // make return type integer
    return result;
}
|
| 1217 |
|
|
|
| 1218 |
|
|
|
| 1219 |
|
|
static uint64_t push_v(CThread * t) {
    // Push one or more vector registers on a stack pointed to by rd.
    // operands[0] = pointer register, operands[4] = first register to push,
    // low 5 bits of parm[2] = last register to push.
    // For each register the data are written first (rounded up to a multiple of the stack word size,
    // skipped if the vector is empty), then the length is written below the data.
    // Returns the new value of the pointer, which is also written to the pointer register.
    const uint32_t options = t->parm[2].i;
    if (options & 0xE0) {
        t->interrupt(INT_WRONG_PARAMETERS);                // forward-growing stack not supported for vector registers
        return 0;
    }
    const int stack_word_size = 8;
    const uint8_t pointerReg = t->operands[0] & 0x1F;      // pointer register
    const uint8_t firstReg = t->operands[4] & 0x1F;        // first push register
    const uint8_t lastReg = options & 0x1F;                // last push register
    uint64_t pointer = t->registers[pointerReg];
    t->operandType = 3;                                    // must be 64 bits.

    // loop through registers to push
    uint8_t reg = firstReg;                                // current register
    while (reg <= lastReg) {
        const uint32_t length = t->vectorLength[reg];      // length of current register
        if (length != 0) {
            // length rounded up to nearest multiple of stack word size
            const uint32_t paddedLength = (length + stack_word_size - 1) & -stack_word_size;
            pointer -= paddedLength;
            for (uint32_t offset = 0; offset < paddedLength; offset += 8) {
                uint64_t value = t->readVectorElement(reg, offset);
                t->writeMemoryOperand(value, pointer + offset); // write vector
            }
            t->returnType = 0x113;
            t->operands[0] = reg;
            t->listResult(0);
        }
        pointer -= stack_word_size;
        t->writeMemoryOperand(length, pointer);            // write length
        t->returnType = 0x13;
        t->listResult(length);
        reg++;
    }
    t->registers[pointerReg] = pointer;
    t->returnType = 0x13;
    t->operands[0] = pointerReg;
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't store result register
    return pointer;
}
|
| 1259 |
|
|
|
| 1260 |
|
|
static uint64_t pop_v(CThread * t) {
    // Pop one or more vector registers from a stack pointed to by rd.
    // operands[0] = pointer register, operands[4] = first register to pop,
    // low 5 bits of parm[2] = last register to pop.
    // Registers are popped in reverse order, from the last to the first. For each register the
    // length is read first, then the data (rounded up to a multiple of the stack word size).
    // Returns the new value of the pointer, which is also written to the pointer register.
    if (t->parm[2].i & 0xE0) {
        t->interrupt(INT_WRONG_PARAMETERS); return 0;      // forward-growing stack not supported for vector registers
    }
    uint8_t reg0 = t->operands[0] & 0x1F;                  // pointer register
    uint8_t reg1 = t->operands[4] & 0x1F;                  // first pop register
    uint8_t reglast = t->parm[2].i & 0x1F;                 // last pop register
    uint32_t length;                                       // length of current register
    uint32_t length2;                                      // length rounded up to nearest multiple of stack word size
    uint64_t pointer = t->registers[reg0];                 // value of stack pointer or pointer register
    const int stack_word_size = 8;
    t->operandType = 3;                                    // must be 64 bits.
    // Reverse loop through registers to pop.
    // The counter is a signed int: an 8-bit unsigned counter can never go below a first
    // register of 0, so the loop would wrap around to 255 and never terminate.
    for (int reg = reglast; reg >= (int)reg1; reg--) {
        length = (uint32_t)t->readMemoryOperand(pointer);  // read length
        length2 = (length + stack_word_size - 1) & -stack_word_size; // round up to multiple of 8
        t->vectorLength[reg] = length;                     // set vector length
        pointer += stack_word_size;                        // pop length
        if (length != 0) {
            for (uint32_t j = 0; j < length2; j += 8) {    // read vector
                uint64_t value = t->readMemoryOperand(pointer + j); // read from memory
                t->writeVectorElement((uint8_t)reg, value, j);
            }
            pointer += length2;
            t->returnType = 0x113;
            t->operands[0] = (uint8_t)reg;
            t->listResult(0);
        }
        t->returnType = 0x13;
        t->listResult(length);
    }
    t->registers[reg0] = pointer;
    t->returnType = 0x13;
    t->operands[0] = reg0;
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't store result register
    return pointer;
}
|
| 1300 |
|
|
|
| 1301 |
|
|
static uint64_t clear_(CThread * t) {
    // Clear one or more vector registers by setting their length to zero.
    // operands[4] = first register, low 5 bits of parm[2] = last register.
    const uint8_t firstReg = t->operands[4] & 0x1F;        // first register
    const uint8_t lastReg = t->parm[2].i & 0x1F;           // last register
    uint8_t reg = firstReg;                                // current register
    while (reg <= lastReg) {
        t->vectorLength[reg] = 0;
        reg++;
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't store result register
    t->returnType = 0;
    return 0;
}
|
| 1314 |
|
|
|
| 1315 |
|
|
|
| 1316 |
|
|
// Format 1.4 C. One vector register and a broadcast 16-bit immediate operand.
|
| 1317 |
|
|
|
| 1318 |
|
|
static uint64_t move_i16 (CThread * t) {
    // Move 16 bit integer constant (parm[2]) to 16-bit scalar in vector register RD
    const uint8_t destReg = t->operands[0];                // destination vector
    const uint64_t constant = t->parm[2].q;
    t->vectorLength[destReg] = 2;                          // destination is a single 16-bit element
    t->vect = 4;                                           // stop vector loop
    return constant;
}
|
| 1325 |
|
|
|
| 1326 |
|
|
//static uint64_t add_i16 (CThread * t) {return f_add(t);} // Add broadcasted 16 bit constant to 16-bit vector elements
|
| 1327 |
|
|
|
| 1328 |
|
|
static uint64_t and_i16 (CThread * t) {
    // Bitwise AND of parm[1] with the broadcasted 16 bit constant in parm[2]
    const uint64_t element = t->parm[1].q;
    const uint64_t constant = t->parm[2].q;
    return element & constant;
}
|
| 1332 |
|
|
|
| 1333 |
|
|
static uint64_t or_i16 (CThread * t) {
    // Bitwise OR of parm[1] with the broadcasted 16 bit constant in parm[2]
    const uint64_t element = t->parm[1].q;
    const uint64_t constant = t->parm[2].q;
    return element | constant;
}
|
| 1337 |
|
|
|
| 1338 |
|
|
static uint64_t xor_i16 (CThread * t) {
    // Bitwise XOR of parm[1] with the broadcasted 16 bit constant in parm[2]
    const uint64_t element = t->parm[1].q;
    const uint64_t constant = t->parm[2].q;
    return element ^ constant;
}
|
| 1342 |
|
|
|
| 1343 |
|
|
static uint64_t add_h16 (CThread * t) {
    // Add constant to half precision vector.
    // The work is done by the common half precision add function.
    const uint64_t sum = f_add_h(t);
    return sum;
}
|
| 1347 |
|
|
|
| 1348 |
|
|
static uint64_t mul_h16 (CThread * t) {
    // Multiply half precision vector with constant.
    // The work is done by the common half precision multiply function.
    const uint64_t product = f_mul_h(t);
    return product;
}
|
| 1352 |
|
|
|
| 1353 |
|
|
static uint64_t move_8shift8 (CThread * t) {
    // RD = IM2 << IM1. Sign-extend IM2 and shift left by IM1 to make 32/64 bit scalar
    // 40: 32 bit, 41: 64 bit
    // IM2 is the upper byte and IM1 the lower byte of the 16-bit constant in parm[2].
    // NOTE(review): the shift count is read as a signed byte (bs); counts of 128 or more
    // would be negative here - confirm that such counts cannot occur.
    const uint8_t destReg = t->operands[0];                // destination vector
    const bool is64bit = (t->op & 1) != 0;                 // odd op code is the 64 bit version
    t->vectorLength[destReg] = is64bit ? 8 : 4;            // set length of destination
    t->vect = 4;                                           // stop vector loop
    const int64_t im2 = int64_t(t->parm[2].ss) >> 8;       // sign extended upper byte
    return (uint64_t)(im2 << t->parm[2].bs);               // shift left by lower byte
}
|
| 1361 |
|
|
|
| 1362 |
|
|
static uint64_t add_8shift8 (CThread * t) {
    // RD += IM2 << IM1. Sign-extend IM2 and shift left by IM1, add to 32/64 bit vector
    // 42: 32 bit, 43: 64 bit
    // parm[2] is temporarily replaced by the shifted constant so that the common
    // add function can be used. The original constant is restored afterwards.
    const int64_t originalConstant = t->parm[2].qs;
    const int64_t im2 = int64_t(t->parm[2].ss) >> 8;       // sign extended upper byte
    t->parm[2].qs = im2 << t->parm[2].bs;                  // shift left by lower byte
    const int64_t sum = f_add(t);                          // use f_add for getting overflow traps
    t->parm[2].qs = originalConstant;                      // restore constant
    return sum;
}
|
| 1371 |
|
|
|
| 1372 |
|
|
static uint64_t and_8shift8 (CThread * t) {
    // RD &= IM2 << IM1. Sign-extend IM2 and shift left by IM1, AND with 32/64 bit vector
    // 44: 32 bit, 45: 64 bit
    const int64_t im2 = int64_t(t->parm[2].ss) >> 8;       // sign extended upper byte
    const int64_t constant = im2 << t->parm[2].bs;         // shift left by lower byte
    return t->parm[1].q & constant;
}
|
| 1378 |
|
|
|
| 1379 |
|
|
static uint64_t or_8shift8 (CThread * t) {
    // RD |= IM2 << IM1. Sign-extend IM2 and shift left by IM1, OR with 32/64 bit vector
    // 46: 32 bit, 47: 64 bit
    const int64_t im2 = int64_t(t->parm[2].ss) >> 8;       // sign extended upper byte
    const int64_t constant = im2 << t->parm[2].bs;         // shift left by lower byte
    return t->parm[1].q | constant;
}
|
| 1385 |
|
|
|
| 1386 |
|
|
static uint64_t xor_8shift8 (CThread * t) {
    // RD ^= IM2 << IM1. Sign-extend IM2 and shift left by IM1, XOR with 32/64 bit vector
    // 48: 32 bit, 49: 64 bit
    const int64_t im2 = int64_t(t->parm[2].ss) >> 8;       // sign extended upper byte
    const int64_t constant = im2 << t->parm[2].bs;         // shift left by lower byte
    return t->parm[1].q ^ constant;
}
|
| 1392 |
|
|
|
| 1393 |
|
|
static uint64_t move_half2float (CThread * t) {
    // Move converted half precision floating point constant to single precision scalar.
    // The destination becomes a 4-byte vector and the value in parm[2] is returned as the result.
    const uint8_t rd = t->operands[0];                     // destination vector register
    t->vect = 4;                                           // stop vector loop
    t->vectorLengthR = 4;
    t->vectorLength[rd] = 4;                               // set length of destination
    return t->parm[2].q;
}
|
| 1400 |
|
|
|
| 1401 |
|
|
static uint64_t move_half2double (CThread * t) {
    // Move converted half precision floating point constant to double precision scalar.
    // The destination becomes an 8-byte vector and the value in parm[2] is returned as the result.
    const uint8_t rd = t->operands[0];                     // destination vector register
    t->vect = 4;                                           // stop vector loop
    t->vectorLength[rd] = 8;                               // set length of destination
    return t->parm[2].q;
}
|
| 1407 |
|
|
|
| 1408 |
|
|
static uint64_t add_half2float (CThread * t) {
    // Add broadcast half precision floating point constant to single precision vector.
    // The addition itself is delegated unchanged to f_add.
    const uint64_t sum = f_add(t);
    return sum;
}
|
| 1412 |
|
|
|
| 1413 |
|
|
static uint64_t add_half2double (CThread * t) {
    // Add broadcast half precision floating point constant to double precision vector.
    // The addition itself is delegated unchanged to f_add.
    const uint64_t sum = f_add(t);
    return sum;
}
|
| 1417 |
|
|
|
| 1418 |
|
|
static uint64_t mul_half2float (CThread * t) {
    // Multiply broadcast half precision floating point constant with single precision vector.
    // The multiplication itself is delegated unchanged to f_mul.
    const uint64_t product = f_mul(t);
    return product;
}
|
| 1422 |
|
|
|
| 1423 |
|
|
static uint64_t mul_half2double (CThread * t) {
    // Multiply broadcast half precision floating point constant with double precision vector.
    // The multiplication itself is delegated unchanged to f_mul.
    const uint64_t product = f_mul(t);
    return product;
}
|
| 1427 |
|
|
|
| 1428 |
|
|
// Format 2.6 A. Three vector registers and a 32-bit immediate operand.
|
| 1429 |
|
|
|
| 1430 |
|
|
static uint64_t load_hi (CThread * t) {
    // Make vector of two elements. dest[0] = 0, dest[1] = IM2.
    // Both elements are written here directly, so the caller is told not to save RD.
    const uint8_t dest = t->operands[0];                         // destination vector register
    const uint8_t elementSize = dataSizeTable[t->operandType];   // bytes per element
    t->vectorLength[dest] = elementSize * 2;                     // destination holds exactly two elements
    t->writeVectorElement(dest, 0, 0);                           // element 0 = 0
    t->writeVectorElement(dest, t->parm[2].q, elementSize);      // element 1 = IM2
    t->running = 2;                                              // don't save RD
    t->vect = 4;                                                 // stop vector loop
    return 0;
}
|
| 1441 |
|
|
|
| 1442 |
|
|
static uint64_t insert_hi (CThread * t) {
    // Make vector of two elements. dest[0] = src1[0], dest[1] = IM2.
    // Both elements are written here directly, so the caller is told not to save RD.
    const uint8_t dest = t->operands[0];                         // destination vector register
    const uint8_t elementSize = dataSizeTable[t->operandType];   // bytes per element
    t->vectorLength[dest] = elementSize * 2;                     // destination holds exactly two elements
    t->writeVectorElement(dest, t->parm[1].q, 0);                // element 0 = src1
    t->writeVectorElement(dest, t->parm[2].q, elementSize);      // element 1 = IM2
    t->running = 2;                                              // don't save RD
    t->vect = 4;                                                 // stop vector loop
    return 0;
}
|
| 1453 |
|
|
|
| 1454 |
|
|
static uint64_t make_mask (CThread * t) {
    // Make vector where bit 0 of each element comes from bits in IM2, the remaining bits come from RT.
    // Called once per element: the element index selects which bit of IM2 is used (index modulo 32).
    const uint8_t sizeLog = dataSizeTableLog[t->operandType];    // log2(elementsize), read before the type is changed below
    const uint32_t elementIndex = t->vectorOffset >> sizeLog;    // index to vector element
    if ((t->operandType & 7) >= 5) {
        t->operandType -= 3;                                     // debug return type is integer
    }
    const uint64_t upperBits = t->parm[3].q & ~(uint64_t)1;      // everything except bit 0 from mask or numcontr
    const uint64_t bit0 = t->parm[2].i >> (elementIndex & 31) & 1; // selected bit of the constant operand
    return upperBits | bit0;
}
|
| 1463 |
|
|
|
| 1464 |
|
|
static uint64_t replace_ (CThread * t) {
    // Replace elements in RT by constant IM2.
    // Format 2.6: 32 bits, format 3.1: 64 bits.
    // The function only hands back the constant for the current element.
    const uint64_t constant = t->parm[2].q;
    return constant;
}
|
| 1469 |
|
|
|
| 1470 |
|
|
static uint64_t replace_even (CThread * t) {
    // Replace even-numbered elements in RT by constant IM2.
    // Called once per element; odd-numbered elements keep the value in parm[1].
    const uint8_t sizeLog = dataSizeTableLog[t->operandType];    // log2(elementsize)
    const uint32_t elementIndex = t->vectorOffset >> sizeLog;    // index to vector element
    const bool isOdd = (elementIndex & 1) != 0;
    if (isOdd) {
        return t->parm[1].q;                                     // odd element: unchanged
    }
    return t->parm[2].q;                                         // even element: constant
}
|
| 1476 |
|
|
|
| 1477 |
|
|
static uint64_t replace_odd (CThread * t) {
    // Replace odd-numbered elements in RT by constant IM2.
    // Called once per element; even-numbered elements keep the value in parm[1].
    const uint8_t sizeLog = dataSizeTableLog[t->operandType];    // log2(elementsize)
    const uint32_t elementIndex = t->vectorOffset >> sizeLog;    // index to vector element
    const bool isOdd = (elementIndex & 1) != 0;
    if (isOdd) {
        return t->parm[2].q;                                     // odd element: constant
    }
    return t->parm[1].q;                                         // even element: unchanged
}
|
| 1483 |
|
|
|
| 1484 |
|
|
static uint64_t broadcast_32 (CThread * t) {
    // Broadcast 32-bit or 64-bit constant into all elements of RD with length RS (31 in RS field gives scalar output).
    // Elements whose mask bit is 0 are set to zero. A mask register number of 7 or above means no mask.
    // The whole destination is written here, so the caller is told not to save RD.
    uint8_t  rd = t->operands[0];                          // destination vector register
    uint8_t  rs = t->operands[4];                          // g.p. register holding the length, or 31 for scalar
    uint8_t  rm = t->operands[1];                          // mask register
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    uint64_t length;                                       // length of destination in bytes
    int64_t  value;
    if (rs == 31) {
        length = elementSize;                              // scalar output
    }
    else {
        // Round length down to a multiple of elementSize.
        // The right shift must come first: shifting left first only discards the top bits
        // and leaves a length that is not a whole number of elements.
        length = t->registers[rs] >> dsizelog << dsizelog;
    }
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    t->vectorLength[rd] = (uint32_t)length;                // set length of destination
    for (uint32_t pos = 0; pos < length; pos += elementSize) { // loop through vector
        if (rm >= 7 || (t->readVectorElement(rm, pos) & 1)) value = t->parm[2].qs; // check mask
        else value = 0;
        t->writeVectorElement(rd, value, pos);             // write to destination
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1506 |
|
|
|
| 1507 |
|
|
static uint64_t permute (CThread * t) {
    // The vector elements of the input vector are permuted within each block of a given size in bytes.
    // The number of elements in each block is n = block size / operand size.
    // Format 2.2.6 op 1.1: the index pattern is a vector register (last operand).
    // Format 2.6 op 8:     the index pattern is the constant IM2, 4 bits for each element.
    // An element becomes zero if its index is not below n or if its mask bit is 0.
    // The block size must be a power of 2 and no bigger than the maximum vector length.
    const uint8_t rd = t->operands[0];                     // destination
    const uint8_t rm = t->operands[1];                     // mask register
    const uint8_t vin = t->operands[3];                    // input data register, same operand slot in both formats
    const bool constPat = t->fInstr->format2 != 0x226;     // pattern is a constant unless format is 2.2.6
    uint8_t  vpat = 0;                                     // pattern register, if not constant
    uint8_t  bs;                                           // g.p. register containing the block size
    uint32_t pattern = 0;                                  // IM2 = pattern, if constant
    if (constPat) {                                        // format 2.6
        bs = t->operands[4];
        pattern = t->parm[4].i;
    }
    else {                                                 // format 2.2.6
        vpat = t->operands[4];
        bs = t->operands[5];
    }
    const uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    const uint32_t length = t->vectorLength[vin];          // vector length
    t->vectorLength[rd] = length;                          // destination gets the same length as the input
    int8_t * source = t->vectors.buf() + (uint32_t)(vin & 0x1F) * t->MaxVectorLength; // address of source data vector
    if (vin == rd) {
        // source and destination are the same. Work from a temporary copy to avoid overwriting unread elements
        memcpy(t->tempBuffer, source, length);
        source = t->tempBuffer;
    }
    const uint64_t blocksize = t->registers[bs];           // bytes per block
    const bool isPowerOf2 = blocksize != 0 && (blocksize & (blocksize-1)) == 0;
    if (!isPowerOf2 || blocksize > t->MaxVectorLength) {
        t->interrupt(INT_WRONG_PARAMETERS);                // block size must be a power of 2
    }
    else {
        const uint32_t perBlock = (uint32_t)blocksize >> dsizelog; // elements per block
        for (uint32_t block = 0; block < length; block += (uint32_t)blocksize) { // loop through blocks
            for (uint32_t element = 0; element < perBlock; element++) {          // loop through elements within block
                const uint32_t offset = block + (element << dsizelog);           // byte offset of current element
                // index to select block element, from constant or from pattern vector
                const uint64_t index = constPat ? (pattern >> (element&7)*4) & 0xF
                                                : t->readVectorElement(vpat, offset);
                uint64_t value = 0;                        // index out of range or mask = 0
                if (index < perBlock && (rm == 7 || t->readVectorElement(rm, offset) & 1)) { // check mask
                    value = *(uint64_t*)(source + block + ((uint32_t)index << dsizelog)); // pick indexed element from source vector
                }
                t->writeVectorElement(rd, value, offset);  // write destination
            }
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1568 |
|
|
|
| 1569 |
|
|
/*
|
| 1570 |
|
|
static uint64_t replace_bits (CThread * t) {
|
| 1571 |
|
|
// Replace a group of contiguous bits in RT by a specified constant
|
| 1572 |
|
|
SNum a = t->parm[1]; // input operand
|
| 1573 |
|
|
SNum b = t->parm[2]; // input constant
|
| 1574 |
|
|
uint64_t val = b.s; // value of replacement bits
|
| 1575 |
|
|
uint8_t pos = uint8_t(b.i >> 16); // position of replacement
|
| 1576 |
|
|
uint8_t num = uint8_t(b.i >> 24); // number of consecutive bits to replace
|
| 1577 |
|
|
uint64_t mask = ((uint64_t)1 << num) - 1; // mask with num 1-bits
|
| 1578 |
|
|
return (a.q & ~(mask<<pos)) | ((val & mask) << pos);
|
| 1579 |
|
|
}*/
|
| 1580 |
|
|
|
| 1581 |
|
|
// Format 2.5 A. Single format instructions with memory operands or mixed register types
|
| 1582 |
|
|
|
| 1583 |
|
|
static uint64_t store_i32 (CThread * t) {
    // Store 32-bit constant IM2 to memory operand [RS+IM1].
    // Zero is stored instead if bit 0 of the mask is clear.
    const bool maskOn = (t->parm[3].b & 1) != 0;           // check mask
    const uint64_t value = maskOn ? t->parm[2].q : 0;
    t->writeMemoryOperand(value, t->memAddress);
    t->running = 2;                                        // don't save RD
    t->returnType = (t->returnType & 7) | 0x20;
    return 0;
}
|
| 1592 |
|
|
|
| 1593 |
|
|
//static uint64_t fence_ (CThread * t) {return f_nop(t);}
|
| 1594 |
|
|
|
| 1595 |
|
|
static uint64_t compare_swap (CThread * t) {
    // Compare and exchange with address [RS+IM2].
    // The memory operand is replaced by parm[1] if it equals parm[0] within the operand size.
    // The old memory value is returned in either case.
    // to do: use intrinsic compareandexchange or mutex or pause all threads if multiple threads
    const uint64_t expected = t->parm[0].q;                // value that memory is compared with
    const uint64_t replacement = t->parm[1].q;             // value to write on match
    const uint64_t address = t->memAddress;
    const uint64_t sizemask = dataSizeMask[t->operandType]; // mask for operand size
    const uint64_t oldValue = t->readMemoryOperand(address); // read value from memory
    const uint64_t difference = (oldValue ^ expected) & sizemask;
    if (difference == 0) {                                 // value match
        t->writeMemoryOperand(replacement, address);       // write new value to memory
    }
    t->vect = 4;                                           // stop vector loop
    return oldValue;                                       // return old value
}
|
| 1609 |
|
|
|
| 1610 |
|
|
static uint64_t read_insert (CThread * t) {
    // Replace one element in vector RD with the scalar memory operand.
    // The element index is read from the general purpose register in operand slot 4;
    // the byte offset is index * operand size. Nothing is written if the offset is
    // at or beyond the current length of RD.
    const uint8_t rd = t->operands[0];                     // destination vector register
    const uint8_t indexReg = t->operands[4];               // g.p. register holding the element index
    const uint32_t elementSize = dataSizeTable[t->operandType];
    const uint64_t value = t->readMemoryOperand(t->memAddress);
    const uint64_t pos = t->registers[indexReg] * elementSize; // byte offset into RD
    const bool insideVector = pos < t->vectorLength[rd];
    if (insideVector) {
        t->writeVectorElement(rd, value, (uint32_t)pos);
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1624 |
|
|
|
| 1625 |
|
|
static uint64_t extract_store (CThread * t) {
    // Extract one element with size OS from vector RD and store it in the memory operand.
    // The element index is read from the general purpose register in operand slot 4;
    // the byte offset is index * operand size.
    // An offset at or beyond the current length of RD stores zero.
    uint8_t  rd = t->operands[0];                          // source vector register
    uint8_t  rs = t->operands[4];                          // g.p. register holding the element index
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint64_t pos = t->registers[rs] * elementSize;         // byte offset into RD
    uint64_t value = 0;
    // Range check in 64 bits before narrowing (same check as read_insert):
    // a huge index must not wrap around to a valid 32-bit offset and pick up an unrelated element
    if (pos < t->vectorLength[rd]) {
        value = t->readVectorElement(rd, (uint32_t)pos);
    }
    t->writeMemoryOperand(value, t->memAddress);
    t->returnType = (t->returnType & 7) | 0x20;            // debug return type is memory
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    t->vectorLengthR = elementSize;                        // size of memory destination
    return 0;
}
|
| 1639 |
|
|
|
| 1640 |
|
|
|
| 1641 |
|
|
// Format 2.2.6 E. Four vector registers
|
| 1642 |
|
|
|
| 1643 |
|
|
static uint64_t concatenate (CThread * t) {
    // A vector RU of length RT and a vector RS of length RT are concatenated into a vector RD of length 2*RT.
    // Both lengths are limited to the maximum vector length, so the RS part may be cut short.
    // RD may be the same register as RU or RS.
    uint8_t rd = t->operands[0];                           // destination vector register
    uint8_t ru = t->operands[3];                           // first input vector (low part of result)
    uint8_t rs = t->operands[4];                           // second input vector (high part of result)
    uint8_t rt = t->operands[5];                           // g.p. register holding the length of each input
    uint64_t length1 = t->registers[rt];
    if (length1 > t->MaxVectorLength) length1 = t->MaxVectorLength;
    uint32_t length2 = 2 * (uint32_t)length1;
    if (length2 > t->MaxVectorLength) length2 = t->MaxVectorLength;
    t->vectorLength[rd] = length2;                         // set length of destination vector
    int8_t * source1 = t->vectors.buf() + ru*t->MaxVectorLength;     // address of RU data
    int8_t * source2 = t->vectors.buf() + rs*t->MaxVectorLength;     // address of RS data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    // Copy the RS part first. If RD is the same register as RS, copying RU first would
    // overwrite the RS data before it has been read. Doing RS first is also safe when
    // RD is the same register as RU, because only the first length1 bytes of RU are needed.
    // memmove is used because source and destination can be the same address.
    memmove(destination + (uint32_t)length1, source2, length2 - (uint32_t)length1); // copy from RS
    memmove(destination, source1, (uint32_t)length1);      // copy from RU
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1663 |
|
|
|
| 1664 |
|
|
static uint64_t interleave (CThread * t) {
    // Interleave elements of vectors RU and RS of length RT/2 to produce vector RD of length RT.
    // Even-numbered elements of the destination come from RU and odd-numbered elements from RS.
    // Elements whose mask bit is 0 are set to zero. A mask register number of 7 or above means no mask.
    // RD may be the same register as RU or RS.
    uint8_t rd = t->operands[0];                           // destination
    uint8_t ru = t->operands[3];                           // first input vector
    uint8_t rs = t->operands[4];                           // second input vector
    uint8_t rt = t->operands[5];                           // length
    uint8_t rm = t->operands[1];                           // mask
    uint64_t length = t->registers[rt];
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    uint8_t dsizelog = dataSizeTableLog[t->operandType];   // log2(elementsize)
    length = length >> dsizelog << dsizelog;               // round down to nearest multiple of element size
    t->vectorLength[rd] = (uint32_t)length;                // set length of destination
    uint32_t count = (uint32_t)(length >> dsizelog);       // number of destination elements
    // Destination element j is taken from source element j/2. Walking from the last element
    // down to the first means every element written lies above all elements that remain to
    // be read, so the result is correct even when RD is the same register as RU or RS.
    for (uint32_t j = count; j-- > 0; ) {
        uint32_t pos2 = j << dsizelog;                     // byte offset in destination
        uint32_t pos1 = (j >> 1) << dsizelog;              // byte offset in source
        uint64_t value = t->readVectorElement((j & 1) ? rs : ru, pos1); // even from RU, odd from RS
        if (rm < 7 && (t->readVectorElement(rm, pos2) & 1) == 0) value = 0; // mask is 0
        t->writeVectorElement(rd, value, pos2);
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1697 |
|
|
|
| 1698 |
|
|
|
| 1699 |
|
|
// Format 2.2.7 E. Three vector registers and a 16 bit immediate
|
| 1700 |
|
|
|
| 1701 |
|
|
static uint64_t move_bits (CThread * t) {
    // Replace one or more contiguous bits at one position of src1 with contiguous bits from another position of src2.
    // Format 2.0.7 E: general purpose registers
    // Format 2.2.7 E: vector registers
    // The position in src2 is the lower 8 bits of IM2. a = IM2 & 0xFF.
    // The position in src1 is the upper 8 bits of IM2. b = IM2 >> 8.
    // The number of bits to move is c = IM3.
    // Positions of 64 or more address bits outside the 64-bit value handled here:
    // such a source position supplies zero bits, such a destination position leaves src1 unchanged.
    SNum s1 = t->parm[0];                                  // input operand src1
    SNum s2 = t->parm[1];                                  // input operand src2
    SNum im = t->parm[4];                                  // input operand IM2
    SNum mask = t->parm[3];                                // mask
    uint8_t c = t->pInstr->a.im3;                          // input operand IM3 = number of bits
    uint8_t pos1 = im.s >> 8;                              // bit position in src1
    uint8_t pos2 = im.b;                                   // bit position in src2
    // mask of c bits. The guard avoids an undefined shift if c should ever be 64 or more
    uint64_t bitmask = c < 64 ? ((uint64_t)1 << c) - 1 : ~(uint64_t)0;
    // bits picked from src2. Shifting a 64-bit value by 64 or more is undefined, so it is handled explicitly
    uint64_t field = pos2 < 64 ? (s2.q >> pos2) & bitmask : 0;
    uint64_t result = s1.q;
    if (pos1 < 64) {
        result = (s1.q & ~(bitmask << pos1)) | (field << pos1);
    }
    if ((mask.b & 1) == 0) {                               // single format instructions with template E must handle mask here
        result = s1.q;                                     // fallback
        if (t->operands[2] == 31) result = 0;              // fallback = 0
    }
    return result;
}
|
| 1723 |
|
|
|
| 1724 |
|
|
static uint64_t mask_length (CThread * t) {
    // Make a boolean vector to mask the first part of a vector.
    // The output vector RD will have the same length as the input vector RS.
    // RT holds the limit n: bit 0 is set for elements whose index is below n.
    // IM3 contains the following option bits:
    // bit 0 = 0: bit 0 will be 1 in the first part of the output and 0 in the rest.
    // bit 0 = 1: bit 0 will be 0 in the first part of the output and 1 in the rest.
    // bit 1 = 1: copy remaining bits from the input vector into each vector element.
    // bit 2 = 1: copy remaining bits from the numeric control register.
    // bit 4 = 1: broadcast remaining bits from IM2 into all 32-bit words of RD:
    //            Bit 1-7 of IM2 go to bit 1-7 of RD. Bit 8-11 of IM2 go to bit 20-23 of RD. Bit 12-15 of IM2 go to bit 26-29 of RD.
    // Output bits that are not set by any of these options will be zero. If multiple options are specified, the results will be OR'ed.
    // Called once per element; the element index is derived from the current vector offset.
    const uint8_t rd = t->operands[0];                     // destination
    const uint8_t rs = t->operands[3];                     // input vector
    const uint8_t rt = t->operands[4];                     // g.p. register holding n
    const uint8_t options = t->pInstr->a.im3;              // input operand IM3 = options
    t->vectorLengthR = t->vectorLength[rd] = t->vectorLength[rs]; // set length of destination
    const uint8_t sizeLog = dataSizeTableLog[t->operandType];    // log2(elementsize)
    const uint64_t n = t->registers[rt];                   // limit for enabled elements
    const uint32_t elementIndex = t->vectorOffset >> sizeLog;    // current element index
    uint8_t bit = (elementIndex < n) ? 1 : 0;              // element is within the first n
    bit ^= options & 1;                                    // invert option
    uint64_t result = 0;
    if (options & 2) {
        result |= t->parm[0].q;                            // copy remaining bits from input vector element
    }
    if (options & 4) {
        result |= t->numContr;                             // copy remaining bits from NUMCONTR
    }
    if (options & 0x10) {                                  // copy bits from IM2
        const SNum im2 = t->parm[4];                       // input operand IM2
        uint32_t word = (im2.b & ~1) | bit;                // bit 1-7 -> bit 1-7
        word |= (im2.s & 0xF00) << 12;                     // bit 8-11 -> bit 20-23
        word |= (im2.s & 0xF000) << 14;                    // bit 12-15 -> bit 26-29
        const uint64_t wide = word;
        result |= wide | (wide << 32);                     // same bits in both 32-bit halves
    }
    return (result & ~(uint64_t)1) | bit;                  // bit 0 always comes from the length test
}
|
| 1760 |
|
|
|
| 1761 |
|
|
static uint64_t truth_tab3 (CThread * t) {
    // Bitwise boolean function of three inputs, given by a truth table.
    // For each bit position, the bits of the three operands form a 3-bit index
    // (first operand = bit 0 of the index) that selects one bit of the table in IM2.
    // Option bits in IM3:
    // bit 0 or bit 1 set: only bit 0 of the result is calculated.
    // bit 1 set:          the remaining bits are taken from the mask operand (parm[3]).
    SNum a = t->parm[0];                                   // first operand
    SNum b = t->parm[1];                                   // second operand
    SNum c = t->parm[2];                                   // third operand
    SNum mask = t->parm[3];                                // mask register
    uint32_t table = t->pInstr->a.im2;                     // truth table
    uint8_t options = t->pInstr->a.im3;                    // option bits

    uint32_t dataSize = dataSizeTableBits[t->operandType]; // number of bits
    // The operands are handled as 64-bit values here. Shifting a 64-bit value by 64 or more
    // is undefined, so the bit count is limited in case the table reports a wider operand type
    if (dataSize > 64) dataSize = 64;
    if (options & 3) dataSize = 1;                         // only a single bit
    uint64_t result = 0;                                   // calculate result

    for (uint32_t i = dataSize; i-- > 0; ) {               // loop through bits, most significant first
        uint64_t bit_pointer = uint64_t(1) << i;           // selected bit
        uint8_t index = 0;                                 // index into truth table
        if (a.q & bit_pointer) index = 1;
        if (b.q & bit_pointer) index |= 2;
        if (c.q & bit_pointer) index |= 4;
        uint64_t bit = table >> index & 1;                 // lookup in truth table
        result = result << 1 | bit;                        // insert bit into result
    }
    if (options & 2) {                                     // take remaining bits from mask or numcontr
        result |= mask.q & ~(uint64_t)1;
    }
    return result;
}
|
| 1788 |
|
|
|
| 1789 |
|
|
static uint64_t repeat_block (CThread * t) {
    // Repeat a block of data to make a longer vector.
    // RS is input vector containing data block to repeat.
    // IM2 is length in bytes of the block to repeat (must be a nonzero multiple of 4).
    // RT is the length of destination vector RD.
    // RD may be the same register as RS.
    uint8_t rd = t->operands[0];                           // destination vector register
    uint8_t rs = t->operands[3];                           // input vector register
    uint8_t rt = t->operands[4];                           // g.p. register holding destination length
    uint32_t blen = t->parm[4].i;                          // block length
    uint64_t length = t->registers[rt];                    // length of destination
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    if (blen > t->MaxVectorLength) blen = t->MaxVectorLength;
    // Read the source length before the destination length is set:
    // if RD and RS are the same register, the old length would otherwise be lost
    uint32_t sourceLength = t->vectorLength[rs];
    t->vectorLength[rd] = (uint32_t)length;                // set length of destination
    if (blen == 0 || (blen & 3)) {
        t->interrupt(INT_WRONG_PARAMETERS);                // must be a nonzero multiple of 4
    }
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    if (length > sourceLength) {                           // reading beyond the end of the source vector. make sure the rest is zero
        memset(source + sourceLength, 0, size_t(length - sourceLength));
    }
    if (blen != 0) {                                       // a block length of zero would never advance pos
        for (uint32_t pos = 0; pos < length; pos += blen) { // loop through blocks
            uint32_t blen2 = blen;
            if (pos + blen2 > length) blen2 = (uint32_t)length - pos; // avoid last block going too far
            memmove(destination + pos, source, blen2);     // copy block. memmove: first block may be copied onto itself
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1817 |
|
|
|
| 1818 |
|
|
static uint64_t repeat_within_blocks (CThread * t) {
    // Broadcast the first element of each block of data in a vector to the entire block.
    // RS is input vector containing data blocks.
    // IM2 is length in bytes of each block (must be a nonzero multiple of the operand size).
    // RT is length of destination vector RD.
    // The operand size must be at least 4 bytes.
    // RD may be the same register as RS.
    uint8_t rd = t->operands[0];                           // destination vector register
    uint8_t rs = t->operands[3];                           // input vector register
    uint8_t rt = t->operands[4];                           // g.p. register holding destination length
    uint32_t blen = t->parm[4].i;                          // block length
    uint64_t length = t->registers[rt];                    // length of destination
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    if (blen > t->MaxVectorLength) blen = t->MaxVectorLength;
    // Read the source length before the destination length is set:
    // if RD and RS are the same register, the old length would otherwise be lost
    uint32_t sourceLength = t->vectorLength[rs];
    t->vectorLength[rd] = (uint32_t)length;                // set length of destination
    uint32_t elementSize = dataSizeTable[t->operandType];
    if (elementSize < 4 || blen == 0 || (blen & (elementSize - 1))) {
        t->interrupt(INT_WRONG_PARAMETERS);                // must be a nonzero multiple of elementsize
    }
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    if (length > sourceLength) {                           // reading beyond the end of the source vector. make sure the rest is zero
        memset(source + sourceLength, 0, size_t(length - sourceLength));
    }
    if (blen != 0) {                                       // a block length of zero would never advance pos
        for (uint32_t pos = 0; pos < length; pos += blen) { // loop through blocks
            uint32_t blen2 = blen;
            if (pos + blen2 > length) blen2 = (uint32_t)length - pos; // avoid last block going too far
            for (uint32_t i = 0; i < blen2; i += elementSize) {       // loop within block
                // copy first element. memmove: the first element may be copied onto itself
                memmove(destination + pos + i, source + pos, elementSize);
            }
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
|
| 1850 |
|
|
|
| 1851 |
|
|
|
| 1852 |
|
|
// tables of single format instructions
|
| 1853 |
|
|
|
| 1854 |
|
|
// Format 1.3 B. Two vector registers and a broadcast 8-bit immediate operand.
|
| 1855 |
|
|
PFunc funcTab7[64] = {
    // Four entries per row. A zero entry means that no function is assigned to that index.
    gp2vec,       vec2gp,       0,            make_sequence,  //  0 -  3
    insert_,      extract_,     compress,     expand,         //  4 -  7
    0,            0,            0,            0,              //  8 - 11
    float2int,    int2float,    round_,       round2n,        // 12 - 15
    abs_,         fp_category,  broad_,       broad_,         // 16 - 19
    byte_reverse, bitscan_,     popcount_,    0,              // 20 - 23
    0,            bool2bits,    bool_reduce,  0,              // 24 - 27
    0,            0,            0,            0,              // 28 - 31
    0,            0,            0,            0,              // 32 - 35
    0,            0,            0,            0,              // 36 - 39
    0,            0,            0,            0,              // 40 - 43
    0,            0,            0,            0,              // 44 - 47
    0,            0,            0,            0,              // 48 - 51
    0,            0,            0,            0,              // 52 - 55
    push_v,       pop_v,        clear_,       0,              // 56 - 59
    0,            0,            0,            0,              // 60 - 63
};
|
| 1865 |
|
|
|
| 1866 |
|
|
// Format 1.4 C. One vector register and a broadcast 16-bit immediate operand.
|
| 1867 |
|
|
PFunc funcTab8[64] = {
    // Four entries per row. A zero entry means that no function is assigned to that index.
    move_i16,         f_add,            and_i16,          or_i16,           //  0 -  3
    xor_i16,          0,                0,                0,                //  4 -  7
    move_8shift8,     move_8shift8,     add_8shift8,      add_8shift8,      //  8 - 11
    and_8shift8,      and_8shift8,      or_8shift8,       or_8shift8,       // 12 - 15
    xor_8shift8,      xor_8shift8,      0,                0,                // 16 - 19
    0,                0,                0,                0,                // 20 - 23
    0,                0,                0,                0,                // 24 - 27
    0,                0,                0,                0,                // 28 - 31
    move_half2float,  move_half2double, add_half2float,   add_half2double,  // 32 - 35
    mul_half2float,   mul_half2double,  0,                0,                // 36 - 39
    add_h16,          mul_h16,          0,                0,                // 40 - 43
    0,                0,                0,                0,                // 44 - 47
    0,                0,                0,                0,                // 48 - 51
    0,                0,                0,                0,                // 52 - 55
    0,                0,                0,                0,                // 56 - 59
    0,                0,                0,                0,                // 60 - 63
};
|
| 1877 |
|
|
|
| 1878 |
|
|
// Format 2.5 A. Single format instructions with memory operands or mixed register types
|
| 1879 |
|
|
PFunc funcTab10[64] = {
    // Eight entries per row. A zero entry means that no function is assigned to that index.
    // Only the first 48 entries are listed; the remaining entries are zero-initialized.
    0,             0, 0,            0, 0, 0, 0, 0,   //  0 -  7
    store_i32,     0, 0,            0, 0, 0, 0, 0,   //  8 - 15
    f_nop,         0, compare_swap, 0, 0, 0, 0, 0,   // 16 - 23
    0,             0, 0,            0, 0, 0, 0, 0,   // 24 - 31
    read_insert,   0, 0,            0, 0, 0, 0, 0,   // 32 - 39
    extract_store, 0, 0,            0, 0, 0, 0, 0,   // 40 - 47
};
|
| 1887 |
|
|
|
| 1888 |
|
|
|
| 1889 |
|
|
// Format 2.6 A. Three vector registers and a 32-bit immediate operand.
|
| 1890 |
|
|
PFunc funcTab11[64] = {
    // Four entries per row. A zero entry means that no function is assigned to that index.
    // Only the first 16 entries are listed; the remaining entries are zero-initialized.
    load_hi,      insert_hi,    make_mask,    replace_,   //  0 -  3
    replace_even, replace_odd,  broadcast_32, 0,          //  4 -  7
    permute,      0,            0,            0,          //  8 - 11
    0,            0,            0,            0           // 12 - 15
};
|
| 1894 |
|
|
|
| 1895 |
|
|
// Format 3.1 A. Three vector registers and a 64-bit immediate operand.
|
| 1896 |
|
|
PFunc funcTab13[64] = {
    // Eight entries per row. A zero entry means that no function is assigned to that index.
    // Only the first 40 entries are listed; the remaining entries are zero-initialized.
    0,        0,            0, 0, 0, 0, 0, 0,   //  0 -  7
    0,        0,            0, 0, 0, 0, 0, 0,   //  8 - 15
    0,        0,            0, 0, 0, 0, 0, 0,   // 16 - 23
    0,        0,            0, 0, 0, 0, 0, 0,   // 24 - 31
    replace_, broadcast_32, 0, 0, 0, 0, 0, 0,   // 32 - 39
};
|
| 1903 |
|
|
|
| 1904 |
|
|
|
| 1905 |
|
|
// Dispatch functions for single format instruction with E template.
|
| 1906 |
|
|
// (full tables of all possible single format instruction with E template would
|
| 1907 |
|
|
// be too large with most places unused).
|
| 1908 |
|
|
|
| 1909 |
|
|
// Format 2.0.6 E. Four general purpose registers
|
| 1910 |
|
|
static uint64_t dispatch206_1 (CThread * t) {
    // Select the execution function from the operation code.
    // Any other operation code raises INT_UNKNOWN_INST and gives 0.
    if (t->op == 8) {
        return truth_tab3(t);
    }
    t->interrupt(INT_UNKNOWN_INST);
    return 0;
}
|
| 1918 |
|
|
|
| 1919 |
|
|
|
| 1920 |
|
|
// Format 2.0.7 E. Three general purpose registers and a 16-bit immediate constant
|
| 1921 |
|
|
static uint64_t dispatch207_1 (CThread * t) {
    // Select the execution function from the operation code.
    // Any other operation code raises INT_UNKNOWN_INST and gives 0.
    if (t->op == 0) {
        return move_bits(t);
    }
    t->interrupt(INT_UNKNOWN_INST);
    return 0;
}
|
| 1929 |
|
|
|
| 1930 |
|
|
// Format 2.2.6 E. Four vector registers
|
| 1931 |
|
|
static uint64_t dispatch226_1 (CThread * t) {
    // Select the execution function from the operation code.
    // Any other operation code raises INT_UNKNOWN_INST and gives 0.
    if (t->op == 0) return concatenate(t);
    if (t->op == 1) return permute(t);
    if (t->op == 2) return interleave(t);
    if (t->op == 8) return truth_tab3(t);
    t->interrupt(INT_UNKNOWN_INST);
    return 0;
}
|
| 1942 |
|
|
|
| 1943 |
|
|
// Format 2.2.7 E. Three vector registers and a 16-bit immediate constant
|
| 1944 |
|
|
static uint64_t dispatch227_1 (CThread * t) {
    // Select the execution function from the operation code.
    // Any other operation code raises INT_UNKNOWN_INST and gives 0.
    if (t->op == 0) return move_bits(t);
    if (t->op == 1) return mask_length(t);
    if (t->op == 8) return repeat_block(t);
    if (t->op == 9) return repeat_within_blocks(t);
    t->interrupt(INT_UNKNOWN_INST);
    return 0;
}
|
| 1955 |
|
|
|
| 1956 |
|
|
// Table of dispatch functions for all possible single format instructions with E template
|
| 1957 |
|
|
PFunc EDispatchTable[96] = {
    // One row of eight entries per format group x.0 - x.7, in three sections i.1 - i.3.
    // A zero entry means that no dispatch function is assigned.
    //  .0 .1 .2 .3 .4 .5  .6             .7
        0, 0, 0, 0, 0, 0,  dispatch206_1, dispatch207_1,   // 2.0.x i.1
        0, 0, 0, 0, 0, 0,  dispatch226_1, dispatch227_1,   // 2.2.x i.1
        0, 0, 0, 0, 0, 0,  0,             0,               // 3.0.x i.1
        0, 0, 0, 0, 0, 0,  0,             0,               // 3.2.x i.1

        0, 0, 0, 0, 0, 0,  0,             0,               // 2.0.x i.2
        0, 0, 0, 0, 0, 0,  0,             0,               // 2.2.x i.2
        0, 0, 0, 0, 0, 0,  0,             0,               // 3.0.x i.2
        0, 0, 0, 0, 0, 0,  0,             0,               // 3.2.x i.2

        0, 0, 0, 0, 0, 0,  0,             0,               // 2.0.x i.3
        0, 0, 0, 0, 0, 0,  0,             0,               // 2.2.x i.3
        0, 0, 0, 0, 0, 0,  0,             0,               // 3.0.x i.3
        0, 0, 0, 0, 0, 0,  0,             0                // 3.2.x i.3
};
|