OpenCores
URL https://opencores.org/ocsvn/forwardcom/forwardcom/trunk

Subversion Repositories forwardcom

[/] [forwardcom/] [bintools/] [emulator5.cpp] - Blame information for rev 163

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 56 Agner
/****************************  emulator5.cpp  ********************************
2
* Author:        Agner Fog
3
* date created:  2018-02-18
4
* Last modified: 2021-06-30
5
* Version:       1.11
6
* Project:       Binary tools for ForwardCom instruction set
7
* Description:
8
* Emulator: Execution functions for single format instructions, continued
9
*
10
* Copyright 2018-2021 GNU General Public License http://www.gnu.org/licenses
11
*****************************************************************************/
12
 
13
#include "stdafx.h"
14
 
15
// Format 1.3 B. Two vector registers and a broadcast 8-bit immediate operand.
16
 
17
static uint64_t gp2vec (CThread * t) {
    // Copy the value of general purpose register RS into vector register RD as a scalar.
    // The register value is returned to the caller; this function only sets the
    // length of RD to one element of the current operand type.
    const uint8_t destReg   = t->operands[0];              // RD: destination vector register
    const uint8_t sourceReg = t->operands[4];              // RS: source general purpose register
    const uint64_t value    = t->registers[sourceReg];     // fetch the general purpose register
    // destination holds exactly one element
    t->vectorLength[destReg] = dataSizeTable[t->operandType];
    // only one element is produced: stop vector loop
    t->vect = 4;
    return value;
}
26
 
27
static uint64_t vec2gp (CThread * t) {
    // Copy the first element of vector register RS into general purpose register RD.
    // The element is truncated to the operand size, or to the length of RS if that is shorter.
    // The register is written here directly, so the caller is told not to save RD.
    const uint8_t destReg   = t->operands[0];              // RD: destination general purpose register
    const uint8_t sourceReg = t->operands[4];              // RS: source vector register

    // number of bytes to transfer, limited to the vector length
    uint8_t numBytes = dataSizeTable[t->operandType];
    if (t->vectorLength[sourceReg] < numBytes) {
        numBytes = t->vectorLength[sourceReg];
    }

    // read 64 bits directly from the start of the vector register
    uint64_t value = *(uint64_t*)(t->vectors.buf() + t->MaxVectorLength*sourceReg);
    if (numBytes < 8) {
        // keep only the low numBytes bytes
        const uint64_t keepMask = ((uint64_t)1 << numBytes*8) - 1;
        value &= keepMask;
    }

    t->registers[destReg] = value;                         // write to general purpose register
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    t->returnType &= ~ 0x100;                              // debug return type not vector
    return value;
}
41
 
42
static uint64_t make_sequence (CThread * t) {
    // Make a vector with RS sequential numbers. First value is IM1.
    // RD receives the sequence IM1, IM1+1, IM1+2, ... in the current operand type.
    // The number of elements is taken from general purpose register RS and is
    // limited so that the vector does not exceed the maximum vector length.
    // The result is written directly to RD; the return value is unused (0).
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    int32_t  val = int8_t(t->pInstr->b[0]);      // immediate operand, sign extended integer
    uint64_t num = t->registers[rs];             // number of elements
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    SNum temp;

    // Limit length. The element count is compared before it is scaled to bytes:
    // scaling first (num << dsizelog) can wrap around for a very large count and
    // would then produce a short or empty vector instead of a full-length one.
    uint64_t maxElements = (uint64_t)t->MaxVectorLength >> dsizelog;
    uint64_t length;                             // vector length in bytes
    if (num > maxElements) {
        length = t->MaxVectorLength;
    }
    else {
        length = num << dsizelog;
    }
    // set length of rd
    t->vectorLength[rd] = (uint32_t)length;
    // loop through destination vector
    for (uint32_t pos = 0; pos < length; pos += elementSize) {
        switch (t->operandType) {
        case 0: case 1: case 2: case 3:          // int8 - int64
            t->writeVectorElement(rd, (uint64_t)(int64_t)val, pos);  break;
        case 4:                                  // int128: low part, then sign extension
            t->writeVectorElement(rd, (uint64_t)(int64_t)val, pos);
            t->writeVectorElement(rd, (uint64_t)((int64_t)val >> 63), pos+8); break;
        case 5:   // float
            temp.f = float(val);                 // convert to float
            t->writeVectorElement(rd, temp.q, pos);
            break;
        case 6:   // double
            temp.d = double(val);                // convert to double
            t->writeVectorElement(rd, temp.q, pos);
            break;
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
        }
        val++;                                   // increment value
    }
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save RD
    return 0;
}
83
 
84
static uint64_t compress(CThread * t) {
    // Compress vector RS to a vector of half the length and half the element size.
    // Double precision -> single precision, 64-bit integer -> 32-bit integer, etc.
    // The operand type specifies the element size of the source.
    //
    // IM1 bits 0-2: integer overflow handling (0: wrap around, 4: signed overflow gives zero,
    //               5: signed saturation, 6: unsigned overflow gives zero, 7: unsigned saturation),
    //               or floating point exception control (1: overflow, 2: underflow, 4: inexact,
    //               0: take from mask register, 7: none).
    // IM1 bits 3-5: floating point rounding mode (0: take from mask register).
    // IM1 bits 6-7: must be zero.
    //
    // The result is written directly to RD; the return value is unused (0).

    // operands:
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t IM1 = t->parm[4].b;
    if (IM1 & 0xC0) t->interrupt(INT_WRONG_PARAMETERS);
    uint32_t oldLength = t->vectorLength[rs];    // length of source in bytes
    uint32_t newLength = oldLength / 2;          // length of destination in bytes
    uint32_t pos;                                // position in destination vector
    uint8_t overflowU  = 0;                      // unsigned overflow in current element
    uint8_t overflowS  = 0;                      // signed overflow in current element
    uint8_t overflowU2 = 0;                      // unsigned overflow in any element
    uint8_t overflowS2 = 0;                      // signed overflow in any element
    uint8_t overflowF2 = 0;                      // floating point overflow in any element
    SNum mask = t->parm[3];                      // options mask
    // The destination position never runs ahead of the source position (pos <= 2*pos),
    // so no temporary copy is made when rd == rs.
    int8_t * source = t->vectors.buf() + (uint64_t)rs * t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + (uint64_t)rd * t->MaxVectorLength; // address of RD data

    uint8_t roundingMode = (IM1 >> 3) & 7;       // floating point rounding mode
    if (roundingMode == 0) roundingMode = ((t->parm[3].i >> MSKI_ROUNDING) & 7) | 4;
    uint8_t exceptionControl = IM1 & 7;          // floating point exception enable bits:
                                                 // 1: overflow, 2: underflow, 4: inexact
    if (exceptionControl == 0) {                 // floating point exception control
        exceptionControl = mask.i >> (MSKI_EXCEPTIONS + 1) & 7; // exceptions from NUMCONTR
    }
    else if (exceptionControl == 7) {
        exceptionControl = 0;                    // 7 means none (5 means all)
    }

    switch (t->operandType) {                    //  source operand type
    case 0:   // int8 -> int4
        for (pos = 0; pos < newLength; pos += 1) {
            union {
                uint16_t s;
                uint8_t b[2];
            } u;
            u.s = *(uint16_t*)(source + 2*pos);  // two values to convert to one byte
            for (int i = 0; i < 2; i++) {        // loop for two bytes to convert
                uint8_t val = u.b[i];
                overflowU = val > 0x0F;          // unsigned overflow
                // Signed overflow. The difference must be reduced to 8 bits before comparing:
                // without the cast it is evaluated as int and the test is never true.
                overflowS = (uint8_t)(val - 0xF8) > 0x0F;
                overflowU2 |= overflowU;  overflowS2 |= overflowS;
                switch (IM1 & 7) {
                case 0: default:                 // wrap around
                    break;
                case 4:                          // signed integer overflow gives zero
                    if (overflowS) val = 0;
                    break;
                case 5:                          // signed integer overflow gives signed saturation
                    if (overflowS) val = 0x7 + (val >> 7);
                    break;
                case 6:                          // unsigned integer overflow gives zero
                    if (overflowU) val = 0;
                    break;
                case 7:                          // unsigned integer overflow gives unsigned saturation
                    if (overflowU) val = 0xF;
                    break;
                }
                u.b[i] = val;
            }
            uint8_t val2 = (u.b[0] & 0xF) | u.b[1] << 4;
            *(uint8_t*)(destination + pos) = val2;         // store two values
        }
        t->returnType = 0x110;
        break;
    case 1:   // int16 -> int8
        for (pos = 0; pos < newLength; pos += 1) {
            uint16_t val = *(uint16_t*)(source + 2*pos);   // value to convert
            overflowU = val > 0xFF;                        // unsigned overflow
            // Signed overflow. Reduce to 16 bits before comparing (see int8 case)
            overflowS = (uint16_t)(val - 0xFF80) > 0xFF;
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:          // signed integer overflow gives zero
                if (overflowS) val = 0;
                break;
            case 5:          // signed integer overflow gives signed saturation
                if (overflowS) val = 0x7F + (val >> 15);
                break;
            case 6:          // unsigned integer overflow gives zero
                if (overflowU) val = 0;
                break;
            case 7:          // unsigned integer overflow gives unsigned saturation
                if (overflowU) val = 0xFF;
                break;
            }
            *(uint8_t*)(destination + pos) = (uint8_t)val; // store value
        }
        t->returnType = 0x110;
        break;
    case 2:   // int32 -> int16
        for (pos = 0; pos < newLength; pos += 2) {
            uint32_t val = *(uint32_t*)(source + 2*pos);   // value to convert
            overflowU = val > 0xFFFF;                      // unsigned overflow
            overflowS = val - 0xFFFF8000 > 0xFFFF;         // signed overflow
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:          // signed integer overflow gives zero
                if (overflowS) val = 0;
                break;
            case 5:          // signed integer overflow gives signed saturation
                if (overflowS) val = 0x7FFF + (val >> 31);
                break;
            case 6:          // unsigned integer overflow gives zero
                if (overflowU) val = 0;
                break;
            case 7:          // unsigned integer overflow gives unsigned saturation
                if (overflowU) val = 0xFFFF;
                break;
            }
            *(uint16_t*)(destination + pos) = (uint16_t)val; // store value
        }
        t->returnType = 0x111;
        break;
    case 3:   // int64 -> int32
        for (pos = 0; pos < newLength; pos += 4) {
            uint64_t val = *(uint64_t*)(source + 2*pos);  // value to convert
            overflowU = val > 0xFFFFFFFFU;                // unsigned overflow
            overflowS = val - 0xFFFFFFFF80000000 > 0xFFFFFFFFU; // signed overflow
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:          // signed integer overflow gives zero
                if (overflowS) val = 0;
                break;
            case 5:          // signed integer overflow gives signed saturation
                if (overflowS) val = 0x7FFFFFFF + (val >> 63);
                break;
            case 6:          // unsigned integer overflow gives zero
                if (overflowU) val = 0;
                break;
            case 7:          // unsigned integer overflow gives unsigned saturation
                if (overflowU) val = 0xFFFFFFFF;
                break;
            }
            *(uint32_t*)(destination + pos) = (uint32_t)val; // store value
        }
        t->returnType = 0x112;
        break;
    case 4:   // int128 -> int64
        for (pos = 0; pos < newLength; pos += 8) {
            uint64_t valLo = *(uint64_t*)(source + 2*pos);      // value to convert, low part
            uint64_t valHi = *(uint64_t*)(source + 2*pos + 8);  // value to convert, high part
            overflowU = valHi != 0;                             // unsigned overflow
            // signed overflow: high part must be the sign extension of the low part
            if ((int64_t)valLo < 0) overflowS = valHi+1 != 0;
            else overflowS = valHi != 0;
            overflowU2 |= overflowU;  overflowS2 |= overflowS;
            switch (IM1 & 7) {
            case 0: default: // wrap around
                break;
            case 4:          // signed integer overflow gives zero
                if (overflowS) valLo = 0;
                break;
            case 5:          // signed integer overflow gives signed saturation
                if (overflowS) valLo = nsign_d + (valHi >> 63);
                break;
            case 6:          // unsigned integer overflow gives zero
                if (overflowU) valHi = valLo = 0;
                break;
            case 7:          // unsigned integer overflow gives unsigned saturation
                if (overflowU) valLo = 0xFFFFFFFFFFFFFFFF;
                break;
            }
            *(uint64_t*)(destination + pos) = valLo;            // store value
        }
        t->returnType = 0x113;
        break;
    case 5:   // float -> float16
        for (pos = 0; pos < newLength; pos += 2) {
            SNum val;
            val.i = *(uint32_t*)(source + 2 * pos);        // value to convert
            uint16_t val2 = float2half(val.f);             // convert to half precision
            if (!isnan_or_inf_f(val.i)) {
                float back = half2float(val2);             // converted value, for comparing with the exact value
                // Zero tests below mask off the sign bit. Testing (val2 << 1) == 0 does not
                // work because val2 is promoted to int before shifting.
                // check rounding mode
                switch (roundingMode) {
                case 1:          // odd if not exact
                    if (back != val.f) val2 |= 1;
                    break;
                case 4: default: // nearest or even
                    break;
                case 5:          // down
                    if (back > val.f) {
                        if ((val2 & 0x7FFF) == 0) val2 = 0x8001; // 0 -> subnormal negative
                        else if (int16_t(val2) > 0) val2--;
                        else val2++;
                    }
                    break;
                case 6:          // up
                    if (back < val.f) {
                        if ((val2 & 0x7FFF) == 0) val2 = 0x0001; // 0 -> subnormal positive
                        else if (int16_t(val2) > 0) val2++;
                        else val2--;
                    }
                    break;
                case 7:          // towards zero
                    // step one down in magnitude only if the conversion rounded away from zero
                    if ((val.f > 0.f && back > val.f) || (val.f < 0.f && back < val.f)) {
                        val2--;
                    }
                    break;
                }
                // check overflow
                overflowS = (val2 & 0x7FFF) == 0x7C00 && !isinf_f(val.i);// detect overflow
                overflowF2 |= overflowS;
                if (overflowS) {                               // check for overflow
                    if (exceptionControl & 1) {                // overflow exception -> NAN
                        val2 = (uint16_t)t->makeNan(nan_overflow_conv, 1);  // overflow
                    }
                }
                else if ((exceptionControl & 6) && (val2 & 0x7FFF) == 0 && val.f != 0.f) {
                    val2 = (uint16_t)t->makeNan(nan_underflow, 1); // underflow exception (inexact implies underflow)
                }
                else if ((exceptionControl & 4) && half2float(val2) != val.f) {
                    val2 = (uint16_t)t->makeNan(nan_inexact, 1);   // inexact exception
                }
            }
            *(uint16_t*)(destination + pos) = val2;        // store value
        }
        t->returnType = 0x118;
        break;
    case 6:   // double -> float
        for (pos = 0; pos < newLength; pos += 4) {
            SNum val1, val2;
            val1.q = *(uint64_t*)(source + 2 * pos);       // value to convert
            // check NAN and INF
            if (isnan_or_inf_d(val1.q)) {
                union {                                    // single precision float
                    float f;
                    struct {                               // structure of a NAN
                        uint32_t payload : 22;
                        uint32_t quiet : 1;
                        uint32_t expo : 8;
                        uint32_t sign : 1;
                    };
                } u;
                u.payload = val1.i & ((1 << 22) - 1);      // ForwardCom has right-justified NAN payload, unlike other binary systems
                u.quiet = val1.q >> 51 & 1;
                u.expo = 0xFF;
                u.sign = val1.q >> 63 & 1;
                val2.f = u.f;
            }
            else {
                val2.f = float(val1.d);                    // convert to single precision
                // check rounding mode
                switch (roundingMode) {
                case 1:          // odd if not exact
                    if (val2.f != val1.d) {
                        val2.i |= 1;
                    }
                    break;
                case 4: default: // nearest or even
                    break;
                case 5:          // down
                    if (val2.f > val1.d) {
                        if (val2.f == 0.f) val2.i = 0x80000001; // 0 -> subnormal negative
                        else if (val2.is > 0) val2.i--;    // positive: smaller magnitude (signed test needed)
                        else val2.i++;                     // negative: bigger magnitude
                    }
                    break;
                case 6:          // up
                    if (val2.f < val1.d) {
                        if (val2.f == 0.f) val2.i = 0x00000001; // 0 -> subnormal positive
                        else if (val2.is > 0) val2.i++;    // positive: bigger magnitude (signed test needed)
                        else val2.i--;                     // negative: smaller magnitude
                    }
                    break;
                case 7:         // towards zero
                    // step one down in magnitude only if the conversion rounded away from zero
                    if ((val1.d > 0. && val2.f > val1.d) || (val1.d < 0. && val2.f < val1.d)) {
                        val2.i--;
                    }
                    break;
                }
                // check overflow
                overflowS = isinf_f(val2.i) && !isinf_d(val1.q); // detect overflow
                overflowF2 |= overflowS;
                if (overflowS) {                               // check for overflow
                    if (exceptionControl & 1) {                // overflow exception -> NAN
                        val2.q = t->makeNan(nan_overflow_conv, 5);  // overflow
                    }
                }
                else if ((exceptionControl & 6) && val2.f == 0.f && val1.d != 0.) {
                    val2.q = t->makeNan(nan_underflow, 5);     // underflow exception
                }
                else if ((exceptionControl & 4) && val2.f != val1.d) {
                    val2.q = t->makeNan(nan_inexact, 5);       // inexact exception
                }
            }
            *(uint32_t*)(destination + pos) = val2.i;          // store value
        }
        t->returnType = 0x115;
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    // check overflow traps
    // Traps are not supported. The overflowU2/S2/F2 accumulators are only needed by this code.
    /*
    if (mask.i & MSK_OVERFL_ALL) {
        if      ((mask.i & MSK_OVERFL_SIGN)   && overflowS2) t->interrupt(INT_OVERFL_SIGN);   // signed overflow
        else if ((mask.i & MSK_OVERFL_UNSIGN) && overflowU2) t->interrupt(INT_OVERFL_UNSIGN); // unsigned overflow
        else if ((mask.i & MSK_OVERFL_FLOAT)  && overflowF2) t->interrupt(INT_OVERFL_FLOAT);  // float overflow
    } */
    t->vectorLength[rd] = newLength;             // save new vector length
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save. result has already been saved
    return 0;
}
396
 
397
static uint64_t expand(CThread * t) {
    // Expand vector RS to a vector of the double length and the double element size.
    // OT specifies the element size or precision of the destination.
    // Half precision -> single precision, 32-bit integer -> 64-bit integer, etc.
    //
    // IM1 bit 1: 0 = sign extend integers, 1 = zero extend integers.
    // IM1 bits 2-7 must be zero.
    // If the doubled length exceeds the maximum vector length then the result is
    // truncated to the maximum vector length.
    // The result is written directly to RD; the return value is unused (0).

    // Operands:
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t IM1 = t->parm[4].b;
    if (IM1 & 0xFC) t->interrupt(INT_WRONG_PARAMETERS);
    bool signExtend = (IM1 & 2) == 0;

    uint32_t initLength = t->vectorLength[rs];
    uint32_t newLength = 2 * initLength;         // length of destination in bytes
    if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength;
    // Number of source bytes that are actually converted. The loops below must stop here:
    // each source byte produces two destination bytes, so running the source position up to
    // newLength would write 2*newLength bytes and overrun the destination register.
    uint32_t oldLength = newLength / 2;
    uint32_t pos;                                // position in source vector
    int8_t * source = t->vectors.buf() + (uint32_t)rs * t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + (uint32_t)rd * t->MaxVectorLength; // address of RD data
    if (rd == rs) {
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
        memcpy(t->tempBuffer, source, oldLength);
        source = t->tempBuffer;
    }
    switch (t->operandType) {
    case 0:   // int4 -> int8
        for (pos = 0; pos < oldLength; pos += 1) {
            uint8_t val1 = *(uint8_t*)(source + pos);  // values to convert
            union {
                uint16_t s;
                uint8_t b[2];
                int8_t bs[2];
            } val2;
            if (signExtend) {
                // The shifted value must be truncated to 8 bits before shifting back;
                // otherwise the shifts are done on int and cancel each other out.
                val2.bs[0] = (int8_t)(val1 << 4) >> 4; // sign extend low nibble
                val2.bs[1] = (int8_t)val1 >> 4;        // sign extend high nibble
            }
            else {
                val2.b[0] = val1 & 0xF;                // zero extend
                val2.b[1] = val1 >> 4;                 // zero extend
            }
            *(uint16_t*)(destination + pos*2) = val2.s;         // store value
        }
        break;
    case 1:   // int8 -> int16
        for (pos = 0; pos < oldLength; pos += 1) {
            uint16_t val = *(uint8_t*)(source + pos);  // value to convert
            if (signExtend) val = uint16_t((int16_t)(val << 8) >> 8);   // sign extend
            *(uint16_t*)(destination + pos*2) = val; // store value
        }
        break;
    case 2:   // int16 -> int32
        for (pos = 0; pos < oldLength; pos += 2) {
            uint32_t val = *(uint16_t*)(source + pos);  // value to convert
            if (signExtend) val = uint32_t((int32_t)(val << 16) >> 16);   // sign extend
            *(uint32_t*)(destination + pos*2) = val; // store value
        }
        break;
    case 3:   // int32 -> int64
        for (pos = 0; pos < oldLength; pos += 4) {
            uint64_t val = *(uint32_t*)(source + pos);  // value to convert
            if (signExtend) val = uint64_t((int64_t)(val << 32) >> 32);   // sign extend
            *(uint64_t*)(destination + pos*2) = val; // store value
        }
        break;
    case 4:   // int64 -> int128
        for (pos = 0; pos < oldLength; pos += 8) {
            uint64_t valLo = *(uint64_t*)(source + pos);   // value to convert
            uint64_t valHi = 0;
            if (signExtend) valHi = uint64_t((int64_t)valLo >> 63);   // sign extend
            *(uint64_t*)(destination + pos*2) = valLo;     // store low part
            *(uint64_t*)(destination + pos*2 + 8) = valHi; // store high part
        }
        break;
    case 5:   // float16 -> float
        for (pos = 0; pos < oldLength; pos += 2) {
            uint16_t val1 = *(uint16_t*)(source + pos);    // value to convert
            float val2 = half2float(val1);                 // convert half precision to float
            *(float*)(destination + pos*2) = val2;         // store value
        }
        break;
    case 6:   // float -> double
        for (pos = 0; pos < oldLength; pos += 4) {
            SNum val1;
            val1.i = *(uint32_t*)(source + pos);           // value to convert
            double val2 = val1.f;                          // convert to double precision
            // check NAN
            // ForwardCom has right-justified NAN payload, unlike other binary systems
            if (isnan_f(val1.i)) {
                union {                                    // double precision float
                    double d;
                    struct {                               // structure of a NAN
                        uint64_t payload : 51;
                        uint64_t quiet   : 1;
                        uint64_t expo    : 11;
                        uint64_t sign    : 1;
                    };
                } u;
                // Only the 32-bit member val1.i has been assigned, so all fields are taken from it
                u.payload = val1.i & ((1 << 22) - 1);
                u.quiet = val1.i >> 22 & 1;
                u.expo = 0x7FF;
                u.sign = val1.i >> 31 & 1;                 // sign bit of the single precision value
                val2 = u.d;
            }
            *(double*)(destination + pos*2) = val2;        // store value
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    t->vectorLength[rd] = newLength;                       // save new vector length
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save. result has already been saved
    return 0;
}
512
 
513
static uint64_t float2int (CThread * t) {
    // Conversion of floating point to signed or unsigned integer with the same operand size.
    // The rounding mode and overflow control is specified in IM1:
    //   bits 0-2: overflow handling (0: wrap around, 4 and 6: zero,
    //             5: signed saturation, 7: unsigned saturation).
    //             Bit 1 also selects unsigned conversion.
    //   bits 3-4: rounding mode (0: nearest or even, 1: down, 2: up, 3: towards zero)
    //   bit 5:    NAN input gives the value with only the sign bit set rather than zero
    // Returns the integer result. The operand type is changed to the corresponding
    // integer type for the debug output.
    SNum a = t->parm[1];                         // floating point input
    SNum b = t->parm[4];                         // IM1 option bits
    int64_t result = 0;
    uint32_t dataSize = dataSizeTable[t->operandType];
    uint8_t roundingMode = b.b >> 3 & 3;
    uint8_t signMode = roundingMode | (b.b & 2) << 1; // bit 0-1: rounding mode, bit 2: unsigned
    bool overflow = false;
    bool invalid = false;                        // input is NAN. Only needed by the unsupported traps

    if (dataSize == 2) {  // float16 -> int16
        const float max = (float)(int32_t)0x7FFF;
        const float min = -max - 1.0f;
        const float umax = (float)(uint32_t)0xFFFFu;
        if (isnan_h(a.s)) {
            // The NAN result must be set in this branch. It has no effect if placed
            // in the non-NAN branch below.
            invalid = true;
            result = (b.b & 0x20) ? 0x8000u : 0;
        }
        else {
            float f = half2float(a.s);
            switch (signMode) { // rounding mode:
            case 0: // nearest or even
                if (f >= max + 0.5f || f < min - 0.5f) overflow = true;
                result = (int)(nearbyint(f));
                break;
            case 1: // down 
                if (f >= max + 1.0f || f <= min) overflow = true;
                result = (int)(floor(f));
                break;
            case 2: // up
                if (f > max || f <= min - 1.0f) overflow = true;
                result = (int)(ceil(f));
                break;
            case 3: // towards zero
                if (f >= max + 1.0f || f <= min - 1.0f) overflow = true;
                result = (int)(f);
                break;
            case 4: // unsigned nearest or even
                if (f >= umax + 0.5f || f < - 0.5f) overflow = true;
                result = (int)(nearbyint(f));
                break;
            case 5: case 7: // unsigned down
                if (f >= umax + 1.0f || f < 0.0f) overflow = true;
                result = (int)(floor(f));
                break;
            case 6: // unsigned up
                if (f > umax || f <= -1.0f) overflow = true;
                else result = (int)(ceil(f));
            }
            if (overflow) {
                switch (b.b & 7) { // overflow options
                case 0: default: // wrap around
                    result &= 0xFFFFu;
                    break;
                case 4: case 6:
                    result = 0;
                    break;
                case 5: // signed saturation
                    result = 0x7FFF + int(f < 0);
                    break;
                case 7: // unsigned saturation
                    result = 0xFFFFu;
                    break;
                }
            }
        }
    }
    else if (dataSize == 4) {  // float -> int32
        const float max = (float)(int32_t)nsign_f;
        const float min = -max - 1.0f;
        const float umax = (float)(uint32_t)0xFFFFFFFFu;
        if (isnan_f(a.i)) {
            invalid = true;
            result = (b.b & 0x20) ? sign_f : 0;
        }
        else {
            switch (signMode) { // rounding mode:
            case 0: // nearest or even
                if (a.f >= max + 0.5f || a.f < min - 0.5f) overflow = true;
                result = (int64_t)(nearbyint(a.f));
                break;
            case 1: // down 
                if (a.f >= max + 1.0f || a.f <= min) overflow = true;
                result = (int64_t)(floor(a.f));
                break;
            case 2: // up
                if (a.f > max || a.f <= min - 1.0f) overflow = true;
                result = (int64_t)(ceil(a.f));
                break;
            case 3: // towards zero
                if (a.f >= max + 1.0f || a.f <= min - 1.0f) overflow = true;
                result = (int64_t)(a.f);
                break;
            case 4: // unsigned nearest or even
                if (a.f >= umax + 0.5f || a.f < - 0.5f) overflow = true;
                result = (int64_t)(nearbyint(a.f));
                break;
            case 5: case 7: // unsigned down
                if (a.f >= umax + 1.0f || a.f < 0.0f) overflow = true;
                result = (int64_t)(floor(a.f));
                break;
            case 6: // unsigned up
                if (a.f > umax || a.f <= -1.0f) overflow = true;
                else result = (int64_t)(ceil(a.f));
            }
            if (overflow) {
                switch (b.b & 7) { // overflow options
                case 0:  // wrap around
                    result &= 0xFFFFFFFFu;
                    break;
                case 4: case 6:
                    result = 0;
                    break;
                case 5: // signed saturation
                    result = 0x7FFFFFFF + int(a.f < 0);
                    break;
                case 7: // unsigned saturation
                    result = 0xFFFFFFFFu;
                    break;
                }
            }
        }
    }
    else if (dataSize == 8) {   // double -> int64
        const double max = (double)(int64_t)nsign_d;
        const double min = -max - 1.0f;
        const double umax = (double)0xFFFFFFFFFFFFFFFFu;
        if (isnan_d(a.q)) {
            invalid = true;
            result = (b.b & 0x20) ? sign_d : 0;
        }
        else {
            switch (signMode) { // rounding mode:
            case 0: // nearest or even
                if (a.d >= max + 0.5 || a.d < min - 0.5) overflow = true;
                result = (int64_t)(nearbyint(a.d));
                break;
            case 1: // down 
                if (a.d >= max + 1.0 || a.d <= min) overflow = true;
                result = (int64_t)(floor(a.d));
                break;
            case 2: // up
                if (a.d > max || a.d <= min - 1.0) overflow = true;
                result = (int64_t)(ceil(a.d));
                break;
            case 3: // towards zero
                if (a.d >= max + 1.0 || a.d <= min - 1.0) overflow = true;
                result = (int64_t)(a.d);
                break;
            case 4: // unsigned nearest or even
                if (a.d >= umax + 0.5 || a.d < - 0.5) overflow = true;
                result = (uint64_t)(nearbyint(a.d));
                break;
            case 5: case 7: // unsigned down
                if (a.d >= umax + 1.0 || a.d < 0.0) overflow = true;
                result = (uint64_t)(floor(a.d));
                break;
            case 6: // unsigned up
                if (a.d > umax || a.d <= -1.0) overflow = true;
                result = (uint64_t)(ceil(a.d));
            }
            if (overflow) {
                switch (b.b & 7) { // overflow options
                case 0:  // wrap around
                    break;
                case 4: case 6:
                    result = 0;
                    break;
                case 5: // signed saturation
                    result = nsign_d + int(a.d < 0);
                    break;
                case 7: // unsigned saturation
                    result = 0xFFFFFFFFFFFFFFFFu;
                    break;
                }
            }
        }
    }
    else t->interrupt(INT_WRONG_PARAMETERS);
    /* Traps not supported
    if (overflow && (mask.i & MSK_OVERFL_SIGN)) {
        t->interrupt(INT_OVERFL_SIGN);  // signed overflow
        result = dataSizeMask[t->operandType] >> 1; // INT_MAX
    }
    if (invalid && (mask.i & MSK_FLOAT_NAN_LOSS)) {
        t->interrupt(INT_FLOAT_NAN_LOSS);  // nan converted to integer
        result = dataSizeMask[t->operandType] >> 1; // INT_MAX
    } */
    if ((t->operandType & 7) >= 5) t->operandType -= 3;    // debug return type is integer
    return result;
}
712
 
713
static uint64_t int2float (CThread * t) {
    // Conversion of signed or unsigned integer to floating point with same operand size.
    // parm[1] = integer to convert
    // parm[4] = IM1 option bits:
    //   bit 0: 0 = source is signed, 1 = source is unsigned
    //   bit 2: return a NAN (nan_inexact) if the conversion is not exact
    // Returns the bit pattern of the floating point result and sets the debug return type.
    // Operand sizes other than 2, 4 and 8 bytes raise INT_WRONG_PARAMETERS and return 0.
    //
    // The exactness test compares in a floating point type that holds both values exactly
    // (or range-checks first) instead of casting the rounded value back to the integer type.
    // Casting back is undefined when rounding has produced a value just outside the integer range.
    SNum a = t->parm[1];
    SNum IM1 = t->parm[4];
    bool isSigned = (IM1.b & 1) == 0;  // signed integer
    bool inexactX = (IM1.b & 4) != 0;  // make NAN exception if inexact

    SNum result;
    result.q = 0;                      // bits above the operand size are defined
    uint32_t dataSize = dataSizeTable[t->operandType];
    switch (dataSize) {
    case 2:  // int16 -> float16
        if (isSigned) {
            result.s = float2half(float(a.ss));
            // float holds every int16 and every float16 value exactly
            if (inexactX && half2float(result.s) != float(a.ss)) {
                result.q = t->makeNan(nan_inexact, 1);
            }
        }
        else { // unsigned
            result.s = float2half(float(a.s));
            if (inexactX && half2float(result.s) != float(a.s)) {
                result.q = t->makeNan(nan_inexact, 1);
            }
        }
        t->returnType = 0x118;         // debug return type is float16
        break;

    case 4: // int32 -> float
        if (isSigned) {
            result.f = (float)a.is;
            // double holds every int32 and every float value exactly
            if (inexactX && (double)result.f != (double)a.is) {
                result.q = t->makeNan(nan_inexact, 5);
            }
        }
        else {
            result.f = (float)a.i;
            if (inexactX && (double)result.f != (double)a.i) {
                result.q = t->makeNan(nan_inexact, 5);
            }
        }
        t->returnType = 0x115;        // debug return type is float
        break;

    case 8:  // int64 -> double
        if (isSigned) {
            result.d = (double)a.qs;
            // a rounded value of 2^63 cannot be converted back and is necessarily inexact
            if (inexactX && (result.d >= 9223372036854775808.0 || int64_t(result.d) != a.qs)) {
                result.q = t->makeNan(nan_inexact, 6);
            }
        }
        else {
            result.d = (double)a.q;
            // a rounded value of 2^64 cannot be converted back and is necessarily inexact
            if (inexactX && (result.d >= 18446744073709551616.0 || uint64_t(result.d) != a.q)) {
                result.q = t->makeNan(nan_inexact, 6);
            }
        }
        t->returnType = 0x116;        // debug return type is double
        break;

    default:
        t->interrupt(INT_WRONG_PARAMETERS);
        result.q = 0;
    }
    return result.q;
}
777
 
778
static uint64_t round_ (CThread * t) {
    // Round floating point number to an integer value, keeping the floating point representation.
    // parm[1] = value to round
    // parm[4] = IM1 rounding mode: 0 = nearest or even, 1 = down, 2 = up, 3 = towards zero
    // Only float (4 bytes) and double (8 bytes) are handled.
    // Any other operand size or rounding mode raises INT_WRONG_PARAMETERS and returns 0.
    SNum a = t->parm[1];
    SNum b = t->parm[4];
    SNum result;
    result.q = 0;                      // defined return value on every path
    uint32_t dataSize = dataSizeTable[t->operandType];
    if (dataSize == 4) {  // float
        switch (b.b) { // rounding mode:
        case 0: // nearest or even
            result.f = nearbyintf(a.f);
            break;
        case 1: // down
            result.f = floorf(a.f);
            break;
        case 2: // up
            result.f = ceilf(a.f);
            break;
        case 3: // towards zero
            result.f = truncf(a.f);
            break;
        default: t->interrupt(INT_WRONG_PARAMETERS);
        }
    }
    else if (dataSize == 8) {   // double
        switch (b.b) { // rounding mode:
        case 0: // nearest or even
            result.d = nearbyint(a.d);
            break;
        case 1: // down
            result.d = floor(a.d);
            break;
        case 2: // up
            result.d = ceil(a.d);
            break;
        case 3: // towards zero
            result.d = trunc(a.d);
            break;
        default: t->interrupt(INT_WRONG_PARAMETERS);
        }
    }
    else {
        // unsupported operand size, reported the same way as in the neighbouring functions
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return result.q;
}
823
 
824
static uint64_t round2n (CThread * t) {
    // Round to nearest multiple of 2^n:
    // RD = 2^n * round(2^(-n)*RS).
    // n is a signed integer constant in IM1 (parm[4]); RS is parm[1].
    // The scaling is done by subtracting n from the exponent field, rounding, and adding n back.
    // NAN inputs are returned unchanged. Zero and subnormal inputs give zero with the input sign.
    // If the scaled exponent underflows the result is 0; if it overflows the result is inf.
    // Operand types other than float (5) and double (6) raise INT_WRONG_PARAMETERS and give 0.
    SNum n = t->parm[4];                    // n
    uint32_t scaledExponent;
    uint64_t result = 0;
    switch (t->operandType) {
    case 5: {  // float
        union {
            uint32_t i;
            float f;
            struct {
                uint32_t mantissa : 23;
                uint32_t exponent : 8;
                uint32_t sign     : 1;
            };
        } x;
        x.i = t->parm[1].i;                 // input a
        if (isnan_f(x.i)) {
            return x.i;                     // a is nan
        }
        scaledExponent = x.exponent;
        if (scaledExponent == 0) {          // a is zero or subnormal. return zero
            x.mantissa = 0;
            return x.i;
        }
        scaledExponent -= n.i;              // exponent of 2^(-n) * a
        if ((int32_t)scaledExponent <= 0) {
            return 0;                       // underflow (no trap is raised)
        }
        if ((int32_t)scaledExponent >= 0xFF) {
            return inf_f;                   // overflow (no trap is raised)
        }
        x.exponent = scaledExponent;
        x.f = nearbyintf(x.f);              // round
        if (x.f != 0) {
            x.exponent += n.i;              // multiply by 2^n again
        }
        result = x.i;
        break;
    }
    case 6: {  // double
        union {
            uint64_t q;
            double d;
            struct {
                uint64_t mantissa : 52;
                uint64_t exponent : 11;
                uint64_t sign     :  1;
            };
        } x;
        x.q = t->parm[1].q;                 // input a
        if (isnan_d(x.q)) {
            return x.q;                     // a is nan
        }
        scaledExponent = x.exponent;
        if (scaledExponent == 0) {          // a is zero or subnormal. return zero
            x.mantissa = 0;
            return x.q;
        }
        scaledExponent -= n.i;              // exponent of 2^(-n) * a
        if ((int32_t)scaledExponent <= 0) {
            return 0;                       // underflow (no trap is raised)
        }
        if ((int32_t)scaledExponent >= 0x7FF) {
            return inf_d;                   // overflow (no trap is raised)
        }
        x.exponent = scaledExponent;
        x.d = nearbyint(x.d);               // round
        if (x.d != 0) {
            x.exponent += n.i;              // multiply by 2^n again
        }
        result = x.q;
        break;
    }
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return result;
}
897
 
898
static uint64_t abs_ (CThread * t) {
    // Absolute value of integer or floating point number.
    // parm[1] = x
    // parm[4] = IM1 option, used only when an integer x is the most negative value (overflow):
    //   bits 0-1: 0 = wrap around, 1 = saturate, 2 = zero.
    //             Any other value (ignoring bit 2) raises INT_WRONG_PARAMETERS.
    //   bit 2:    raise INT_OVERFL_SIGN. Not reached when saturate or zero is selected,
    //             because those options return first.
    // Floating point types (operand type > 4) just get the sign bit cleared.
    SNum a = t->parm[1];                                   // x
    SNum b = t->parm[4];                                   // option
    uint64_t sizemask = dataSizeMask[t->operandType];      // mask for operand size
    uint64_t signbit = (sizemask >> 1) + 1;                // just the sign bit
    if (a.q & signbit) {
        // a is negative
        if (t->operandType > 4) {                          // floating point types
            return a.q & ~signbit;                         // just remove sign bit
        }
        if ((a.q & sizemask) == signbit) {
            // overflow
            switch (b.b & ~4) {
            case 0:  // wrap around
                break;
            case 1:  // saturate
                return a.q - 1;
            case 2:  // zero
                return 0;
            default:
                t->interrupt(INT_WRONG_PARAMETERS);
            }
            if (b.b & 4) { // trap
                t->interrupt(INT_OVERFL_SIGN);  // signed overflow
            }
        }
        // Change sign with unsigned arithmetic. This gives the same bit pattern as signed
        // negation, but is well defined also for the most negative 64-bit value.
        a.q = 0 - a.q;
    }
    return a.q;
}
930
 
931
static uint64_t fp_category (CThread * t) {
    // Check if a floating point number belongs to any of the categories selected by the bits of IM1:
    // bit 0: NAN, 1: +/- zero, 2: - subnormal, 3: + subnormal,
    // bit 4: - normal, 5: + normal, 6: - infinite, 7: + infinite
    // parm[1] = x, parm[4] = IM1.
    // Bit 0 of the return value is the test result. The remaining bits are taken from NUMCONTR.
    // Operand types other than 2, 3, 5, 6 raise INT_WRONG_PARAMETERS and match no category.
    SNum a = t->parm[1];                         // x
    SNum b = t->parm[4];                         // option
    uint32_t exponent = 0;                       // exponent field of x
    uint32_t exponentMax = 0;                    // all-ones exponent for this precision
    bool fractionZero = true;                    // mantissa field is zero
    bool negative = false;                       // sign bit is set
    bool supported = true;                       // operand type is handled

    switch (t->operandType) {
    case 2: case 5:                              // float
        exponentMax  = 0xFF;
        exponent     = a.i >> 23 & 0xFF;
        fractionZero = (a.i & 0x007FFFFF) == 0;
        negative     = (a.i >> 31) != 0;
        break;
    case 3: case 6:                              // double
        exponentMax  = 0x7FF;
        exponent     = uint32_t(a.q >> 52) & 0x7FF;
        fractionZero = (a.q & 0x000FFFFFFFFFFFFFULL) == 0;
        negative     = (a.q >> 63) != 0;
        break;
    default:
        supported = false;
        t->interrupt(INT_WRONG_PARAMETERS);
    }

    uint8_t category = 0;                        // detected category bit
    if (supported) {
        if (exponent == exponentMax) {           // nan or inf
            category = !fractionZero ? 1 : (negative ? 0x40 : 0x80);
        }
        else if (exponent == 0) {                // zero or subnormal
            category = fractionZero ? 2 : (negative ? 4 : 8);
        }
        else {                                   // normal
            category = negative ? 0x10 : 0x20;
        }
    }
    uint8_t result = (category & b.b) != 0;                // test if a belongs to any of the indicated categories
    if ((t->operandType & 7) >= 5) t->operandType -= 3;    // debug return type is integer
    return (t->numContr & ~(uint64_t)1) | result;          // get remaining bits from NUMCONTR
}
976
 
977
static uint64_t broad_ (CThread * t) {
    // 18: Broadcast 8-bit signed constant into all elements of RD with length RS (31 in RS field gives scalar output).
    // 19: broadcast_max. Broadcast 8-bit constant into all elements of RD with maximum vector length.
    // Elements whose mask bit is zero get zero (op 18, or RS field >= 31) or the corresponding
    // element of RS as fallback (op 19).
    // The destination is written directly, so the function returns 0 and tells the caller not to save RD.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t  rm = t->operands[1];                // mask register
    SNum b = t->parm[2];                         // constant
    uint64_t length;                             // length of destination vector
    if (t->op == 18) {                           // length given by RS
        length = t->registers[rs];
        if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    }
    else {                                       // length is maximum
        length = t->MaxVectorLength;
    }
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    length = length >> dsizelog << dsizelog;     // round down to nearest multiple of operand size
    // set length of destination vector
    t->vectorLength[rd] = (uint32_t)length;
    // loop to set all elements
    for (uint32_t pos = 0; pos < (uint32_t)length; pos += (uint32_t)1 << dsizelog) {
        // Choose the value for this element only. The constant must stay intact,
        // otherwise a masked-off element would change what later enabled elements receive.
        uint64_t value = b.q;
        if ((rm & 0x1F) != 0x1F && !(t->readVectorElement(rm, pos) & 1)) { // mask is zero. get fallback
            if (t->op == 18 || rs >= 31) value = 0;    // there is no fallback. write zero
            else value = t->readVectorElement(rs, pos); // rs is fallback
        }
        t->writeVectorElement(rd, value, pos);   // write vector element
    }
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save RD
    return 0;
}
1008
 
1009
static uint32_t byteSwap(uint32_t x) {
    // Reverse the order of the four bytes of x. Used by the byte_reverse function
    return (x >> 24)
         | ((x >> 8) & 0x0000FF00u)
         | ((x << 8) & 0x00FF0000u)
         | (x << 24);
}
1018
 
1019
static uint8_t bitSwap(uint8_t x) {
    // Reverse the order of the eight bits of x. Used by the bit reverse option of byte_reverse
    uint8_t reversed = 0;
    for (int i = 0; i < 8; i++) {
        reversed = (uint8_t)(reversed << 1 | (x & 1));   // move lowest remaining bit of x to the other end
        x >>= 1;
    }
    return reversed;
}
1025
 
1026
static uint64_t byte_reverse (CThread * t) {
    // Reverse the order of bits or bytes in each element of vector.
    // parm[1] = element value
    // parm[2] = IM1: bit 0 set = reverse bits, bit 0 clear = reverse bytes
    // Bit reversal of 128-bit elements is not supported and raises INT_WRONG_PARAMETERS.
    // For byte reversal of 128-bit elements, the high half of the result is stored in parm[5].
    SNum a = t->parm[1];                         // value
    uint8_t IM1 = t->parm[2].b;                  // immediate operand
    uint8_t sizeLog = dataSizeTableLog[t->operandType]; // log2(element size)

    if ((IM1 & 1) == 0) {
        // byte reverse: Reverse the order of bytes in each element of a vector
        uint32_t low;                            // byte-swapped low 32 bits
        if (sizeLog == 1) {                      // 16 bit
            a.s = a.s >> 8 | a.b << 8;           // swap bytes
        }
        else if (sizeLog == 2) {                 // 32 bit
            a.i = byteSwap(a.i);
        }
        else if (sizeLog == 3) {                 // 64 bit
            low = byteSwap(a.i);
            a.q = byteSwap(a.q >> 32) | (uint64_t)low << 32;
        }
        else if (sizeLog == 4) {                 // 128 bit
            uint8_t rs = t->operands[4];
            low = byteSwap(a.i);
            t->parm[5].q = byteSwap(a.q >> 32) | (uint64_t)low << 32; // reversed low part becomes high part
            a.q = t->readVectorElement(rs, t->vectorOffset + 8);      // high part of input
            low = byteSwap(a.i);
            a.q = byteSwap(a.q >> 32) | (uint64_t)low << 32;          // reversed high part becomes low part
        }
        // 8 bit elements are returned unchanged
        return a.q;
    }

    // bit reverse: Reverse the order of bits in each element of vector.
    // This is done by reversing the byte order and then the bit order within each byte
    union {
        uint64_t q;
        uint8_t  b[8];
    } u;
    u.q = a.q;
    if (sizeLog <= 3) {                          // 8, 16, 32 or 64 bit
        uint32_t numBytes = 1u << sizeLog;
        for (uint32_t lo = 0, hi = numBytes - 1; lo < hi; lo++, hi--) {
            uint8_t tmp = u.b[lo];  u.b[lo] = u.b[hi];  u.b[hi] = tmp;
        }
        for (uint32_t k = 0; k < numBytes; k++) {
            u.b[k] = bitSwap(u.b[k]);
        }
    }
    else if (sizeLog == 4) {                     // 128 bit
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return u.q;
}
1080
 
1081
/*
1082
static uint64_t truth_tab2 (CThread * t) {
1083
    // Boolean function of two inputs, given by a truth table
1084
    SNum a = t->parm[0];                         // value
1085
    SNum b = t->parm[1];                         // value
1086
    SNum c = t->parm[4];                         // truth table
1087
    return ((c.b >> ((a.b & 1) | (b.b & 1) << 1)) & 1) | (a.q & ~uint64_t(1));
1088
} */
1089
 
1090
static uint64_t bool2bits(CThread * t) {
    // The boolean vector RT is packed into the lower n bits of RD,
    // taking bit 0 of each element.
    // n is the number of whole elements in RT.
    // The length of RD is n bits rounded up to a multiple of 4 bytes, and at least 4 bytes.
    // The destination is written directly, so the function returns 0 and tells the caller not to save RD.

    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[4];         // RT = source vector
    uint8_t * destination = (uint8_t*)t->vectors.buf() + (int64_t)rd * t->MaxVectorLength; // address of RD data
    uint32_t length = t->vectorLength[rt]; // length of source
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    uint32_t num = length >> dsizelog;           // number of elements
    // collect bits into blocks of 32 bits
    uint32_t bitblock = 0;
    // loop through elements of source vector
    for (uint32_t i = 0; i < num; i++) {
        uint8_t  bit = t->readVectorElement(rt, i << dsizelog) & 1;
        uint8_t  bitindex = i & 31;                        // bit position within 32 bit block of destination
        // shift in an unsigned 32-bit type: a promoted int must not be shifted into its sign bit
        bitblock |= (uint32_t)bit << bitindex;             // add bit to bitblock
        if (bitindex == 31 || i == num - 1) {              // last bit in this block
            *(uint32_t*)(destination + ((i / 8) & ~3u)) = bitblock; // write 32 bit block to destination
            bitblock = 0;                                  // start next block
        }
    }
    // round up length of destination to multiple of 4 bytes
    uint32_t destinationLength = ((num + 7) / 8 + 3) & ~3u;
    if (destinationLength == 0) {
        // empty source: result is a single zero block
        destinationLength = 4;  *(uint32_t*)destination = 0;
    }
    // set length of destination vector (must be done after reading source because source and destination may be the same)
    t->vectorLength[rd] = destinationLength;
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    if ((t->returnType & 7) >= 5) t->returnType -= 3;      // make return type integer
    return 0;
}
1129
 
1130
static uint8_t reduceCategory32(uint32_t val) {
    // Category bit of a single precision bit pattern (same bit assignment as listed under bool_reduce)
    uint32_t exponent = val >> 23 & 0xFF;
    bool fractionZero = (val & 0x007FFFFF) == 0;
    bool negative = (val >> 31) != 0;
    if (exponent == 0xFF) return !fractionZero ? 1 : (negative ? 0x40 : 0x80); // nan or inf
    if (exponent == 0)    return fractionZero ? 2 : (negative ? 4 : 8);        // zero or subnormal
    return negative ? 0x10 : 0x20;                                             // normal
}

static uint8_t reduceCategory64(uint64_t val) {
    // Category bit of a double precision bit pattern (same bit assignment as listed under bool_reduce)
    uint32_t exponent = val >> 52 & 0x7FF;
    bool fractionZero = (val & 0x000FFFFFFFFFFFFFULL) == 0;
    bool negative = (val >> 63) != 0;
    if (exponent == 0x7FF) return !fractionZero ? 1 : (negative ? 0x40 : 0x80); // nan or inf
    if (exponent == 0)     return fractionZero ? 2 : (negative ? 4 : 8);        // zero or subnormal
    return negative ? 0x10 : 0x20;                                              // normal
}

static uint64_t bool_reduce(CThread * t) {
    // integer vector: bool_reduce. The boolean vector RT is reduced by combining bit 0 of all elements.
    // The output is a scalar integer where bit 0 is the AND combination of all the bits,
    // and bit 1 is the OR combination of all the bits.
    // The remaining bits are reserved for future use
    // float vector: category_reduce: Each bit in RD indicates that at least one element in RT belongs
    // to a certain category:
    // bit 0: NAN, bit 1: zero, bit 2: - subnormal, bit 3: + subnormal,
    // bit 4: - normal, bit 5: + normal, bit 6: - INF, bit 7: + INF
    // The 64-bit result is written directly to RD and is also returned.
    uint8_t  rd = t->operands[0];                          // destination vector
    uint8_t  rt = t->operands[4];                          // RT = source vector
    uint64_t result = 0;                                   // result value
    uint8_t * source = (uint8_t*)t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
    uint32_t elementSize = dataSizeTable[t->operandType];  // vector element size
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    uint32_t length = t->vectorLength[rt] >> dsizelog << dsizelog; // source length rounded down to whole elements
    uint32_t pos;                                          // byte offset of current element

    switch (t->operandType) {
    case 0: case 1: case 2: case 3: case 4: {              // integer types: bool_reduce
        uint8_t anyBit = 0;                                // OR combination of all bits
        uint8_t allBits = 1;                               // AND combination of all bits
        for (pos = 0; pos < length; pos += elementSize) {
            uint8_t bit = source[pos] & 1;                 // bit 0 of source vector element
            anyBit |= bit;
            allBits &= bit;
        }
        result = allBits | anyBit << 1;
        break;
    }
    case 5:                                                // float type: category_reduce
        for (pos = 0; pos < length; pos += elementSize) {
            uint32_t val = *(int32_t*)(source + pos);
            result |= reduceCategory32(val);               // combine categories
        }
        break;
    case 6:                                                // double type: category_reduce
        for (pos = 0; pos < length; pos += elementSize) {
            uint64_t val = *(int64_t*)(source + pos);
            result |= reduceCategory64(val);               // combine categories
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    t->vectorLength[rd] = 8;                               // set length of destination vector to 64 bits
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    // write all 64 bits directly
    // (using writeVectorElement would possibly write less than 64 bits, leaving some of the destination vector unchanged)
    *(uint64_t*)destination = result;
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    if ((t->returnType & 7) >= 5) t->returnType -= 3;      // make return type integer
    return result;
}
1217
 
1218
 
1219
static uint64_t push_v(CThread * t) {
    // Push one or more vector registers on a stack pointed to by rd.
    // operands[0] = pointer register, operands[4] = first register to push,
    // parm[2] bits 0-4 = last register to push. Any of bits 5-7 set raises INT_WRONG_PARAMETERS.
    // The stack grows downwards. For each register, the data (rounded up to a multiple of 8 bytes)
    // are stored first, followed by a 64-bit word at the next lower address containing the length.
    // Empty registers store only the length word.
    // Returns the new pointer value, which is also written to the pointer register.
    const uint32_t stackWordSize = 8;
    if (t->parm[2].i & 0xE0) {
        // forward-growing stack not supported for vector registers
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }
    uint8_t ptrReg   = t->operands[0] & 0x1F;   // pointer register
    uint8_t firstReg = t->operands[4] & 0x1F;   // first push register
    uint8_t lastReg  = t->parm[2].i & 0x1F;     // last push register
    uint64_t pointer = t->registers[ptrReg];
    t->operandType = 3;                         // must be 64 bits.

    for (uint8_t reg = firstReg; reg <= lastReg; reg++) {
        uint32_t length = t->vectorLength[reg];                            // length of current register
        uint32_t paddedLength = (length + stackWordSize - 1) & ~(stackWordSize - 1); // round up to multiple of 8
        if (length != 0) {
            pointer -= paddedLength;
            uint32_t offset = 0;
            while (offset < paddedLength) {
                uint64_t value = t->readVectorElement(reg, offset);
                t->writeMemoryOperand(value, pointer + offset);            // write vector
                offset += 8;
            }
            t->returnType = 0x113;
            t->operands[0] = reg;
            t->listResult(0);
        }
        pointer -= stackWordSize;
        t->writeMemoryOperand(length, pointer);                            // write length
        t->returnType = 0x13;
        t->listResult(length);
    }
    t->registers[ptrReg] = pointer;
    t->returnType = 0x13;
    t->operands[0] = ptrReg;
    t->vect = 4;                              // stop vector loop
    t->running = 2;                           // don't store result register
    return pointer;
}
1259
 
1260
static uint64_t pop_v(CThread * t) {
    // Pop one or more vector registers from a stack pointed to by rd.
    // operands[0] = pointer register, operands[4] = first register to pop,
    // parm[2] bits 0-4 = last register to pop. Any of bits 5-7 set raises INT_WRONG_PARAMETERS.
    // Registers are popped in reverse order, from the last to the first. For each register, a 64-bit
    // length word is read, followed by the data rounded up to a multiple of 8 bytes.
    // Returns the new pointer value, which is also written to the pointer register.
    if (t->parm[2].i & 0xE0) {
        t->interrupt(INT_WRONG_PARAMETERS); return 0;  // forward-growing stack not supported for vector registers
    }
    uint8_t reg0 = t->operands[0] & 0x1F;   // pointer register
    uint8_t reg1 = t->operands[4] & 0x1F;   // first pop register
    uint8_t reglast = t->parm[2].i & 0x1F;  // last pop register
    uint32_t length;                        // length of current register
    uint32_t length2;                       // length rounded up to nearest multiple of stack word size
    uint64_t pointer = t->registers[reg0];  // value of stack pointer or pointer register
    const int stack_word_size = 8;
    t->operandType = 3;                     // must be 64 bits.
    // Reverse loop through registers to pop.
    // The counter is a signed int: an unsigned counter would wrap around below zero
    // when the first register is 0, and the loop would never end.
    for (int regnum = reglast; regnum >= (int)reg1; regnum--) {
        uint8_t reg = (uint8_t)regnum;          // current register
        length = (uint32_t)t->readMemoryOperand(pointer);  // read length
        length2 = (length + stack_word_size - 1) & -stack_word_size;  // round up to multiple of 8
        t->vectorLength[reg] = length;          // set vector length
        pointer += stack_word_size;             // pop length
        if (length != 0) {
            for (uint32_t j = 0; j < length2; j += 8) { // read vector
                uint64_t value = t->readMemoryOperand(pointer + j);  // read from memory
                t->writeVectorElement(reg, value, j);
            }
            pointer += length2;
            t->returnType = 0x113;
            t->operands[0] = reg;
            t->listResult(0);
        }
        t->returnType = 0x13;
        t->listResult(length);
    }
    t->registers[reg0] = pointer;
    t->returnType = 0x13;
    t->operands[0] = reg0;
    t->vect = 4;                              // stop vector loop
    t->running = 2;                           // don't store result register
    return pointer;
}
1300
 
1301
static uint64_t clear_(CThread * t) {
    // Clear one or more vector registers by setting their lengths to zero.
    // operands[4] = first register, parm[2] bits 0-4 = last register
    uint8_t last = t->parm[2].i & 0x1F;     // last register
    uint8_t current = t->operands[4] & 0x1F; // first register
    while (current <= last) {
        t->vectorLength[current] = 0;
        current++;
    }
    t->returnType = 0;
    t->running = 2;                           // don't store result register
    t->vect = 4;                              // stop vector loop
    return 0;
}
1314
 
1315
 
1316
// Format 1.4 C. One vector register and a broadcast 16-bit immediate operand.
1317
 
1318
static uint64_t move_i16 (CThread * t) {
    // Move 16 bit integer constant (parm[2]) to a 16-bit scalar in the destination vector
    t->vect = 4;                                 // stop vector loop
    t->vectorLength[t->operands[0]] = 2;         // destination is a scalar of 2 bytes
    return t->parm[2].q;
}
1325
 
1326
//static uint64_t add_i16 (CThread * t) {return f_add(t);} // Add broadcasted 16 bit constant to 16-bit vector elements
1327
 
1328
static uint64_t and_i16 (CThread * t) {
    // AND vector element (parm[1]) with broadcasted 16 bit constant (parm[2])
    uint64_t element  = t->parm[1].q;
    uint64_t constant = t->parm[2].q;
    return element & constant;
}
1332
 
1333
static uint64_t or_i16 (CThread * t) {
    // OR vector element (parm[1]) with broadcasted 16 bit constant (parm[2])
    uint64_t element  = t->parm[1].q;
    uint64_t constant = t->parm[2].q;
    return element | constant;
}
1337
 
1338
static uint64_t xor_i16 (CThread * t) {
    // XOR vector element (parm[1]) with broadcasted 16 bit constant (parm[2])
    uint64_t element  = t->parm[1].q;
    uint64_t constant = t->parm[2].q;
    return element ^ constant;
}
1342
 
1343
static uint64_t add_h16 (CThread * t) {
    // Add constant to half precision vector. The work is delegated to f_add_h
    uint64_t sum = f_add_h(t);
    return sum;
}
1347
 
1348
static uint64_t mul_h16 (CThread * t) {
    // Multiply half precision vector with constant. The work is delegated to f_mul_h
    uint64_t product = f_mul_h(t);
    return product;
}
1352
 
1353
static uint64_t move_8shift8 (CThread * t) {
    // RD = IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1 to make 32/64 bit scalar
    // 40: 32 bit, 41: 64 bit
    // IM1 is the low byte and IM2 the high byte of the 16-bit constant in parm[2].
    // The shift is done on an unsigned 64-bit value with an unsigned count, so that a count
    // above 127 is not taken as negative. A count of 64 or more shifts all bits out and gives zero.
    uint8_t rd = t->operands[0];                 // destination vector
    t->vectorLength[rd] = (t->op & 1) ? 8 : 4;   // set length of destination
    t->vect = 4;                                 // stop vector loop
    uint64_t im2 = (uint64_t)(int64_t(t->parm[2].ss) >> 8);  // sign-extended IM2
    uint8_t  count = t->parm[2].b;                           // unsigned shift count IM1
    return count < 64 ? im2 << count : 0;
}
1361
 
1362
static uint64_t add_8shift8 (CThread * t) {
    // RD += IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, add to 32/64 bit vector
    // 42: 32 bit, 43: 64 bit
    // IM1 is the low byte and IM2 the high byte of the 16-bit constant in parm[2].
    // The shift is done on an unsigned 64-bit value with an unsigned count, so that a count
    // above 127 is not taken as negative. A count of 64 or more shifts all bits out and gives zero.
    int64_t save2 = t->parm[2].qs;                         // remember the undecoded constant
    uint64_t im2 = (uint64_t)(int64_t(t->parm[2].ss) >> 8); // sign-extended IM2
    uint8_t  count = t->parm[2].b;                         // unsigned shift count IM1
    t->parm[2].q = count < 64 ? im2 << count : 0;          // replace constant by shifted value
    int64_t result = f_add(t);                             // use f_add for getting overflow traps
    t->parm[2].qs = save2;                                 // restore constant
    return result;
}
1371
 
1372
static uint64_t and_8shift8 (CThread * t) {
1373
    // RD &= IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, AND with 32/64 bit vector
1374
    // 44: 32 bit, 45: 64 bit
1375
    int64_t a = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs;  // shift and sign extend
1376
    return t->parm[1].q & a;
1377
}
1378
 
1379
static uint64_t or_8shift8 (CThread * t) {
1380
    // RD |= IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, OR with 32/64 bit vector
1381
    // 46: 32 bit, 47: 64 bit
1382
    int64_t a = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs;  // shift and sign extend
1383
    return t->parm[1].q | a;
1384
}
1385
 
1386
static uint64_t xor_8shift8 (CThread * t) {
1387
    // RD |= IM2 << IM1. Sign-extend IM2 and shift left by the unsigned value IM1, XOR with 32/64 bit vector
1388
    // 48: 32 bit, 49: 64 bit
1389
    int64_t a = int64_t(t->parm[2].ss) >> 8 << t->parm[2].bs;  // shift and sign extend
1390
    return t->parm[1].q ^ a;
1391
}
1392
 
1393
static uint64_t move_half2float (CThread * t) {
    // Scalar move of a constant to a single precision destination.
    // The value in parm[2] is returned unchanged; the original comment describes it
    // as the already converted half precision constant.
    const uint8_t dest = t->operands[0];         // destination register
    t->vect = 4;                                 // stop vector loop
    t->vectorLengthR = 4;
    t->vectorLength[dest] = 4;                   // destination holds one 4-byte element
    return t->parm[2].q;
}
1400
 
1401
static uint64_t move_half2double (CThread * t) {
    // Scalar move of a constant to a double precision destination.
    // The value in parm[2] is returned unchanged; the original comment describes it
    // as the already converted half precision constant.
    const uint8_t dest = t->operands[0];         // destination register
    t->vect = 4;                                 // stop vector loop
    t->vectorLength[dest] = 8;                   // destination holds one 8-byte element
    return t->parm[2].q;
}
1407
 
1408
static uint64_t add_half2float (CThread * t) {
    // Single precision vector plus broadcast half precision constant; handled by f_add
    uint64_t sum = f_add(t);
    return sum;
}
1412
 
1413
static uint64_t add_half2double (CThread * t) {
    // Double precision vector plus broadcast half precision constant; handled by f_add
    uint64_t sum = f_add(t);
    return sum;
}
1417
 
1418
static uint64_t mul_half2float (CThread * t) {
    // Single precision vector times broadcast half precision constant; handled by f_mul
    uint64_t product = f_mul(t);
    return product;
}
1422
 
1423
static uint64_t mul_half2double (CThread * t) {
    // Double precision vector times broadcast half precision constant; handled by f_mul
    uint64_t product = f_mul(t);
    return product;
}
1427
 
1428
// Format 2.6 A. Three vector registers and a 32-bit immediate operand.
1429
 
1430
static uint64_t load_hi (CThread * t) {
    // Build a two-element vector in RD: element 0 is zero, element 1 is the constant IM2.
    // The elements are written here directly, so the returned value is not stored.
    const uint8_t dest = t->operands[0];
    const uint8_t elementBytes = dataSizeTable[t->operandType];
    t->vectorLength[dest] = elementBytes * 2;              // destination holds two elements
    t->writeVectorElement(dest, 0, 0);                     // low element = 0
    t->writeVectorElement(dest, t->parm[2].q, elementBytes); // high element = IM2
    t->running = 2;                                        // don't save RD
    t->vect = 4;                                           // stop vector loop
    return 0;
}
1441
 
1442
static uint64_t insert_hi (CThread * t) {
    // Build a two-element vector in RD: element 0 is src1[0], element 1 is the constant IM2.
    // The elements are written here directly, so the returned value is not stored.
    const uint8_t dest = t->operands[0];
    const uint8_t elementBytes = dataSizeTable[t->operandType];
    t->vectorLength[dest] = elementBytes * 2;              // destination holds two elements
    t->writeVectorElement(dest, t->parm[1].q, 0);          // low element = src1
    t->writeVectorElement(dest, t->parm[2].q, elementBytes); // high element = IM2
    t->running = 2;                                        // don't save RD
    t->vect = 4;                                           // stop vector loop
    return 0;
}
1453
 
1454
static uint64_t make_mask (CThread * t) {
    // Produce one element of a mask vector.
    // Bit 0 is taken from the bit of IM2 selected by the element index (modulo 32).
    // All other bits are copied from parm[3] (mask or numcontr).
    const uint64_t upperBits = t->parm[3].q & ~(uint64_t)1;   // everything except bit 0
    const uint32_t bitSource = t->parm[2].i;                  // constant operand
    const uint32_t elementIndex = t->vectorOffset >> dataSizeTableLog[t->operandType];
    if ((t->operandType & 7) >= 5) {
        t->operandType -= 3;                                  // debug return type is integer
    }
    const uint64_t lowBit = (bitSource >> (elementIndex & 31)) & 1;
    return upperBits | lowBit;
}
1463
 
1464
static uint64_t replace_ (CThread * t) {
    // Every element is replaced by the constant IM2.
    // Shared by format 2.6 (32-bit constant) and format 3.1 (64-bit constant).
    const uint64_t constant = t->parm[2].q;
    return constant;
}
1469
 
1470
static uint64_t replace_even (CThread * t) {
    // Even-numbered elements become the constant IM2; odd-numbered elements keep parm[1]
    const uint32_t elementIndex = t->vectorOffset >> dataSizeTableLog[t->operandType];
    if (elementIndex & 1) {
        return t->parm[1].q;                     // odd element: unchanged
    }
    return t->parm[2].q;                         // even element: constant
}
1476
 
1477
static uint64_t replace_odd (CThread * t) {
    // Odd-numbered elements become the constant IM2; even-numbered elements keep parm[1]
    const uint32_t elementIndex = t->vectorOffset >> dataSizeTableLog[t->operandType];
    if (elementIndex & 1) {
        return t->parm[2].q;                     // odd element: constant
    }
    return t->parm[1].q;                         // even element: unchanged
}
1483
 
1484
static uint64_t broadcast_32 (CThread * t) {
    // Broadcast 32-bit or 64-bit constant into all elements of RD with length RS (31 in RS field gives scalar output).
    // Elements where the mask bit is 0 are set to zero.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t  rm = t->operands[1];                // mask register
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    uint64_t length;                             // length of destination
    int64_t  value;
    if (rs == 31) length = elementSize;          // scalar output
    else length = t->registers[rs];
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    // Round down to a multiple of elementSize. The shifts must go right first and then left:
    // the opposite order only clears the top bits and leaves a partial last element,
    // which would make the loop below write past the end of the vector.
    if (rs != 31) length = length >> dsizelog << dsizelog;
    t->vectorLength[rd] = (uint32_t)length;                   // set length of destination
    for (uint32_t pos = 0; pos < length; pos += elementSize) { // loop through vector
        if (rm >= 7 || (t->readVectorElement(rm, pos) & 1)) value = t->parm[2].qs; // check mask
        else value = 0;
        t->writeVectorElement(rd, value, pos);             // write to destination
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
1506
 
1507
static uint64_t permute (CThread * t) {
    // The vector elements of the input vector are permuted within each block of size RT bytes.
    // The number of elements in each block, n = RT / OS
    // format 2.2.6 op 1.1: index vector is last operand
    // format 2.6   op   8: index vector is constant IM2, 4 bits for each element
    // An element is set to zero if its index is out of range or its mask bit is 0.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];                          // destination
    uint8_t  rm = t->operands[1];                          // mask register
    uint8_t  vin;                                          // input data register
    uint8_t  vpat = 0;                                     // pattern register
    uint8_t  bs;                                           // block size, g.p. register
    uint32_t pattern = 0;                                  // IM2 = pattern, if constant
    bool     constPat = false;                             // pattern is a constant
    if (t->fInstr->format2 == 0x226) {
        vin  = t->operands[3];                             // ru = input data
        vpat = t->operands[4];                             // rs = pattern
        bs   = t->operands[5];                             // block size, g.p. register
    }
    else {                                                 // format 2.6
        vin = t->operands[3];                              // rs = input data
        bs  = t->operands[4];                              // block size, g.p. register
        pattern = t->parm[4].i;                            // IM2 = pattern, if constant
        constPat = true;
    }
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    uint32_t elementSize = (uint32_t)1 << dsizelog;        // bytes per element
    uint32_t copySize = elementSize < 8 ? elementSize : 8; // bytes that fit in a 64-bit value
    uint32_t length = t->vectorLength[vin];                // vector length
    t->vectorLength[rd] = length;                          // set length of destination
    int8_t * source = t->vectors.buf() + (uint32_t)(vin & 0x1F) * t->MaxVectorLength; // address of source data vector
    if (vin == rd) {
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
        memcpy(t->tempBuffer, source, length);
        source = t->tempBuffer;
    }
    uint64_t blocksize = t->registers[bs];                 // bytes per block
    if (blocksize == 0 || (blocksize & (blocksize-1)) || blocksize > t->MaxVectorLength) {
        t->interrupt(INT_WRONG_PARAMETERS);                // block size must be a power of 2
    }
    else {
        uint32_t num = (uint32_t)blocksize >> dsizelog;    // elements per block
        for (uint32_t block = 0; block < length; block += (uint32_t)blocksize) {  // loop through blocks
            for (uint32_t element = 0; element < num; element++) { // loop through elements within block
                uint32_t offset = block + (element << dsizelog);   // byte offset of current element
                uint64_t index;                            // index to source element
                if (constPat) {  // get index from constant
                    index = (pattern >> (element&7)*4) & 0xF; // index to select block element
                }
                else {  // get index from vector
                    index = t->readVectorElement(vpat, offset);
                }
                uint64_t value = 0;                        // index out of range or mask = 0
                // rm >= 7 means no mask, the same test as in broadcast_32 and interleave
                if (index < num && (rm >= 7 || (t->readVectorElement(rm, offset) & 1))) { // check mask
                    // Copy only the bytes of the element. A full 8-byte load through a cast pointer
                    // could read past the end of the buffer and is a misaligned access.
                    // The low-order bytes are filled first, as on a little-endian host.
                    memcpy(&value, source + block + ((uint32_t)index << dsizelog), copySize);
                }
                t->writeVectorElement(rd, value, offset);  // write destination
            }
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
1568
 
1569
/*
1570
static uint64_t replace_bits (CThread * t) {
1571
    // Replace a group of contiguous bits in RT by a specified constant
1572
    SNum a = t->parm[1];                         // input operand
1573
    SNum b = t->parm[2];                         // input constant
1574
    uint64_t val = b.s;                          // value of replacement bits
1575
    uint8_t  pos = uint8_t(b.i >> 16);           // position of replacement
1576
    uint8_t  num = uint8_t(b.i >> 24);           // number of consecutive bits to replace
1577
    uint64_t mask = ((uint64_t)1 << num) - 1;    // mask with num 1-bits
1578
    return (a.q & ~(mask<<pos)) | ((val & mask) << pos);
1579
}*/
1580
 
1581
// Format 2.5 A. Single format instructions with memory operands or mixed register types
1582
 
1583
static uint64_t store_i32 (CThread * t) {
    // Write the 32-bit constant IM2 to the memory operand at t->memAddress.
    // Zero is written instead when bit 0 of the mask (parm[3]) is clear.
    const bool maskOn = (t->parm[3].b & 1) != 0;           // check mask
    const uint64_t stored = maskOn ? t->parm[2].q : 0;
    t->writeMemoryOperand(stored, t->memAddress);
    t->returnType = (t->returnType & 7) | 0x20;
    t->running = 2;                                        // don't save RD
    return 0;
}
1592
 
1593
//static uint64_t fence_ (CThread * t) {return f_nop(t);}
1594
 
1595
static uint64_t compare_swap (CThread * t) {
    // Compare and exchange with the memory operand at t->memAddress.
    // If the memory value equals parm[0] within the operand size, parm[1] is written to memory.
    // The previous memory value is returned in either case.
    // to do: use intrinsic compareandexchange or mutex or pause all threads if multiple threads
    const uint64_t expected    = t->parm[0].q;
    const uint64_t replacement = t->parm[1].q;
    const uint64_t where       = t->memAddress;
    const uint64_t sizeBits    = dataSizeMask[t->operandType]; // mask for operand size
    const uint64_t previous    = t->readMemoryOperand(where);  // read value from memory
    const bool matches = ((previous ^ expected) & sizeBits) == 0;
    if (matches) {
        t->writeMemoryOperand(replacement, where);         // write new value to memory
    }
    t->vect = 4;                                           // stop vector loop
    return previous;                                       // return old value
}
1609
 
1610
static uint64_t read_insert (CThread * t) {
    // Replace one element in vector RD with the scalar memory operand at t->memAddress.
    // The element index is taken from the general purpose register in operands[4].
    // Nothing is written if the element lies outside the current length of RD.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint64_t value = t->readMemoryOperand(t->memAddress);
    uint64_t index = t->registers[rs];                     // element index
    uint64_t length = t->vectorLength[rd];                 // current length of RD in bytes
    // Test the index alone first. A very large index could otherwise wrap around in the
    // 64-bit multiplication and pass the bounds check with a small byte offset.
    if (index < length && index * elementSize < length) {
        t->writeVectorElement(rd, value, (uint32_t)(index * elementSize));
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
1624
 
1625
static uint64_t extract_store (CThread * t) {
    // Extract one element of size OS from vector RD into the memory operand at t->memAddress.
    // The element index is taken from the general purpose register in operands[4].
    // Zero is stored if the element lies outside the current length of RD.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint32_t elementSize = dataSizeTable[t->operandType];
    uint64_t index = t->registers[rs];                     // element index
    uint64_t length = t->vectorLength[rd];                 // current length of RD in bytes
    uint64_t value = 0;                                    // stored when the index is out of range
    // Test the index alone first so that the multiplication cannot wrap around.
    // Without a bounds check the byte offset would be truncated to 32 bits and could
    // address data outside the vector.
    if (index < length && index * elementSize < length) {
        value = t->readVectorElement(rd, (uint32_t)(index * elementSize));
    }
    t->writeMemoryOperand(value, t->memAddress);
    t->returnType = (t->returnType & 7) | 0x20;            // debug return type is memory
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    t->vectorLengthR = elementSize;                        // size of memory destination
    return 0;
}
1639
 
1640
 
1641
// Format 2.2.6 E. Four vector registers
1642
 
1643
static uint64_t concatenate (CThread * t) {
    // A vector RU of length RT and a vector RS of length RT are concatenated into a vector RD of length 2*RT.
    // Both lengths are limited to MaxVectorLength, so the part taken from RS may be shortened.
    // RD may be the same register as RU or RS.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];
    uint8_t  ru = t->operands[3];
    uint8_t  rs = t->operands[4];
    uint8_t  rt = t->operands[5];
    uint64_t length1 = t->registers[rt];
    if (length1 > t->MaxVectorLength) length1 = t->MaxVectorLength;
    uint32_t length2 = 2 * (uint32_t)length1;
    if (length2 > t->MaxVectorLength) length2 = t->MaxVectorLength;
    t->vectorLength[rd] = length2;                                   // set length of destination vector
    int8_t * source1 = t->vectors.buf() + ru*t->MaxVectorLength;     // address of RU data
    int8_t * source2 = t->vectors.buf() + rs*t->MaxVectorLength;     // address of RS data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    // Copy the upper half first. If RD is the same register as RS, writing the lower half
    // first would overwrite the RS data before it has been copied.
    // memmove is used because source and destination can overlap when registers coincide.
    memmove(destination + (uint32_t)length1, source2, length2 - (uint32_t)length1);  // copy from RS
    memmove(destination, source1, (uint32_t)length1);                // copy from RU
    t->vect = 4;                                                     // stop vector loop
    t->running = 2;                                                  // don't save RD
    return 0;
}
1663
 
1664
static uint64_t interleave (CThread * t) {
    // Interleave elements of vectors RU and RS of length RT/2 to produce vector RD of length RT.
    // Even-numbered elements of the destination come from RU and odd-numbered elements from RS.
    // Elements where the mask bit is 0 are set to zero.
    // RD may be the same register as RU or RS.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];                // destination
    uint8_t  ru = t->operands[3];                // first input vector
    uint8_t  rs = t->operands[4];                // second input vector
    uint8_t  rt = t->operands[5];                // length
    uint8_t  rm = t->operands[1];                // mask
    uint64_t length = t->registers[rt];
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    length = length >> dsizelog << dsizelog;     // round down to nearest multiple of element size
    t->vectorLength[rd] = (uint32_t)length;      // set length of destination
    uint32_t count = (uint32_t)(length >> dsizelog); // number of destination elements
    // Destination element k is taken from source element k/2, which never lies above k.
    // Going from the last element to the first therefore never overwrites a source
    // element that is still to be read when RD is the same register as RU or RS.
    for (uint32_t k = count; k-- > 0; ) {
        uint32_t pos2 = k << dsizelog;           // byte offset in destination
        uint32_t pos1 = (k >> 1) << dsizelog;    // byte offset in source
        uint64_t value = t->readVectorElement((k & 1) ? rs : ru, pos1); // odd from RS, even from RU
        if (rm < 7 && (t->readVectorElement(rm, pos2) & 1) == 0) value = 0; // mask is 0
        t->writeVectorElement(rd, value, pos2);
    }
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save RD
    return 0;
}
1697
 
1698
 
1699
// Format 2.2.7 E. Three vector registers and a 16 bit immediate
1700
 
1701
static uint64_t move_bits (CThread * t) {
    // Replace one or more contiguous bits at one position of src1 with contiguous bits from another position of src2
    // Format 2.0.7 E: general purpose registers
    // Format 2.2.7 E: vector registers
    // The position in src2 is the lower 8 bits of IM2. a = IM2 & 0xFF.
    // The position in src1 is the upper 8 bits of IM2. b = IM2 >> 8.
    // The number of bits to move is c = IM3.
    // A position of 64 or more lies outside the 64-bit value:
    // nothing is replaced if b >= 64, and zero bits are inserted if a >= 64.
    SNum s1 = t->parm[0];                        // input operand src1
    SNum s2 = t->parm[1];                        // input operand src2
    SNum im = t->parm[4];                        // input operand IM2
    SNum mask = t->parm[3];                      // mask register
    uint8_t c = t->pInstr->a.im3;                // input operand IM3 = number of bits
    uint8_t pos1 = im.s >> 8;                    // bit position in src1
    uint8_t pos2 = im.b;                         // bit position in src2
    // mask of c bits. The original code relies on c being max 63; a larger count selects all bits
    uint64_t bitmask = (c < 64) ? ((uint64_t)1 << c) - 1 : ~(uint64_t)0;
    // every shift count is tested first, because a shift by 64 or more is undefined
    uint64_t field = (pos2 < 64) ? (s2.q >> pos2) & bitmask : 0;  // bits taken from src2
    uint64_t result = s1.q;
    if (pos1 < 64) {
        result = (s1.q & ~(bitmask << pos1)) | (field << pos1);
    }
    if ((mask.b & 1) == 0) {                     // single format instructions with template E must handle mask here
        result = s1.q;                           // fallback
        if (t->operands[2] == 31) result = 0;    // fallback = 0
    }
    return result;
}
1723
 
1724
static uint64_t mask_length (CThread * t) {
    // Produce one element of a boolean vector that masks the first part of a vector.
    // The output vector RD gets the same length as the input vector in operands[3].
    // Bit 0 is 1 for elements whose index is below n, the value of the register in operands[4].
    // NOTE(review): the instruction description speaks of the first n bytes, while the
    // code compares the element index with n - confirm which is intended.
    // IM3 contains the following option bits:
    // bit 0 = 1: invert bit 0, so that it is 0 in the first part and 1 in the rest.
    // bit 1 = 1: copy remaining bits from the input operand parm[0].
    // bit 2 = 1: copy remaining bits from the numeric control register.
    // bit 4 = 1: broadcast remaining bits from IM2 into both 32-bit words of the result:
    //    Bit 1-7 of IM2 go to bit 1-7. Bit 8-11 of IM2 go to bit 20-23. Bit 12-15 of IM2 go to bit 26-29.
    // Bits that are not set by any option are zero. Multiple options are OR'ed.
    const uint8_t dest    = t->operands[0];      // destination
    const uint8_t src     = t->operands[3];      // input vector
    const uint8_t lenReg  = t->operands[4];      // register holding n
    const uint8_t options = t->pInstr->a.im3;    // IM3 = options
    const SNum    imm     = t->parm[4];          // IM2
    t->vectorLength[dest] = t->vectorLength[src]; // set length of destination
    t->vectorLengthR = t->vectorLength[dest];
    const uint32_t elementIndex = t->vectorOffset >> dataSizeTableLog[t->operandType];
    const uint64_t enabled = t->registers[lenReg];
    uint8_t lowBit = elementIndex < enabled;     // element is within the first n
    lowBit ^= options & 1;                       // invert option
    uint64_t result = 0;
    if (options & 2) {
        result |= t->parm[0].q;                  // remaining bits from input operand
    }
    if (options & 4) {
        result |= t->numContr;                   // remaining bits from NUMCONTR
    }
    if (options & 0x10) {                        // remaining bits from IM2
        uint32_t word = (imm.b & ~1) | lowBit;   // bit 1-7 -> bit 1-7
        word |= (imm.s & 0xF00) << 12;           // bit 8-11 -> bit 20-23
        word |= (imm.s & 0xF000) << 14;          // bit 12-15 -> bit 26-29
        result |= word | ((uint64_t)word << 32); // copy these bits twice
    }
    return (result & ~(uint64_t)1) | lowBit;     // combine
}
1760
 
1761
static uint64_t truth_tab3 (CThread * t) {
    // Bitwise boolean function of three inputs, given by a truth table.
    // For each bit position, the bits of the three operands form a 3-bit index
    // (first operand = bit 0, second = bit 1, third = bit 2) that selects one bit of IM2.
    // Option bits in IM3:
    // bit 0 or bit 1 set: only bit 0 of the result is calculated.
    // bit 1 set: the remaining bits are taken from the mask register or numcontr (parm[3]).
    SNum a = t->parm[0];                         // first operand
    SNum b = t->parm[1];                         // second operand
    SNum c = t->parm[2];                         // third operand
    SNum mask = t->parm[3];                      // mask register
    uint32_t table = t->pInstr->a.im2;           // truth table
    uint8_t  options = t->pInstr->a.im3;         // option bits

    uint32_t dataSize = dataSizeTableBits[t->operandType]; // number of bits
    // The operands are handled as 64-bit values here. The width is limited to 64
    // so that the shifts below stay defined even if the table reports a wider operand.
    if (dataSize > 64) dataSize = 64;
    if (options & 3) dataSize = 1;               // only a single bit
    uint64_t result = 0;                         // calculate result

    for (uint32_t i = 0; i < dataSize; i++) {    // loop through bits
        uint8_t index = 0;                       // index into truth table
        if (a.q >> i & 1) index  = 1;
        if (b.q >> i & 1) index |= 2;
        if (c.q >> i & 1) index |= 4;
        uint64_t bit = table >> index & 1;       // lookup in truth table
        result |= bit << i;                      // insert bit into result
    }
    if (options & 2) {                           // take remaining bits from mask or numcontr
        result |= mask.q & ~(uint64_t)1;
    }
    return result;
}
1788
 
1789
static uint64_t repeat_block (CThread * t) {
    // Repeat a block of data to make a longer vector.
    // The register in operands[3] is the input vector containing the data block to repeat.
    // IM2 is length in bytes of the block to repeat (must be a nonzero multiple of 4).
    // The register in operands[4] holds the length of destination vector RD.
    // The destination may be the same register as the input vector.
    // INT_WRONG_PARAMETERS is raised for an invalid block length.
    // Nothing is copied if the block length is zero.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[3];
    uint8_t  rt = t->operands[4];
    uint32_t blen = t->parm[4].i;                // block length
    uint64_t length = t->registers[rt];          // length of destination
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    if (blen > t->MaxVectorLength) blen = t->MaxVectorLength;
    // Read the source length before the destination length is set. The two are the same
    // entry when rd == rs, and the zero fill below would otherwise be skipped.
    uint32_t sourceLength = t->vectorLength[rs];
    t->vectorLength[rd] = (uint32_t)length;      // set length of destination
    if (blen == 0 || (blen & 3)) t->interrupt(INT_WRONG_PARAMETERS);  // must be a nonzero multiple of 4
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    if (length > sourceLength) { // reading beyond the end of the source vector. make sure the rest is zero
        memset(source + sourceLength, 0, size_t(length - sourceLength));
    }
    if (blen != 0) {             // a zero block length would never advance pos
        for (uint32_t pos = 0; pos < length; pos += blen) {  // loop through blocks
            uint32_t blen2 = blen;
            if (pos + blen2 > length) blen2 = (uint32_t)length - pos;  // avoid last block going too far
            // memmove because the first block is copied onto itself when rd == rs
            memmove(destination + pos, source, blen2);     // copy block
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
1817
 
1818
static uint64_t repeat_within_blocks (CThread * t) {
    // Broadcast the first element of each block of data in a vector to the entire block.
    // The register in operands[3] is the input vector containing data blocks.
    // IM2 is length in bytes of each block (must be a nonzero multiple of the operand size).
    // The register in operands[4] holds the length of destination vector RD.
    // The operand size must be at least 4 bytes.
    // The destination may be the same register as the input vector.
    // INT_WRONG_PARAMETERS is raised for an invalid block length or operand size.
    // Nothing is copied if the block length is zero.
    // The vector is written here directly, so the returned value is not stored.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[3];
    uint8_t  rt = t->operands[4];
    uint32_t blen = t->parm[4].i;                // block length
    uint64_t length = t->registers[rt];          // length of destination
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;
    if (blen > t->MaxVectorLength) blen = t->MaxVectorLength;
    // Read the source length before the destination length is set. The two are the same
    // entry when rd == rs, and the zero fill below would otherwise be skipped.
    uint32_t sourceLength = t->vectorLength[rs];
    t->vectorLength[rd] = (uint32_t)length;      // set length of destination
    uint32_t elementSize = dataSizeTable[t->operandType];
    if (elementSize < 4 || blen == 0 || (blen & (elementSize - 1))) {
        t->interrupt(INT_WRONG_PARAMETERS);      // must be a nonzero multiple of elementsize
    }
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;      // address of RS data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    if (length > sourceLength) { // reading beyond the end of the source vector. make sure the rest is zero
        memset(source + sourceLength, 0, size_t(length - sourceLength));
    }
    if (blen != 0) {             // a zero block length would never advance pos
        for (uint32_t pos = 0; pos < length; pos += blen) {  // loop through blocks
            uint32_t blen2 = blen;
            if (pos + blen2 > length) blen2 = (uint32_t)length - pos;  // avoid last block going too far
            for (uint32_t i = 0; i < blen2; i += elementSize) {  // loop within block
                uint32_t n = elementSize;
                if (n > blen2 - i) n = blen2 - i;              // do not write beyond the end of the block
                // memmove because the first element is copied onto itself when rd == rs
                memmove(destination + pos + i, source + pos, n);  // copy first element
            }
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
1850
 
1851
 
1852
// tables of single format instructions
1853
 
1854
// Format 1.3 B. Two vector registers and a broadcast 8-bit immediate operand.
1855
// Table of execution functions. An entry of 0 is a null pointer: no function is listed for that slot.
// NOTE(review): presumably indexed by the op code of the instruction - confirm against the dispatcher.
PFunc funcTab7[64] = {
1856
    gp2vec, vec2gp, 0, make_sequence, insert_, extract_, compress, expand,          // 0  - 7
1857
    0, 0, 0, 0, float2int, int2float, round_, round2n,                              // 8 - 15
1858
    abs_, fp_category, broad_, broad_, byte_reverse, bitscan_, popcount_, 0,        // 16 - 23
1859
    0, bool2bits, bool_reduce, 0, 0, 0, 0, 0,                                       // 24 - 31
1860
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 32 - 39
1861
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 40 - 47
1862
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 48 - 55
1863
    push_v, pop_v, clear_, 0, 0, 0, 0, 0,                                           // 56 - 63
1864
};
1865
 
1866
// Format 1.4 C. One vector register and a broadcast 16-bit immediate operand.
1867
// Table of execution functions. An entry of 0 is a null pointer: no function is listed for that slot.
// Each *_8shift8 and *_half2* function fills one slot per operand size.
// NOTE(review): presumably indexed by the op code of the instruction - confirm against the dispatcher.
PFunc funcTab8[64] = {
1868
    move_i16, f_add, and_i16, or_i16, xor_i16, 0, 0, 0,               // 0 - 7
1869
    move_8shift8, move_8shift8, add_8shift8, add_8shift8, and_8shift8, and_8shift8, or_8shift8, or_8shift8, // 8 - 15
1870
    xor_8shift8, xor_8shift8, 0, 0, 0, 0, 0, 0,                                     // 16 - 23
1871
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 24 - 31
1872
    move_half2float, move_half2double, add_half2float, add_half2double, mul_half2float, mul_half2double, 0, 0,  // 32 - 39
1873
    add_h16, mul_h16, 0, 0, 0, 0, 0, 0,                                             // 40 - 47
1874
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 48 - 55
1875
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 56 - 63
1876
};
1877
 
1878
// Format 2.5 A. Single format instructions with memory operands or mixed register types
1879
// Table of execution functions. An entry of 0 is a null pointer: no function is listed for that slot.
// Only the first 48 of the 64 entries are written out; the remaining entries are
// zero-initialized by the rules of aggregate initialization.
// NOTE(review): presumably indexed by the op code of the instruction - confirm against the dispatcher.
PFunc funcTab10[64] = {
1880
    0, 0, 0, 0, 0, 0, 0, 0,                                                 // 0 - 7
1881
    store_i32, 0, 0, 0, 0, 0, 0, 0,                                         // 8 - 15
1882
    f_nop, 0, compare_swap, 0, 0, 0, 0, 0,                                  // 16 - 23
1883
    0, 0, 0, 0, 0, 0, 0, 0,                                                 // 24 - 31
1884
    read_insert, 0, 0, 0, 0, 0, 0, 0,                                       // 32 - 39
1885
    extract_store, 0, 0, 0, 0, 0, 0, 0,                                     // 40 - 47
1886
};
1887
 
1888
 
1889
// Format 2.6 A. Three vector registers and a 32-bit immediate operand.
1890
// Table of execution functions. An entry of 0 is a null pointer: no function is listed for that slot.
// Only the first 16 of the 64 entries are written out; the remaining entries are
// zero-initialized by the rules of aggregate initialization.
// NOTE(review): presumably indexed by the op code of the instruction - confirm against the dispatcher.
PFunc funcTab11[64] = {
1891
    load_hi, insert_hi, make_mask, replace_, replace_even, replace_odd, broadcast_32, 0, // 0 - 7
1892
    permute, 0, 0, 0, 0, 0, 0, 0                                               // 8 - 15
1893
};
1894
 
1895
// Format 3.1 A. Three vector registers and a 64-bit immediate operand.
1896
// Table of execution functions. An entry of 0 is a null pointer: no function is listed for that slot.
// Only the first 40 of the 64 entries are written out; the remaining entries are
// zero-initialized by the rules of aggregate initialization.
// replace_ and broadcast_32 are the same functions that are listed in funcTab11.
// NOTE(review): presumably indexed by the op code of the instruction - confirm against the dispatcher.
PFunc funcTab13[64] = {
1897
    0, 0, 0, 0, 0, 0, 0, 0,                                                    // 0 - 7
1898
    0, 0, 0, 0, 0, 0, 0, 0,                                                    // 8 - 15
1899
    0, 0, 0, 0, 0, 0, 0, 0,                                                    // 16 - 23
1900
    0, 0, 0, 0, 0, 0, 0, 0,                                                    // 24 - 31
1901
    replace_, broadcast_32, 0, 0, 0, 0, 0, 0,                                  // 32 - 39
1902
};
1903
 
1904
 
1905
// Dispatch functions for single format instruction with E template.
1906
// (full tables of all possible single format instruction with E template would 
1907
//  be too large with most places unused).
1908
 
1909
// Format 2.0.6 E. Four general purpose registers
1910
static uint64_t dispatch206_1 (CThread * t) {
    // Select the execution function from the op code.
    // Any op code that is not listed raises INT_UNKNOWN_INST and gives 0.
    if (t->op == 8) {
        return truth_tab3(t);
    }
    t->interrupt(INT_UNKNOWN_INST);
    return 0;
}
1918
 
1919
 
1920
// Format 2.0.7 E. Three general purpose registers and a 16-bit immediate constant
1921
static uint64_t dispatch207_1 (CThread * t) {
1922
    switch (t->op) {
1923
    case 0: return move_bits(t);
1924
    default:
1925
        t->interrupt(INT_UNKNOWN_INST);
1926
    }
1927
    return 0;
1928
}
1929
 
1930
// Format 2.2.6 E. Four vector registers
1931
static uint64_t dispatch226_1 (CThread * t) {
1932
    switch (t->op) {
1933
    case 0: return concatenate(t);
1934
    case 1: return permute(t);
1935
    case 2: return interleave(t);
1936
    case 8: return truth_tab3(t);
1937
    default:
1938
        t->interrupt(INT_UNKNOWN_INST);
1939
    }
1940
    return 0;
1941
}
1942
 
1943
// Format 2.2.7 E. Three vector registers and a 16-bit immediate constant
1944
static uint64_t dispatch227_1 (CThread * t) {
1945
    switch (t->op) {
1946
    case 0: return move_bits(t);
1947
    case 1: return mask_length(t);
1948
    case 8: return repeat_block(t);
1949
    case 9: return repeat_within_blocks(t);
1950
    default:
1951
        t->interrupt(INT_UNKNOWN_INST);
1952
    }
1953
    return 0;
1954
}
1955
 
1956
// Table of dispatch functions for all possible single format instructions with E template
1957
PFunc EDispatchTable[96] = {
1958
    0, 0, 0, 0, 0, 0, dispatch206_1, dispatch207_1,        // 2.0.x i.1
1959
    0, 0, 0, 0, 0, 0, dispatch226_1, dispatch227_1,        // 2.2.x i.1
1960
    0, 0, 0, 0, 0, 0, 0, 0,                                // 3.0.x i.1
1961
    0, 0, 0, 0, 0, 0, 0, 0,                                // 3.2.x i.1
1962
 
1963
    0, 0, 0, 0, 0, 0, 0, 0,                                // 2.0.x i.2
1964
    0, 0, 0, 0, 0, 0, 0, 0,                                // 2.2.x i.2
1965
    0, 0, 0, 0, 0, 0, 0, 0,                                // 3.0.x i.2
1966
    0, 0, 0, 0, 0, 0, 0, 0,                                // 3.2.x i.2
1967
 
1968
    0, 0, 0, 0, 0, 0, 0, 0,                                // 2.0.x i.3
1969
    0, 0, 0, 0, 0, 0, 0, 0,                                // 2.2.x i.3
1970
    0, 0, 0, 0, 0, 0, 0, 0,                                // 3.0.x i.3
1971
    0, 0, 0, 0, 0, 0, 0, 0                                 // 3.2.x i.3
1972
};

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.