OpenCores
URL https://opencores.org/ocsvn/forwardcom/forwardcom/trunk

Subversion Repositories forwardcom

[/] [forwardcom/] [bintools/] [emulator4.cpp] - Blame information for rev 166

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 55 Agner
/****************************  emulator4.cpp  ********************************
2
* Author:        Agner Fog
3
* date created:  2018-02-18
4
* Last modified: 2021-08-05
5
* Version:       1.11
6
* Project:       Binary tools for ForwardCom instruction set
7
* Description:
8
* Emulator: Execution functions for single format instructions, part 1
9
*
10
* Copyright 2018-2021 GNU General Public License http://www.gnu.org/licenses
11
*****************************************************************************/
12
 
13
#include "stdafx.h"
14
 
15
 
16
// Format 1.0 A. Three general purpose registers
17
 
18
// Currently no instructions with format 1.0
19
 
20
 
21
// Format 1.1 C. One general purpose register and a 16 bit immediate operand. int64
22
 
23
static uint64_t move_16s(CThread * t) {
    // Format 1.1 C: move a 16-bit sign-extended constant to a general purpose register.
    // The result is the full 64-bit view of operand 2, returned unchanged.
    const uint64_t constant = t->parm[2].q;
    return constant;
}
27
 
28
static uint64_t move_16u(CThread * t) {
    // Format 1.1 C: move a 16-bit zero-extended constant to a general purpose register.
    // Only the 's' view of operand 2 is used; it is widened to 64 bits on return.
    const uint64_t constant = t->parm[2].s;
    return constant;
}
32
 
33
static uint64_t shift16_add(CThread * t) {
    // Shift the 16-bit unsigned constant left by 16 and add.
    // Operand 2 is rewritten in place so that f_add sees the shifted constant.
    SNum & constant = t->parm[2];
    constant.q = constant.q << 16;
    return f_add(t);
}
38
 
39
static uint64_t shifti1_move(CThread * t) {
    // RD = IM2 << IM1.
    // Operand 2 packs both immediates: the low byte (b) is the unsigned shift count IM1,
    // the bits above it are the signed value IM2, recovered by shifting right by 8.
    SNum packed = t->parm[2];
    return (packed.qs >> 8) << packed.b;
}
43
 
44
static uint64_t shifti1_add(CThread * t) {
    // RD += IM2 << IM1.
    // The low byte of operand 2 is the unsigned shift count IM1, the bits above it are
    // the signed value IM2. The shifted constant replaces operand 2 before f_add runs.
    SNum packed = t->parm[2];
    t->parm[2].q = (packed.qs >> 8) << packed.b;
    return f_add(t);
}
49
 
50
static uint64_t shifti1_and(CThread * t) {
    // RD &= IM2 << IM1.
    // IM1 is the low byte of operand 2, IM2 is the signed value in the bits above it.
    SNum packed = t->parm[2];
    const uint64_t shifted = (packed.qs >> 8) << packed.b;
    return t->parm[1].q & shifted;
}
54
 
55
static uint64_t shifti1_or(CThread * t) {
    // RD |= IM2 << IM1.
    // IM1 is the low byte of operand 2, IM2 is the signed value in the bits above it.
    SNum packed = t->parm[2];
    const uint64_t shifted = (packed.qs >> 8) << packed.b;
    return t->parm[1].q | shifted;
}
59
 
60
static uint64_t shifti1_xor(CThread * t) {
    // RD ^= IM2 << IM1.
    // IM1 is the low byte of operand 2, IM2 is the signed value in the bits above it.
    SNum packed = t->parm[2];
    const uint64_t shifted = (packed.qs >> 8) << packed.b;
    return t->parm[1].q ^ shifted;
}
64
 
65
// Format 1.8 B. Two general purpose registers and an 8-bit immediate operand. int64
66
 
67
static uint64_t abs_64(CThread * t) {
    // Absolute value of signed integer.
    // IM1 (low byte of operand 2) determines handling of overflow, i.e. when the input
    // is the most negative value of the operand size:
    //   IM1 with bit 2 cleared: 0: wrap around, 1: saturate, 2: zero,
    //                           anything else raises INT_WRONG_PARAMETERS.
    //   bit 2 set:              additionally raise INT_OVERFL_SIGN.
    SNum a = t->parm[1];
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    uint64_t signBit = (sizeMask >> 1) + 1;           // sign bit for this operand size
    if ((a.q & sizeMask) == signBit) {                // overflow: value has no positive counterpart
        if (t->parm[2].b & 4) t->interrupt(INT_OVERFL_SIGN);
        switch (t->parm[2].b & ~4) {
        case 0:  return a.q;                          // wrap around
        case 1:  return sizeMask >> 1;                // saturate to largest positive value
        case 2:  return 0;                            // zero
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
            break;                                    // continue below, as before
        }
    }
    if (a.q & signBit) {  // negative
        // Negate in unsigned arithmetic. This gives the same bit pattern as signed
        // negation, but is well defined even for the 64-bit most negative value,
        // which can reach this point through the default case above.
        a.q = (uint64_t)0 - a.q;
    }
    return a.q;
}
87
 
88
static uint64_t shifti_add(CThread * t) {
    // Shift and add. RD += RS << IM1
    // A shift count that is not smaller than the operand size in bits contributes zero.
    // Overflow detection is not implemented.
    SNum a = t->parm[0];                         // value to add to
    SNum b = t->parm[1];                         // value to shift
    SNum c = t->parm[2];                         // shift count
    uint8_t nbits = dataSizeTableBits[t->operandType];
    uint64_t shifted = 0;                        // shift out of range gives zero
    // Test the count before shifting: shifting a 64-bit value by 64 or more is undefined.
    if (c.q < nbits && c.b < 64) {
        shifted = b.q << c.b;
    }
    return a.q + shifted;                        // add
}
112
 
113
uint64_t bitscan_ (CThread * t) {
    // Bit scan forward or reverse. Find index to first or last set bit in RS.
    // IM1 bit 0:    1 = reverse scan, 0 = forward scan.
    // IM1 bit 4:    result for a zero input: 1 = return -1, 0 = return 0.
    SNum value = t->parm[1];                     // input value
    const uint8_t options = t->parm[2].b;        // immediate operand
    value.q &= dataSizeMask[t->operandType];     // only bits within the operand size count

    if (value.q != 0) {
        if (options & 1) {
            return bitScanReverse(value.q);      // reverse
        }
        return bitScanForward(value.q);          // forward
    }
    // input is zero
    value.qs = (options & 0x10) ? -1 : 0;
    return value.q;
}
131
 
132
static uint64_t roundp2(CThread * t) {
    // Round up or down to nearest power of 2.
    // IM1 bit 0:  1 = round up, 0 = round down.
    // IM1 bit 4:  result for a zero input: 1 = -1, 0 = 0.
    // IM1 bit 5:  result when rounding up overflows: 1 = -1, 0 = 0.
    SNum value = t->parm[1];                     // input operand
    const uint8_t options = t->parm[2].b;        // immediate operand
    value.q &= dataSizeMask[t->operandType];     // mask off unused bits
    if (dataSizeTable[t->operandType] > 8) {
        t->interrupt(INT_WRONG_PARAMETERS);      // illegal operand type
    }

    if (value.q == 0) {
        value.qs = (options & 0x10) ? -1 : 0;
        return value.q;
    }
    if ((value.q & (value.q - 1)) == 0) {
        return value.q;                          // already a power of 2: unchanged
    }
    if (!(options & 1)) {
        // round down: keep only the highest set bit
        return (uint64_t)1 << bitScanReverse(value.q);
    }
    // round up: next bit above the highest set bit
    uint32_t top = bitScanReverse(value.q);
    if (top + 1 >= dataSizeTableBits[t->operandType]) {
        value.qs = (options & 0x20) ? -1 : 0;    // result does not fit in the operand size
        return value.q;
    }
    return (uint64_t)1 << (top + 1);
}
160
 
161
static uint32_t popcount32(uint32_t x) {
    // Count the 1-bits in a 32 bit integer. Used by the popcount_ function.
    // Partial sums are accumulated in parallel in fields of 2, 4, 8 and 16 bits.
    const uint32_t pairs   = x - ((x >> 1) & 0x55555555);
    const uint32_t nibbles = ((pairs >> 2) & 0x33333333) + (pairs & 0x33333333);
    const uint32_t bytes   = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F;
    const uint32_t halves  = (bytes + (bytes >> 8)) & 0x00FF00FF;
    // the low 16 bits now hold the total; discard the partial sum in the high half
    return (uint16_t)(halves + (halves >> 16));
}
169
 
170
uint64_t popcount_ (CThread * t) {
    // Count the number of bits in RS that are 1.
    // Bits outside the operand size are masked off first; the two 32-bit halves
    // are then counted separately and summed.
    SNum value = t->parm[1];
    value.q &= dataSizeMask[t->operandType];
    const uint32_t lowCount  = popcount32(value.i);
    const uint32_t highCount = popcount32(value.q >> 32);
    return lowCount + highCount;
}
176
 
177
static uint64_t read_spec(CThread * t) {
    // Read special register RS into g. p. register RD.
    // Only numcontr, threadp and datap are implemented; any other register number
    // raises INT_WRONG_PARAMETERS and reads as 0.
    const uint8_t source = t->operands[4];       // source register

    if (source == (REG_NUMCONTR & 0x1F)) {
        return t->numContr;                      // numcontr register
    }
    if (source == (REG_THREADP & 0x1F)) {
        return t->threadp;                       // threadp register
    }
    if (source == (REG_DATAP & 0x1F)) {
        return t->datap;                         // datap register
    }
    t->interrupt(INT_WRONG_PARAMETERS);          // other register not implemented
    return 0;
}
200
 
201
static uint64_t write_spec(CThread * t) {
    // Write g. p. register RS to special register RD.
    // Only numcontr, threadp and datap are implemented; any other register number
    // raises INT_WRONG_PARAMETERS. The instruction has no register result
    // (returnType is set to 0).
    const uint8_t target = t->operands[0];       // destination register
    SNum value = t->parm[1];                     // value to write

    if (target == (REG_NUMCONTR & 0x1F)) {
        t->numContr = value.i | 1;               // bit 0 must be set
        if ((t->numContr ^ t->lastMask) & (1<<MSK_SUBNORMAL)) {
            // subnormal status changed since the last write
            enableSubnormals(t->numContr & (1<<MSK_SUBNORMAL));
        }
        t->lastMask = t->numContr;
    }
    else if (target == (REG_THREADP & 0x1F)) {
        t->threadp = value.q;
    }
    else if (target == (REG_DATAP & 0x1F)) {
        t->datap = value.q;
    }
    else {
        t->interrupt(INT_WRONG_PARAMETERS);      // other register not implemented
    }

    t->returnType = 0;
    return 0;
}
230
 
231
static uint64_t read_capabilities(CThread * t) {
    // Read capabilities register into g. p. register RD.
    // A register number outside the implemented range raises INT_WRONG_PARAMETERS
    // and reads as 0.
    const uint8_t index = t->operands[4];        // capabilities register number
    if (index >= number_of_capability_registers) {
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }
    return t->capabilyReg[index];
}
242
 
243
static uint64_t write_capabilities(CThread * t) {
    // Write g. p. register to capabilities register RD.
    // A register number outside the implemented range raises INT_WRONG_PARAMETERS.
    // The instruction has no register result (returnType is set to 0).
    const uint8_t index = t->operands[0];        // capabilities register number
    if (index < number_of_capability_registers) {
        t->capabilyReg[index] = t->parm[1].q;
    }
    else {
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    t->returnType = 0;
    return 0;
}
256
 
257
static uint64_t read_perf(CThread * t) {
    // Read performance counter.
    // operands[4] selects the counter group; the low byte of operand 2 (selector)
    // picks the counter inside the group, or which counters to reset.
    // Within groups 2, 5 and 16 selector 0 resets the whole group; an unlisted
    // selector reads as 0. An unknown group raises INT_WRONG_PARAMETERS.
    const uint8_t group = t->operands[4];        // performance register number
    const uint8_t selector = t->parm[2].b;       // second operand

    switch (group) {
    case 0:  // reset performance counters selected by bits of selector
        if (selector & 1) {
            t->perfCounters[perf_cpu_clock_cycles] = 0;
        }
        if (selector & 2) {
            t->perfCounters[perf_instructions] = 0;
            t->perfCounters[perf_2size_instructions] = 0;
            t->perfCounters[perf_3size_instructions] = 0;
            t->perfCounters[perf_gp_instructions] = 0;
            t->perfCounters[perf_gp_instructions_mask0] = 0;
        }
        if (selector & 4) {
            t->perfCounters[perf_vector_instructions] = 0;
        }
        if (selector & 8) {
            t->perfCounters[perf_control_transfer_instructions] = 0;
            t->perfCounters[perf_direct_jumps] = 0;
            t->perfCounters[perf_indirect_jumps] = 0;
            t->perfCounters[perf_cond_jumps] = 0;
        }
        return 0;

    case 1: {  // CPU clock cycles
        const uint64_t cycles = t->perfCounters[perf_cpu_clock_cycles];
        if (selector == 0) t->perfCounters[perf_cpu_clock_cycles] = 0;
        return cycles;
    }

    case 2:  // number of instructions
        if (selector == 0) {
            // read the total, then reset all instruction counters
            const uint64_t total = t->perfCounters[perf_instructions];
            t->perfCounters[perf_instructions] = 0;
            t->perfCounters[perf_2size_instructions] = 0;
            t->perfCounters[perf_3size_instructions] = 0;
            t->perfCounters[perf_gp_instructions] = 0;
            t->perfCounters[perf_gp_instructions_mask0] = 0;
            return total;
        }
        if (selector == 1) return t->perfCounters[perf_instructions];
        if (selector == 2) return t->perfCounters[perf_2size_instructions];
        if (selector == 3) return t->perfCounters[perf_3size_instructions];
        if (selector == 4) return t->perfCounters[perf_gp_instructions];
        if (selector == 5) return t->perfCounters[perf_gp_instructions_mask0];
        return 0;

    case 3: {  // number of vector instructions
        const uint64_t vectorCount = t->perfCounters[perf_vector_instructions];
        if (selector == 0) t->perfCounters[perf_vector_instructions] = 0;
        return vectorCount;
    }

    case 4: {  // vector registers in use: one bit per register with nonzero length
        uint64_t inUse = 0;
        for (int iv = 0; iv < 32; iv++) {
            if (t->vectorLength[iv] > 0) inUse |= (uint64_t)1 << iv;
        }
        return inUse;
    }

    case 5:  // jumps, calls, and returns
        if (selector == 0) {
            // read the total, then reset all control transfer counters
            const uint64_t total = t->perfCounters[perf_control_transfer_instructions];
            t->perfCounters[perf_control_transfer_instructions] = 0;
            t->perfCounters[perf_direct_jumps] = 0;
            t->perfCounters[perf_indirect_jumps] = 0;
            t->perfCounters[perf_cond_jumps] = 0;
            return total;
        }
        if (selector == 1) return t->perfCounters[perf_control_transfer_instructions]; // all jumps, calls, returns
        if (selector == 2) return t->perfCounters[perf_direct_jumps];   // direct unconditional jumps, calls, returns
        if (selector == 3) return t->perfCounters[perf_indirect_jumps];
        if (selector == 4) return t->perfCounters[perf_cond_jumps];
        return 0;

    case 16:  // error counters
        switch (selector) {
        case 0:    // reset all error counters
            t->perfCounters[perf_unknown_instruction] = 0;
            t->perfCounters[perf_wrong_operands] = 0;
            t->perfCounters[perf_array_overflow] = 0;
            t->perfCounters[perf_read_violation] = 0;
            t->perfCounters[perf_write_violation] = 0;
            t->perfCounters[perf_misaligned] = 0;
            t->perfCounters[perf_address_of_first_error] = 0;
            t->perfCounters[perf_type_of_first_error] = 0;
            return 0;
        case 1:    // unknown instructions
            return t->perfCounters[perf_unknown_instruction];
        case 2:    // wrong operands for instruction
            return t->perfCounters[perf_wrong_operands];
        case 3:    // array index out of bounds
            return t->perfCounters[perf_array_overflow];
        case 4:    // memory read access violation
            return t->perfCounters[perf_read_violation];
        case 5:    // memory write access violation
            return t->perfCounters[perf_write_violation];
        case 6:    // memory access misaligned
            return t->perfCounters[perf_misaligned];
        case 62:   // address of first error
            return t->perfCounters[perf_address_of_first_error];
        case 63:   // type of first error
            return t->perfCounters[perf_type_of_first_error];
        default:
            return 0;
        }

    default:
        t->interrupt(INT_WRONG_PARAMETERS);
        break;
    }
    return 0;
}
398
 
399
static uint64_t read_sys(CThread * t) {
    // Read system register RS into g. p. register RD.
    // Not supported yet: always raises INT_WRONG_PARAMETERS and yields 0.
    const uint64_t noValue = 0;
    t->interrupt(INT_WRONG_PARAMETERS);
    return noValue;
}
404
 
405
static uint64_t write_sys(CThread * t) {
    // Write g. p. register RS to system register RD.
    // Not supported yet: always raises INT_WRONG_PARAMETERS.
    // The instruction has no register result (returnType is set to 0).
    const uint64_t noValue = 0;
    t->interrupt(INT_WRONG_PARAMETERS);
    t->returnType = 0;
    return noValue;
}
411
 
412
static uint64_t push_r(CThread * t) {
    // Push one or more g.p. registers on a stack pointed to by rd.
    // Operand 4 holds the last register number in bits 0-4; bit 7 selects the
    // direction: set = pointer moves up, clear = pointer moves down.
    // The pointer is adjusted by one operand size before each store, and the
    // final pointer is written back to the pointer register and returned.
    const int32_t elementSize = dataSizeTable[t->operandType];
    const bool upward = (t->parm[4].i & 0x80) != 0;
    const int32_t step = upward ? elementSize : -elementSize;

    const uint8_t pointerReg = t->operands[0] & 0x1F;  // pointer register
    const uint8_t firstReg = t->operands[4] & 0x1F;    // first push register
    const uint8_t lastReg = t->parm[4].i & 0x1F;       // last push register

    uint64_t pointer = t->registers[pointerReg];
    // registers are pushed in ascending order
    uint8_t reg = firstReg;
    while (reg <= lastReg) {
        pointer += (int64_t)step;
        uint64_t value = t->registers[reg];
        t->writeMemoryOperand(value, pointer);
        t->listResult(value);
        reg++;
    }
    t->registers[pointerReg] = pointer;
    return pointer;
}
431
 
432
static uint64_t pop_r(CThread * t) {
    // Pop one or more g.p. registers from a stack pointed to by rd.
    // Operand 4 holds the last register number in bits 0-4; bit 7 selects the
    // direction: set = pointer moves down, clear = pointer moves up (the opposite
    // of push_r). Each value is loaded before the pointer is adjusted by one
    // operand size. The final pointer is written back to the pointer register
    // and returned.
    int32_t step = dataSizeTable[t->operandType];
    if (t->parm[4].i & 0x80) step = -step;
    uint8_t reg0 = t->operands[0] & 0x1F;   // pointer register
    uint8_t reg1 = t->operands[4] & 0x1F;   // first register of the range
    uint8_t reglast = t->parm[4].i & 0x1F;  // last register of the range
    uint64_t pointer = t->registers[reg0];
    // Loop through registers to pop in reverse order.
    // The counter must be signed: with an unsigned 8-bit counter and reg1 == 0 the
    // condition reg >= reg1 is always true, the counter wraps from 0 to 255, and the
    // loop never ends while indexing registers[] out of range.
    for (int reg = reglast; reg >= (int)reg1; reg--) {
        uint64_t value = t->readMemoryOperand(pointer);
        t->registers[reg] = value;
        pointer += (int64_t)step;
        t->listResult(value);
    }
    t->registers[reg0] = pointer;
    return pointer;
}
451
 
452
 
453
// Format 2.9 A. Three general purpose registers and a 32-bit immediate operand
454
 
455
static uint64_t move_hi32(CThread * t) {
    // Load 32-bit constant into the high part of a general purpose register.
    // The low part is zero. RD = IM2 << 32.
    const uint64_t constant = t->parm[2].q;
    return constant << 32;
}
459
 
460
static uint64_t insert_hi32(CThread * t) {
    // Insert 32-bit constant into the high part of a general purpose register,
    // leaving the low part unchanged: the constant from operand 2 is moved to the
    // upper 32 bits and combined with the 32-bit view of operand 1.
    const uint64_t highPart = t->parm[2].q << 32;
    const uint64_t lowPart = t->parm[1].i;
    return highPart | lowPart;
}
464
 
465
static uint64_t add_32u(CThread * t) {
    // Add zero-extended 32-bit constant to general purpose register.
    // Operand 2 is widened in place from its 32-bit view, then f_add does the addition.
    SNum & constant = t->parm[2];
    constant.q = constant.i;
    return f_add(t);
}
470
 
471
static uint64_t sub_32u(CThread * t) {
    // Subtract zero-extended 32-bit constant from general purpose register.
    // Operand 2 is widened in place from its 32-bit view, then f_sub does the subtraction.
    SNum & constant = t->parm[2];
    constant.q = constant.i;
    return f_sub(t);
}
476
 
477
static uint64_t add_hi32(CThread * t) {
    // Add 32-bit constant to high part of general purpose register. RD = RT + (IM2 << 32).
    // Operand 2 is shifted in place, then f_add does the addition.
    SNum & constant = t->parm[2];
    constant.q = constant.q << 32;
    return f_add(t);
}
482
 
483
static uint64_t and_hi32(CThread * t) {
    // AND high part of general purpose register with 32-bit constant. RD = RT & (IM2 << 32).
    const uint64_t highConstant = t->parm[2].q << 32;
    return t->parm[1].q & highConstant;
}
487
 
488
static uint64_t or_hi32(CThread * t) {
    // OR high part of general purpose register with 32-bit constant. RD = RT | (IM2 << 32).
    const uint64_t highConstant = t->parm[2].q << 32;
    return t->parm[1].q | highConstant;
}
492
 
493
static uint64_t xor_hi32(CThread * t) {
    // XOR high part of general purpose register with 32-bit constant. RD = RT ^ (IM2 << 32).
    const uint64_t highConstant = t->parm[2].q << 32;
    return t->parm[1].q ^ highConstant;
}
497
 
498
static uint64_t replace_bits(CThread * t) {
    // Replace a group of contiguous bits in RT by a specified constant.
    // The 32-bit immediate in operand 2 is laid out as:
    //   bits  0-15: value to insert
    //   bits 16-23: start position
    //   bits 24-31: number of bits to replace
    // INT_WRONG_PARAMETERS is raised if more than 32 bits are requested or the
    // bit field would extend beyond bit 63.
    SNum a = t->parm[1];
    SNum b = t->parm[2];
    uint64_t val = b.s;                          // value to insert
    uint8_t  pos = uint8_t(b.i >> 16);           // start position
    uint8_t  num = uint8_t(b.i >> 24);           // number of bits to replace
    if (num > 32 || pos + num > 64) {
        t->interrupt(INT_WRONG_PARAMETERS);
        // Shifting a 64-bit value by 64 or more is undefined, so do not attempt it.
        // NOTE(review): returning the input unchanged here is a choice; confirm the
        // result is irrelevant once the interrupt has been raised.
        if (num >= 64 || pos >= 64) return a.q;
    }
    uint64_t mask = ((uint64_t)1 << num) - 1;    // mask with 'num' 1-bits
    return (a.q & ~(mask << pos)) | ((val & mask) << pos);
}
509
 
510
static uint64_t address_(CThread * t) {
    // RD = RT + IM2, RS can be THREADP (28), DATAP (29) or IP (30).
    // The address has already been computed into t->memAddress; this function only
    // returns it and sets the return type to 0x13.
    const uint64_t address = t->memAddress;
    t->returnType = 0x13;
    return address;
}
515
 
516
// Format 1.2 A. Three vector register operands
517
 
518
static uint64_t set_len(CThread * t) {
    // RD = vector register RS with length changed to value of g.p. register RT
    // set_len: the new length is indicated in bytes
    // set_num: the new length is indicated in elements (selected by bit 0 of t->op)
    // The new length is limited to MaxVectorLength. If the vector grows, the added
    // bytes are zero. The result is written directly to RD, so the caller is told
    // not to save a return value.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t  rt = t->operands[5];
    uint32_t oldLength = t->vectorLength[rs];
    uint64_t newLength = t->registers[rt];
    // Limit before multiplying so that a huge element count cannot wrap around to a
    // small byte count. Any count above the maximum ends at the maximum anyway.
    if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength;
    if (t->op & 1) newLength *= dataSizeTable[t->operandType];  // set_num: multiply by operand size
    if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength;

    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength;
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;
    // number of bytes taken over from RS
    size_t keep = (newLength > oldLength) ? size_t(oldLength) : size_t(newLength);
    if (rd != rs) {
        // memcpy must not be given identical source and destination
        memcpy(destination, source, keep);
    }
    if (newLength > oldLength) {
        memset(destination + oldLength, 0, size_t(newLength - oldLength)); // set the rest to zero
    }
    t->vectorLength[rd] = (uint32_t)newLength;             // set new length
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
541
 
542
static uint64_t get_len(CThread * t) {
    // Get length of a vector register into general purpose register RD
    // get_len: get the length in bytes
    // get_num: get the length in elements (selected by bit 0 of t->op)
    // The result is stored directly in the g.p. register, so the caller is told
    // not to save anything to vector register RD.
    const uint8_t rd = t->operands[0];
    const uint8_t vectorReg = t->operands[4];
    const bool countElements = (t->op & 1) != 0;

    const uint32_t byteLength = t->vectorLength[vectorReg];
    // get_num: divide by operand size (round down)
    const uint32_t length = countElements
        ? byteLength >> dataSizeTableLog[t->operandType]
        : byteLength;

    t->registers[rd] = length;                             // save in g.p. register, not vector register
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save to vector register RD
    t->returnType = 0x12;                                  // debug return output
    return length;
}
556
 
557
uint64_t insert_(CThread * t) {
    // Replace one element in vector RD, starting at offset index * OS, with the
    // first element of the source vector. The index comes from a g.p. register
    // (format 1.2A) or from an immediate (format 1.3B).
    // Called once per element: the element at the current vector offset is replaced
    // when the offset matches the insert position, and left unchanged otherwise.
    const uint8_t rd = t->operands[3];                     // source and destination register
    const uint8_t sourceVector = t->operands[4];           // source register
    const uint8_t dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    const bool is128 = (dsizelog == 4);                    // 128-bit elements need a high part too
    t->vectorLengthR = t->vectorLength[rd];

    uint64_t pos;                                          // byte position of element to insert
    if (t->fInstr->format2 == 0x120) {
        // format 1.2A  v1 = insert(v1, v2, r3)
        pos = t->registers[t->operands[5]] << dsizelog;
    }
    else {
        // format 1.3B  v1 = insert(v1, v2, imm)
        pos = t->parm[2].q << dsizelog;
    }

    if (pos != t->vectorOffset) {
        // not the target element: rd unchanged
        if (is128) {
            t->parm[5].q = t->readVectorElement(rd, t->vectorOffset + 8); // high part of 128-bit result
        }
        return t->parm[0].q;
    }
    // target element: take first element of sourceVector
    if (is128) {
        t->parm[5].q = t->readVectorElement(sourceVector, 8);  // high part of 128-bit result
    }
    return t->readVectorElement(sourceVector, 0);
}
588
 
589
uint64_t extract_(CThread * t) {
    // Extract one element from the source vector, at offset index * OS, with size OS
    // and broadcast into vector register RD. The index comes from a g.p. register
    // (format 0x120) or from an immediate (format 0x130).
    // An index beyond the end of the source vector gives zero.
    // The destination gets the same length as the source vector.
    uint8_t  rd = t->operands[0];                          // destination register
    uint8_t  operandType = t->operandType;                 // operand type
    uint8_t  dsizelog = dataSizeTableLog[operandType];     // log2(elementsize)
    uint8_t  rsource = t->operands[4];                     // source vector
    uint64_t pos;                                          // position = index * OS
    if (t->fInstr->format2 == 0x120) {
        uint8_t  rt = t->operands[5];                      // index register
        pos = t->registers[rt] << dsizelog;
    }
    else {  // format 0x130
        pos = t->parm[4].q << dsizelog;
    }
    uint32_t sourceLength = t->vectorLength[rsource];      // length of source vector
    uint64_t result = 0;                                   // stays zero beyond end of source vector
    if (pos < sourceLength) {
        const int8_t * source = t->vectors.buf() + (uint64_t)rsource * t->MaxVectorLength; // address of rsource data
        // Copy byte-wise: pos is only aligned to the element size, so a direct
        // 64-bit load could be misaligned. Reading too much is no problem, it will
        // be cut off later if the operand size is < 64 bits.
        memcpy(&result, source + pos, sizeof(result));
        if (dsizelog >= 4) {                               // 128 bits
            uint64_t high;
            memcpy(&high, source + pos + 8, sizeof(high));
            t->parm[5].q = high;                           // store high part of 128 bit element
        }
    }
    t->vectorLength[rd] = t->vectorLengthR = sourceLength; // length of destination vector
    return result;
}
619
 
620
 
621
 
622
static uint64_t compress_sparse(CThread * t) {
    // Compress sparse vector elements indicated by mask bits into contiguous vector.
    // Every source element whose mask element has bit 0 set is appended to the
    // destination; the destination length becomes the number of bytes written.
    // The result is written directly to RD, so the caller is told not to save
    // a return value.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[5];         // source vector
    uint8_t  rm = t->operands[1];         // mask vector
    uint32_t sourceLength = t->vectorLength[rt]; // length of source vector
    uint32_t maskLength = t->vectorLength[rm];   // length of mask vector
    uint64_t scanLength = sourceLength;          // number of source bytes to scan
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
    int8_t * source = t->vectors.buf() + rt*t->MaxVectorLength;      // address of RT data
    int8_t * masksrc = t->vectors.buf() + rm*t->MaxVectorLength;     // address of mask data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    // limit length
    if (scanLength > t->MaxVectorLength) scanLength = t->MaxVectorLength;
    if (scanLength > maskLength) scanLength = maskLength;            // no reason to go beyond mask
    // scanLength never exceeds sourceLength here, so no zero padding of the source is needed

    uint32_t pos2 = 0;                           // position in destination vector
    // loop through mask register
    for (uint32_t pos1 = 0; pos1 < scanLength; pos1 += elementSize) {
        if (*(masksrc + pos1) & 1) {             // check mask bit
            // copy from pos1 in source to pos2 in destination
            switch (elementSize) {
            case 1:   // int8
            case 2:   // int16
            case 4:   // int32, float
            case 8:   // int64, double
            case 16:  // int128, float128
                // memmove: source and destination coincide when rd == rt and
                // no element has been skipped yet
                memmove(destination + pos2, source + pos1, elementSize);
                break;
            default:  // other sizes: nothing is copied
                break;
            }
            pos2 += elementSize;
        }
    }
    // set new length of destination vector
    t->vectorLength[rd] = pos2;
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save. result has already been saved
    return 0;
}
675
 
676
static uint64_t expand_sparse(CThread * t) {
    // Expand contiguous vector into sparse vector with positions indicated by mask bits.
    // For each destination element, bit 0 of the corresponding mask element decides
    // whether the next unused source element is stored there or zero is stored.
    // The g.p. register in operands[5] gives the requested destination length.
    // The result is written directly to RD, so the caller is told not to save
    // a return value.
    const uint8_t rd = t->operands[0];           // destination vector
    const uint8_t rs = t->operands[4];           // source vector
    const uint8_t rt = t->operands[5];           // length indicator
    const uint8_t rm = t->operands[1];           // mask vector
    const uint32_t sourceLength = t->vectorLength[rs];               // length of source vector
    const uint32_t maskLength = t->vectorLength[rm];                 // length of mask vector
    const uint32_t elementSize = dataSizeTable[t->operandType & 7];  // size of each element
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;      // address of RS data
    int8_t * masksrc = t->vectors.buf() + rm*t->MaxVectorLength;     // address of mask data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data

    if (rd == rs) {
        // source and destination are the same. Read from a temporary copy of the
        // source so that it is not overwritten while expanding
        memcpy(t->tempBuffer, source, sourceLength);
        source = t->tempBuffer;
    }

    // length of destination, limited to maximum vector length and mask length
    uint64_t newLength = t->registers[rt];
    if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength;
    if (newLength > maskLength) newLength = maskLength;              // no reason to go beyond mask
    if (newLength > sourceLength) {
        // the loop may read beyond the end of the source vector: make sure the rest is zero
        memset(source + sourceLength, 0, size_t(newLength - sourceLength));
    }

    uint32_t pos1 = 0;                           // position in source vector
    uint32_t pos2 = 0;                           // position in destination vector

    // loop through mask register
    for (pos2 = 0; pos2 < newLength; pos2 += elementSize) {
        const bool selected = (*(masksrc + pos2) & 1) != 0;          // check mask bit
        // store the next source element if selected, otherwise zero
        switch (elementSize) {
        case 1:  // int8
            *(destination+pos2) = selected ? *(source+pos1) : 0;
            break;
        case 2:  // int16
            *(uint16_t*)(destination+pos2) = selected ? *(uint16_t*)(source+pos1) : 0;
            break;
        case 4:  // int32, float
            *(uint32_t*)(destination+pos2) = selected ? *(uint32_t*)(source+pos1) : 0;
            break;
        case 8:  // int64, double
            *(uint64_t*)(destination+pos2) = selected ? *(uint64_t*)(source+pos1) : 0;
            break;
        case 16:  // int128, float128
            *(uint64_t*)(destination+pos2)   = selected ? *(uint64_t*)(source+pos1)   : 0;
            *(uint64_t*)(destination+pos2+8) = selected ? *(uint64_t*)(source+pos1+8) : 0;
            break;
        }
        if (selected) pos1 += elementSize;       // source element consumed
    }
    // set new length of destination vector
    t->vectorLength[rd] = pos2;
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save. result has already been saved
    return 0;
}
757
 
758
static uint64_t broad_(CThread * t) {
    // Broadcast a single value into every element of the destination vector RD.
    // Format 0x120: the value is element 0 of vector RS and the length is in g.p. register RT.
    // Other formats: the value is the immediate operand and the length is in the
    // first source operand register.
    // The function only sets the destination length and returns the value;
    // the surrounding vector loop is left running so that it fills all elements.
    const bool     vectorSource = (t->fInstr->format2 == 0x120);
    const uint8_t  target = t->operands[0];      // destination vector register
    uint8_t  lengthReg;                          // g.p. register holding the requested length
    uint64_t scalar;                             // value to replicate

    if (vectorSource) {
        const uint8_t sourceVec = t->operands[4];
        lengthReg = t->operands[5];
        scalar = t->readVectorElement(sourceVec, 0);
    }
    else {
        lengthReg = t->operands[4];
        scalar = t->parm[2].q;
    }

    // requested length in bytes, limited to the maximum vector length
    uint64_t newLength = t->registers[lengthReg];
    if (newLength > t->MaxVectorLength) {
        newLength = t->MaxVectorLength;
    }

    // publish the length for the vector loop and for the destination register
    t->vectorLengthR = (uint32_t)newLength;
    t->vectorLength[target] = t->vectorLengthR;
    return scalar;
}
778
 
779
static uint64_t bits2bool(CThread * t) {
    // The lower n bits of RT are unpacked into a boolean vector RD with length RS
    // with one bit in each element, where n = RS / OS.
    // Bit 0 of each destination element comes from the source; the remaining bits
    // of the element are taken from the mask (parm[3]).
    // Source bits beyond the length of RT are treated as zero, so every element of
    // the destination is written and nothing is read past the end of the source.
    // The result is written directly to the vector register; the return value is unused.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[5];         // RT = source vector
    uint8_t  rs = t->operands[4];         // RS indicates length
    SNum mask = t->parm[3];                      // mask
    uint8_t * source = (uint8_t*)t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    uint64_t destinationLength = t->registers[rs]; // value of RS = length of destination
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    if (destinationLength > t->MaxVectorLength) destinationLength = t->MaxVectorLength; // limit length
    // set length of destination register
    t->vectorLength[rd] = (uint32_t)destinationLength;
    uint32_t num = (uint32_t)destinationLength >> dsizelog; // number of whole elements in destination
    // number of bits available in source
    uint32_t srcnum = t->vectorLength[rt] * 8;
    mask.q &= -(int64_t)2;                       // remove lower bit of mask. it will be replaced by source bit
    // Loop through the destination elements from the top down.
    // Element i occupies bytes at offset (i << dsizelog) and upwards, while the bits
    // still to be read (j < i) are in bytes j/8 < i, so writing in descending
    // order never overwrites unread source bits when RD and RT are the same register.
    for (uint32_t i = num; i-- > 0; ) {
        uint8_t bit = 0;                         // bits beyond the source length are zero
        if (i < srcnum) {
            bit = (source[i / 8] >> (i & 7)) & 1;  // extract single bit from source
        }
        uint8_t * element = destination + ((uint64_t)i << dsizelog); // address of element i
        switch (dsizelog) {
        case 0:  // int8
            *element = mask.b | bit;  break;
        case 1:  // int16
            *(uint16_t*)element = mask.s | bit;  break;
        case 2:  // int32
            *(uint32_t*)element = mask.i | bit;  break;
        case 3:  // int64
            *(uint64_t*)element = mask.q | bit;  break;
        case 4:  // int128
            *(uint64_t*)element = mask.q | bit;
            *(uint64_t*)(element+8) = mask.q | bit;
            break;
        }
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    if ((t->returnType & 7) >= 5) t->returnType -= 3;      // make return type integer
    return 0;
}
823
 
824
 
825
static uint64_t shift_expand(CThread * t) {
    // Shift vector RS up by RT bytes and extend the vector length by RT.
    // The lower RT bytes of RD will be zero.
    // The new length is limited to the maximum vector length; bytes shifted
    // beyond that limit are lost.
    // The result is written directly to the vector register; the return value is unused.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rs = t->operands[4];         // RS = source vector
    uint8_t  rt = t->operands[5];         // RT indicates length
    uint8_t * source = (uint8_t*)t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    uint64_t shiftCount = t->registers[rt];      // value of RT = shift count
    if (shiftCount > t->MaxVectorLength) shiftCount = t->MaxVectorLength; // limit length
    uint32_t sourceLength = t->vectorLength[rs]; // length of source vector
    uint32_t destinationLength = sourceLength + (uint32_t)shiftCount; // length of destination vector
    if (destinationLength > t->MaxVectorLength) destinationLength = t->MaxVectorLength; // limit length
    // set length of destination vector
    t->vectorLength[rd] = destinationLength;
    // Copy from source first. This must happen before the lower part is cleared,
    // because source and destination are the same memory when RD == RS.
    // memmove handles the overlapping ranges.
    if (destinationLength > shiftCount) {
        memmove(destination + shiftCount, source, size_t(destinationLength - shiftCount));
    }
    // set lower part of destination to zero
    memset(destination, 0, size_t(shiftCount));
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save RD. It has already been saved
    return 0;
}
850
 
851
static uint64_t shift_reduce(CThread * t) {
    // Move the contents of vector RS down by RT bytes into RD and shorten the
    // vector by the same number of bytes. The lowest RT bytes of RS are discarded.
    // A count larger than the source length gives an empty destination vector.
    // The result is written directly to the vector register; the return value is unused.
    const uint8_t dstReg = t->operands[0];       // destination vector
    const uint8_t srcReg = t->operands[4];       // RS = source vector
    const uint8_t cntReg = t->operands[5];       // RT = g.p. register with byte count
    uint8_t * const from = (uint8_t*)t->vectors.buf() + srcReg*t->MaxVectorLength; // RS data
    uint8_t * const to   = (uint8_t*)t->vectors.buf() + dstReg*t->MaxVectorLength; // RD data

    const uint32_t available = t->vectorLength[srcReg]; // bytes in source vector
    uint64_t dropped = t->registers[cntReg];     // number of bytes to remove
    if (dropped > available) {
        dropped = available;                     // cannot remove more than we have
    }
    const uint32_t remaining = available - (uint32_t)dropped; // bytes left after removal

    t->vectorLength[dstReg] = remaining;         // new length of destination
    if (remaining != 0) {
        // ranges may overlap when RD and RS are the same register
        memmove(to, from + dropped, remaining);
    }

    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // RD has already been written. don't save again
    return 0;
}
872
 
873
static uint64_t shift_up(CThread * t) {
    // Shift elements of vector RS up RT elements.
    // The lower RT elements of RD will be zero, the upper RT elements of RS are lost.
    // The vector length is unchanged.
    // The result is written directly to the vector register; the return value is unused.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rs = t->operands[4];         // RS = source vector
    uint8_t  rt = t->operands[5];         // RT indicates length
    uint8_t * source = (uint8_t*)t->vectors.buf() + rs * t->MaxVectorLength; // address of RS data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd * t->MaxVectorLength; // address of RD data
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    // Limit the element count before scaling it to bytes. Otherwise the left shift
    // could discard high bits and turn a huge count into a small one.
    uint64_t elementCount = t->registers[rt];    // value of RT = shift count, elements
    if (elementCount > t->MaxVectorLength) elementCount = t->MaxVectorLength;
    uint64_t shiftCount = elementCount << dsizelog; // shift count, bytes
    if (shiftCount > t->MaxVectorLength) shiftCount = t->MaxVectorLength; // limit length
    uint32_t sourceLength = t->vectorLength[rs]; // length of source vector
    t->vectorLength[rd] = sourceLength;          // set length of destination vector to the same as source vector
    // copy from source. This is done before clearing because RD and RS may be the same register
    if (sourceLength > shiftCount) {
        memmove(destination + shiftCount, source, size_t(sourceLength - shiftCount));
    }
    // set lower part of destination to zero
    memset(destination, 0, size_t(shiftCount));
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    return 0;
}
896
 
897
static uint64_t shift_down(CThread * t) {
    // Shift elements of vector RS down RT elements.
    // The upper RT elements of RD will be zero, the lower RT elements of RS are lost.
    // The vector length is unchanged.
    // The result is written directly to the vector register; the return value is unused.
    uint8_t  rd = t->operands[0];                   // destination vector
    uint8_t  rs = t->operands[4];                   // RS = source vector
    uint8_t  rt = t->operands[5];                   // RT indicates length
    uint8_t * source = (uint8_t*)t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    uint32_t sourceLength = t->vectorLength[rs];           // length of source vector
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    // Limit the element count before scaling it to bytes. Otherwise the left shift
    // could discard high bits and turn a huge count into a small one.
    uint64_t elementCount = t->registers[rt];              // value of RT = shift count, elements
    if (elementCount > sourceLength) elementCount = sourceLength;
    uint64_t shiftCount = elementCount << dsizelog;        // shift count, bytes
    if (shiftCount > sourceLength) shiftCount = sourceLength; // limit length
    t->vectorLength[rd] = sourceLength;                    // set length of destination vector
    if (sourceLength > shiftCount) {                       // copy data from source
        memmove(destination, source + shiftCount, size_t(sourceLength - shiftCount));
    }
    if (shiftCount > 0) {                                  // set the rest to zero
        memset(destination + sourceLength - shiftCount, 0, size_t(shiftCount));
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    return 0;
}
920
 
921
/*
922
static uint64_t rotate_up (CThread * t) {
923
    // Rotate vector RT up one element.
924
    uint8_t  rd = t->operands[0];         // destination vector
925
    uint8_t  rt = t->operands[5];         // RT = source vector
926
    //uint8_t  rs = t->operands[4];         // RS indicates length
927
    int8_t * source = t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
928
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
929
    //uint64_t length = t->registers[rs];          // value of RS = vector length
930
    //if (length > t->MaxVectorLength) length = t->MaxVectorLength; // limit length
931
    uint32_t sourceLength = t->vectorLength[rt]; // length of source vector
932
    uint32_t length = sourceLength;
933
    if (rd == rt) {
934
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
935
        memcpy(t->tempBuffer, source, length);
936
        source = t->tempBuffer;
937
    }
938
    if (length > sourceLength) {                 // reading beyond the end of the source vector. make sure the rest is zero
939
        memset(source + sourceLength, 0, size_t(length - sourceLength));
940
    }
941
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
942
    if (elementSize > length) elementSize = (uint32_t)length;
943
    t->vectorLength[rd] = (uint32_t)length;                // set length of destination vector
944
    memcpy(destination, source + length - elementSize, elementSize); // copy top element to bottom
945
    memcpy(destination + elementSize, source, size_t(length - elementSize)); // copy the rest
946
    t->vect = 4;                                           // stop vector loop
947
    t->running = 2;                                        // don't save RD. It has already been saved
948
    return 0;
949
}
950
 
951
static uint64_t rotate_down (CThread * t) {
952
    // Rotate vector RT down one element.
953
    uint8_t  rd = t->operands[0];         // destination vector
954
    uint8_t  rt = t->operands[5];         // RT = source vector
955
    //uint8_t  rs = t->operands[4];         // RS indicates length
956
    int8_t * source = t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
957
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
958
    //uint64_t length = t->registers[rs];          // value of RS = vector length
959
    uint32_t sourceLength = t->vectorLength[rt]; // length of source vector
960
    uint32_t length = sourceLength;
961
    //if (length > t->MaxVectorLength) length = t->MaxVectorLength; // limit length
962
    if (rd == rt) {
963
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
964
        memcpy(t->tempBuffer, source, length);
965
        source = t->tempBuffer;
966
    }
967
    if (length > sourceLength) {                 // reading beyond the end of the source vector. make sure the rest is zero
968
        memset(source + sourceLength, 0, size_t(length - sourceLength));
969
    }
970
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
971
    if (elementSize > length) elementSize = (uint32_t)length;
972
    t->vectorLength[rd] = (uint32_t)length;      // set length of destination vector
973
    memcpy(destination, source + elementSize, size_t(length - elementSize)); // copy down
974
    memcpy(destination + length - elementSize, source, elementSize); // copy the bottom element to top
975
    t->vect = 4;                                           // stop vector loop
976
    t->running = 2;                                        // don't save RD. It has already been saved
977
    return 0;
978
}*/
979
 
980
static uint64_t div_ex (CThread * t) {
    // Divide vector of double-size integers RS by integers RT.
    // RS has element size 2*OS. These are divided by the even numbered elements of RT with size OS.
    // The truncated results are stored in the even-numbered elements of RD.
    // The remainders are stored in the odd-numbered elements of RD
    // op = 24: signed, 25: unsigned
    // Signed division is done on the absolute values. The quotient is negative when
    // the operands have different signs. The remainder gets the sign of the dividend,
    // so that dividend = quotient * divisor + remainder holds for truncated division.
    // On overflow or division by zero the quotient saturates (all ones for unsigned,
    // sign bit only for signed) and the remainder is zero.
    // 64-bit operands are not supported yet and raise INT_WRONG_PARAMETERS.
    // Returns the quotient. The remainder is stored in t->parm[5].
    SNum result;                                 // quotient
    SNum remainder;                              // remainder
    SNum a_lo = t->parm[1];                      // low part of dividend
    SNum b = t->parm[2];                         // divisor
    uint8_t rs = t->operands[4];          // RS indicates length
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
    SNum a_hi;
    a_hi.q = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of dividend
    uint64_t sizemask = dataSizeMask[t->operandType]; // mask for operand size
    uint64_t signbit = (sizemask >> 1) + 1;      // mask indicating sign bit
    bool isUnsigned = t->op & 1;                 // 24: signed, 25: unsigned
    bool overflow = false;
    bool dividendNegative = false;               // sign of a
    bool divisorNegative = false;                // sign of b

    if (!isUnsigned) {                           // convert signed division to unsigned
        if (b.q & signbit) {                     // b is negative. make it positive
            b.q = 0 - b.q;                       // unsigned negation avoids signed overflow
            divisorNegative = true;
        }
        if (a_hi.q & signbit) {                  // a is negative. make it positive
            a_lo.q = 0 - a_lo.q;
            a_hi.q = ~ a_hi.q;
            if ((a_lo.q & sizemask) == 0) a_hi.q++; // carry from low to high part
            dividendNegative = true;
        }
    }
    bool quotientNegative = dividendNegative != divisorNegative;
    // limit data size
    b.q    &= sizemask;
    a_hi.q &= sizemask;
    a_lo.q &= sizemask;
    result.q = 0;
    remainder.q = 0;
    // check for overflow: the unsigned quotient needs more than OS bits, or division by zero
    if (a_hi.q >= b.q || b.q == 0) {
        overflow = true;
    }
    else {
        switch (t->operandType) {
        case 0: // int8
            a_lo.s |= a_hi.s << 8;
            result.s = a_lo.s / b.s;
            remainder.s = a_lo.s % b.s;
            break;
        case 1: // int16
            a_lo.i |= a_hi.i << 16;
            result.i = a_lo.i / b.i;
            remainder.i = a_lo.i % b.i;
            break;
        case 2: // int32
            a_lo.q |= a_hi.q << 32;
            result.q = a_lo.q / b.q;
            remainder.q = a_lo.q % b.q;
            break;
        case 3: // int64
            // to do: implement 128/64 -> 64 division by intrinsic or inline assembly
            // or bit shift method (other methods are too complex)
            // deliberate fall through to the error case
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
        }
    }
    if (!isUnsigned && !overflow) {
        // the magnitude of a signed quotient must fit in OS-1 bits,
        // except that a negative quotient may be exactly -signbit
        uint64_t maxMagnitude = quotientNegative ? signbit : signbit - 1;
        if (result.q > maxMagnitude) overflow = true;
    }
    // apply signs
    if (quotientNegative) result.q = 0 - result.q;
    if (dividendNegative) remainder.q = 0 - remainder.q;
    if (overflow) {
        if (isUnsigned) {   // unsigned overflow
            result.q = sizemask;
            remainder.q = 0;
        }
        else {       // signed overflow
            result.q = signbit;
            remainder.q = 0;
        }
    }
    t->parm[5].q = remainder.q;                  // save remainder
    return result.q;
}
1068
 
1069
static uint64_t f_mul_ex(CThread * t) {
    // extended signed multiply. result uses two consecutive array elements
    // Returns the low part of the product. The high part is stored in t->parm[5].
    // Only allowed with vector operands; raises INT_WRONG_PARAMETERS otherwise
    // and for operand types other than int8 - int64.
    if (!t->vect) {
        t->interrupt(INT_WRONG_PARAMETERS);  return 0;
    }
    SNum result;
    result.q = 0;                                // make sure all 64 bits of the return value are defined
    switch (t->operandType) {
    case 0:   // int8
        result.is = ((int32_t)t->parm[1].bs * (int32_t)t->parm[2].bs);
        t->parm[5].is = result.is >> 8;  // store high part in parm[5]
        break;
    case 1:   // int16
        result.is = ((int32_t)t->parm[1].ss * (int32_t)t->parm[2].ss);
        t->parm[5].is = result.is >> 16;  // store high part in parm[5]
        break;
    case 2:   // int32
        result.qs = ((int64_t)t->parm[1].is * (int64_t)t->parm[2].is);
        t->parm[5].qs = result.qs >> 32;  // store high part in parm[5]
        break;
    case 3:   // int64
        result.qs = mul64_128s(&t->parm[5].q, t->parm[1].qs, t->parm[2].qs); // high part goes directly to parm[5]
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return result.q;
}
1097
 
1098
static uint64_t f_mul_ex_u(CThread * t) {
    // extended unsigned multiply. result uses two consecutive array elements
    // Returns the low part of the product. The high part is stored in t->parm[5].
    // Only allowed with vector operands; raises INT_WRONG_PARAMETERS otherwise
    // and for operand types other than int8 - int64.
    if (!t->vect) {
        t->interrupt(INT_WRONG_PARAMETERS);  return 0;
    }
    SNum result;
    result.q = 0;                                // make sure all 64 bits of the return value are defined
    switch (t->operandType) {
    case 0:   // int8
        result.i = ((uint32_t)t->parm[1].b * (uint32_t)t->parm[2].b);
        t->parm[5].i = result.i >> 8;  // store high part in parm[5]
        break;
    case 1:   // int16
        result.i = ((uint32_t)t->parm[1].s * (uint32_t)t->parm[2].s);
        t->parm[5].i = result.i >> 16;  // store high part in parm[5]
        break;
    case 2:   // int32
        result.q = ((uint64_t)t->parm[1].i * (uint64_t)t->parm[2].i);
        t->parm[5].q = result.q >> 32;  // store high part in parm[5]
        break;
    case 3:   // int64
        result.q = mul64_128u(&t->parm[5].q, t->parm[1].q, t->parm[2].q); // high part goes directly to parm[5]
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return result.q;
}
1126
 
1127
static uint64_t sqrt_ (CThread * t) {
    // square root
    // Integer types: returns the square root rounded down. A negative input gives zero.
    // float and double: a negative input gives a NAN with the code nan_invalid_sqrt.
    // The rounding mode and exception detection are controlled by the mask (parm[3]).
    // Other operand types raise INT_WRONG_PARAMETERS.
    SNum a = t->parm[2];                         // input operand
    SNum result;  result.q = 0;
    uint32_t mask = t->parm[3].i;
    uint8_t operandType = t->operandType;
    bool detectExceptions = (mask & (0xF << MSKI_EXCEPTIONS)) != 0;  // make NAN if exceptions
    bool roundingMode = (mask & (3 << MSKI_ROUNDING)) != 0;  // non-standard rounding mode
    switch (operandType) {
    case 0:   // int8
        if (a.bs >= 0) result.b = (int8_t)sqrtf(a.bs);
        break;
    case 1:   // int16
        if (a.ss >= 0) result.s = (int16_t)sqrtf(a.ss);    // float represents all int16 values exactly
        break;
    case 2:   // int32
        if (a.is >= 0) result.i = (int32_t)sqrt((double)a.is); // double represents all int32 values exactly
        break;
    case 3:   // int64
        if (a.qs >= 0) {
            // double has only 53 bits of precision, so the estimate can be off by one
            // for large inputs. Correct it with exact integer arithmetic.
            // The root is below 2^32, so the squares cannot overflow 64 bits.
            uint64_t root = (uint64_t)sqrt((double)a.qs);
            while (root * root > a.q) root--;
            while ((root + 1) * (root + 1) <= a.q) root++;
            result.q = root;
        }
        break;
    case 5:   // float
        if (a.f < 0) {
            result.q = t->makeNan(nan_invalid_sqrt, operandType);
        }
        else {
            if (detectExceptions) clearExceptionFlags();   // clear previous exceptions
            if (roundingMode) setRoundingMode(mask >> MSKI_ROUNDING);
            result.f = sqrtf(a.f);                         // calculate square root
            if (roundingMode) setRoundingMode(0);
            if (detectExceptions) {
                uint32_t x = getExceptionFlags();          // read exceptions
                if ((mask & (1<<MSK_UNDERFLOW)) && (x & 0x10)) result.q = t->makeNan(nan_underflow, operandType);
                else if ((mask & (1<<MSK_INEXACT)) && (x & 0x20)) result.q = t->makeNan(nan_inexact, operandType);
            }
        }
        break;
    case 6:   // double
        if (a.d < 0) {
            result.q = t->makeNan(nan_invalid_sqrt, operandType);
        }
        else {
            if (detectExceptions) clearExceptionFlags();   // clear previous exceptions
            if (roundingMode) setRoundingMode(mask >> MSKI_ROUNDING);
            result.d = sqrt(a.d);                          // calculate square root
            if (roundingMode) setRoundingMode(0);
            if (detectExceptions) {
                uint32_t x = getExceptionFlags();          // read exceptions
                if ((mask & (1<<MSK_UNDERFLOW)) && (x & 0x10)) result.q = t->makeNan(nan_underflow, operandType);
                else if ((mask & (1<<MSK_INEXACT)) && (x & 0x20)) result.q = t->makeNan(nan_inexact, operandType);
            }
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return result.q;
}
1190
 
1191
static uint64_t add_c (CThread * t) {
    // Add with carry. Vector has two elements.
    // The upper element is used as carry on input and output
    // Bit 0 of the upper element of RS is the carry in.
    // Returns the sum. The carry out (0 or 1) is stored in t->parm[5].
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    uint8_t rs = t->operands[4];          // RS is first input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of first input vector
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    uint64_t carryIn = carry.q & 1;              // carry in
    result.q = a.q + b.q;                        // add
    uint8_t newCarry = (result.q & sizeMask) < (a.q & sizeMask); // carry from a + b
    result.q += carryIn;                         // add carry
    // Adding the carry in wraps around only if the sum becomes zero.
    // This must not be checked when there is no carry in, because 0 + 0 has no carry
    if (carryIn && (result.q & sizeMask) == 0) newCarry = 1;
    t->parm[5].q = newCarry;                     // save new carry
    return result.q;
}
1209
 
1210
static uint64_t sub_b (CThread * t) {
    // Subtract with borrow. Vector has two elements.
    // The upper element is used as borrow on input and output
    // Bit 0 of the upper element of RS is the borrow in.
    // Returns the difference. The borrow out (0 or 1) is stored in t->parm[5].
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    uint8_t rs = t->operands[4];          // RS is first input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of first input vector
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    uint64_t borrowIn = carry.q & 1;             // borrow in
    result.q = a.q - b.q;                        // subtract
    uint8_t newCarry = (result.q & sizeMask) > (a.q & sizeMask); // borrow from a - b
    result.q -= borrowIn;                        // subtract borrow
    // Subtracting the borrow in wraps around only if the difference becomes all ones.
    // This must not be checked when there is no borrow in, because e.g. 0xFF - 0 has no borrow
    if (borrowIn && (result.q & sizeMask) == sizeMask) newCarry = 1;
    t->parm[5].q = newCarry;                     // save new borrow
    return result.q;
}
1228
 
1229
static uint64_t add_ss (CThread * t) {
    // Signed integer addition with saturation.
    // If the sum does not fit in the operand size then the result is the largest
    // positive value when the first operand is positive, or the most negative
    // value when it is negative.
    const uint64_t lhs = t->parm[1].q;           // first operand
    const uint64_t rhs = t->parm[2].q;           // second operand
    const uint64_t widthMask = dataSizeMask[t->operandType]; // all bits of the operand size
    const uint64_t topBit = (widthMask >> 1) + 1;            // position of the sign bit
    const uint64_t sum = lhs + rhs;

    // signed overflow happens only when both operands have the same sign
    // and the sign of the sum is different
    const bool operandsAgree = ((lhs ^ rhs) & topBit) == 0;
    const bool signFlipped   = ((lhs ^ sum) & topBit) != 0;
    if (operandsAgree && signFlipped) {
        const uint64_t isNegative = (lhs & topBit) != 0;
        return (widthMask >> 1) + isNegative;    // INT_MAX, or INT_MIN if negative
    }
    return sum;
}
1243
 
1244
static uint64_t sub_ss (CThread * t) {
    // Signed integer subtraction with saturation.
    // If the difference does not fit in the operand size then the result is the
    // largest positive value when the first operand is positive, or the most
    // negative value when it is negative.
    const uint64_t lhs = t->parm[1].q;           // first operand
    const uint64_t rhs = t->parm[2].q;           // second operand
    const uint64_t widthMask = dataSizeMask[t->operandType]; // all bits of the operand size
    const uint64_t topBit = (widthMask >> 1) + 1;            // position of the sign bit
    const uint64_t difference = lhs - rhs;

    // signed overflow happens only when the operands have different signs
    // and the sign of the difference differs from the sign of the first operand
    const bool operandsDiffer = ((lhs ^ rhs) & topBit) != 0;
    const bool signFlipped    = ((lhs ^ difference) & topBit) != 0;
    if (operandsDiffer && signFlipped) {
        const uint64_t isNegative = (lhs & topBit) != 0;
        return (widthMask >> 1) + isNegative;    // INT_MAX, or INT_MIN if negative
    }
    return difference;
}
1258
 
1259
static uint64_t add_us (CThread * t) {
    // Unsigned integer addition with saturation.
    // The result is the maximum unsigned value if the sum wraps around.
    const uint64_t lhs = t->parm[1].q;           // first operand
    const uint64_t rhs = t->parm[2].q;           // second operand
    const uint64_t widthMask = dataSizeMask[t->operandType]; // all bits of the operand size
    const uint64_t sum = lhs + rhs;

    // a sum smaller than one of its terms means that a carry was lost
    const bool wrapped = (sum & widthMask) < (lhs & widthMask);
    return wrapped ? widthMask : sum;            // UINT_MAX if overflow
}
1271
 
1272
static uint64_t sub_us (CThread * t) {
    // Unsigned integer subtraction with saturation.
    // The result is zero if the difference would be negative.
    const uint64_t lhs = t->parm[1].q;           // first operand
    const uint64_t rhs = t->parm[2].q;           // second operand
    const uint64_t widthMask = dataSizeMask[t->operandType]; // all bits of the operand size
    const uint64_t difference = lhs - rhs;

    // a difference bigger than the first operand means that a borrow was lost
    const bool wrapped = (difference & widthMask) > (lhs & widthMask);
    return wrapped ? 0 : difference;             // zero if underflow
}
1284
 
1285
static uint64_t mul_ss (CThread * t) {
    // multiply integer vectors, signed with saturation
    // If the product does not fit in the operand size then the result is the largest
    // positive value when the operands have the same sign, or the most negative
    // value when they have different signs.
    // Operand types other than int8 - int64 raise INT_WRONG_PARAMETERS.
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    result.q = 0;                                // make sure all 64 bits of the return value are defined
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    uint64_t signBit = (sizeMask >> 1) + 1;      // sign bit

    // multiply with extended precision and check for overflow
    bool overflow = false;
    switch (t->operandType) {
    case 0:  // int8
        result.is = (int32_t)a.bs * (int32_t)b.bs;                        // multiply
        overflow = result.bs != result.is;  break;
    case 1:  // int16
        result.is = (int32_t)a.ss * (int32_t)b.ss;                        // multiply
        overflow = result.ss != result.is;  break;
    case 2:  // int32
        result.qs = (int64_t)a.is * (int64_t)b.is;                        // multiply
        overflow = result.is != result.qs;  break;
    case 3:  // int64
        // The low 64 bits of the product are the same for signed and unsigned
        // multiplication. The unsigned form is used because signed overflow is undefined
        result.q = a.q * b.q;                           // multiply
        // Compare with an approximate floating point product. A wrapped result is
        // off by a multiple of 2^64, which is far more than the rounding errors
        overflow = fabs((double)a.qs * (double)b.qs - (double)result.qs) > 1.E8;
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    if (overflow) {
        result.q = (sizeMask >> 1) + (((a.q ^ b.q) & signBit) != 0);  // INT_MAX or INT_MIN
    }
    return result.q;
}
1317
 
1318
static uint64_t mul_us (CThread * t) {
    // multiply integer vectors, unsigned with saturation
    // The result is the maximum unsigned value if the product does not fit in the operand size.
    // Operand types other than int8 - int64 raise INT_WRONG_PARAMETERS.
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    result.q = 0;                                // make sure all 64 bits of the return value are defined
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size

    // multiply with extended precision and check for overflow
    bool overflow = false;
    switch (t->operandType) {
    case 0:  // int8
        result.i = (uint32_t)a.b * (uint32_t)b.b;                        // multiply
        overflow = result.b != result.i;  break;
    case 1:  // int16
        result.i = (uint32_t)a.s * (uint32_t)b.s;
        overflow = result.s != result.i;  break;
    case 2:  // int32
        result.q = (uint64_t)a.i * (uint64_t)b.i;
        overflow = result.i != result.q;  break;
    case 3:  // int64
        result.q = a.q * b.q;
        // Compare with an approximate floating point product. A wrapped result is
        // off by a multiple of 2^64, which is far more than the rounding errors
        overflow = fabs((double)a.q * (double)b.q - (double)result.q) > 1.E8;
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    if (overflow) {
        result.q = sizeMask;                     // UINT_MAX
    }
    return result.q;
}
1349
 
1350
/*
1351
static uint64_t shift_ss (CThread * t) {
1352
    // Shift left integer vectors, signed with saturation
1353
    SNum a = t->parm[1];                         // input operand
1354
    SNum b = t->parm[2];                         // input operand
1355
    SNum result;
1356
    result.q = a.q << b.i;                       // shift left
1357
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
1358
    uint64_t signBit = (sizeMask >> 1) + 1;      // sign bit
1359
    uint32_t bits1 = bitScanReverse(a.q & sizeMask) + 1;  // number of bits in a
1360
    uint32_t bitsMax = dataSizeTable[t->operandType]; // maximum number of bits if negative
1361
    uint8_t negative = (a.q & signBit) != 0;     // a is negative
1362
    if (!negative) bitsMax--;                    // maximum number of bits if positive
1363
    if ((a.q & sizeMask) != 0 && bits1 + (b.q & sizeMask) > bitsMax) { // overflow
1364
        result.q = (sizeMask >> 1) + negative;   // INT_MAX or INT_MIN
1365
    }
1366
    return result.q;
1367
}
1368
 
1369
static uint64_t shift_us (CThread * t) {
1370
    // Shift left integer vectors, unsigned with saturation
1371
    SNum a = t->parm[1];                         // input operand
1372
    SNum b = t->parm[2];                         // input operand
1373
    SNum result;
1374
    result.q = a.q << b.i;                       // shift left
1375
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
1376
    uint32_t bits1 = bitScanReverse(a.q & sizeMask) + 1;  // number of bits in a
1377
    uint32_t bitsMax = dataSizeTable[t->operandType]; // maximum number of bits
1378
    if ((a.q & sizeMask) != 0 && bits1 + (b.q & sizeMask) > bitsMax) { // overflow
1379
        result.q = sizeMask;                     // UINT_MAX
1380
    }
1381
    return result.q;
1382
} */
1383
 
1384
/*
1385
Instructions with overflow check use the even-numbered vector elements for arithmetic instructions.
1386
Each following odd-numbered vector element is used for overflow detection. If the first source operand
1387
is a scalar then the result operand will be a vector with two elements.
1388
Overflow conditions are indicated with the following bits:
1389
bit 0. Unsigned integer overflow (carry).
1390
bit 1. Signed integer overflow.
1391
The values are propagated so that the overflow result of the operation is OR’ed with the corresponding
1392
values of both input operands. */
1393
 
1394
static uint64_t add_oc (CThread * t) {
    // Add with overflow check.
    // Returns the sum of parm[1] and parm[2]. The overflow flags
    // (bit 0 = unsigned carry, bit 1 = signed overflow) are OR'ed with the
    // flags read from both input vectors and stored in t->parm[5].
    // Operand types >= 4 raise INT_WRONG_PARAMETERS and give a zero result.
    SNum x = t->parm[1];                                  // first addend
    SNum y = t->parm[2];                                  // second addend
    uint8_t rs = t->operands[4];                          // RS is first input vector
    uint8_t rt = t->operands[5];                          // RT is second input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element

    // The element following the current one holds the overflow flags of each input
    SNum flags;
    flags.q  = t->readVectorElement(rs, t->vectorOffset + elementSize);
    flags.q |= t->readVectorElement(rt, t->vectorOffset + elementSize);

    SNum sum;
    if (t->operandType >= 4) {
        // unsupported operand type
        t->interrupt(INT_WRONG_PARAMETERS);
        sum.q = 0;
    }
    else {
        uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
        uint64_t signBit = (sizeMask >> 1) + 1;           // sign bit of the element
        sum.q = x.q + y.q;
        // unsigned overflow: the truncated sum wrapped around below the first addend
        if ((sum.q & sizeMask) < (x.q & sizeMask)) {
            flags.b |= 1;
        }
        // signed overflow: both addends have the same sign and the sum has the opposite sign
        if (~(x.q ^ y.q) & (x.q ^ sum.q) & signBit) {
            flags.b |= 2;
        }
    }
    t->parm[5].q = flags.q & 3;                           // return overflow flags
    return sum.q;                                         // return result
}
1426
 
1427
static uint64_t sub_oc (CThread * t) {
    // Subtract with overflow check.
    // Returns parm[1] - parm[2]. The overflow flags
    // (bit 0 = unsigned borrow, bit 1 = signed overflow) are OR'ed with the
    // flags read from both input vectors and stored in t->parm[5].
    // Operand types >= 4 raise INT_WRONG_PARAMETERS and give a zero result.
    SNum minuend = t->parm[1];                            // first input operand
    SNum subtrahend = t->parm[2];                         // second input operand
    uint8_t rs = t->operands[4];                          // RS is first input vector
    uint8_t rt = t->operands[5];                          // RT is second input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element

    // The element following the current one holds the overflow flags of each input
    SNum flags;
    flags.q  = t->readVectorElement(rs, t->vectorOffset + elementSize);
    flags.q |= t->readVectorElement(rt, t->vectorOffset + elementSize);

    SNum difference;
    if (t->operandType >= 4) {
        // unsupported operand type
        t->interrupt(INT_WRONG_PARAMETERS);
        difference.q = 0;
    }
    else {
        uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
        uint64_t signBit = (sizeMask >> 1) + 1;           // sign bit of the element
        difference.q = minuend.q - subtrahend.q;          // subtract
        // unsigned overflow: the truncated difference wrapped around above the minuend
        if ((difference.q & sizeMask) > (minuend.q & sizeMask)) {
            flags.b |= 1;
        }
        // signed overflow: operands have opposite signs and the result sign differs from the minuend
        if ((minuend.q ^ subtrahend.q) & (minuend.q ^ difference.q) & signBit) {
            flags.b |= 2;
        }
    }
    t->parm[5].q = flags.q & 3;                           // return overflow flags
    return difference.q;                                  // return result
}
1458
 
1459
static uint64_t mul_oc (CThread * t) {
    // Multiply with overflow check.
    // Returns the product of parm[1] and parm[2]. The overflow flags
    // (bit 0 = unsigned overflow, bit 1 = signed overflow) are OR'ed with the
    // flags read from both input vectors and stored in t->parm[5].
    // Operand types >= 4 raise INT_WRONG_PARAMETERS and give a zero result.
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    uint8_t rs = t->operands[4];                 // RS is first input vector
    uint8_t rt = t->operands[5];                 // RT is second input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q  = t->readVectorElement(rs, t->vectorOffset + elementSize);  // overflow flags of first input vector
    carry.q |= t->readVectorElement(rt, t->vectorOffset + elementSize);  // overflow flags of second input vector
    SNum result;
    result.q = 0;                                // defined value for unsupported types and for unused upper bits
    bool signedOverflow = false;
    bool unsignedOverflow = false;

    // Multiply and check for signed and unsigned overflow.
    // The signed check uses the product of the sign-extended operands, and the
    // unsigned check uses the product of the zero-extended operands. The low
    // bits of both products are identical, so the returned result is the same.
    switch (t->operandType) {
    case 0:  // int8
        result.is = (int32_t)a.bs * (int32_t)b.bs;
        signedOverflow = result.bs != result.is;
        unsignedOverflow = (uint32_t)a.b * (uint32_t)b.b > 0xFFu;
        break;
    case 1:  // int16
        result.is = (int32_t)a.ss * (int32_t)b.ss;
        signedOverflow = result.ss != result.is;
        unsignedOverflow = (uint32_t)a.s * (uint32_t)b.s > 0xFFFFu;
        break;
    case 2:  // int32
        result.qs = (int64_t)a.is * (int64_t)b.is;
        signedOverflow = result.qs != result.is;
        unsignedOverflow = (uint64_t)a.i * (uint64_t)b.i > 0xFFFFFFFFu;
        break;
    case 3:  // int64
        // Multiply as unsigned: same bit pattern as the signed product, but
        // wrap-around is well defined instead of undefined behavior.
        result.q = a.q * b.q;
        // Compare with a floating point estimate of the full product. A wrapped
        // result differs from it by a multiple of 2^64, far above the rounding error.
        unsignedOverflow = fabs((double)a.q * (double)b.q - (double)result.q) > 1.E8;
        signedOverflow   = fabs((double)a.qs * (double)b.qs - (double)result.qs) > 1.E8;
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    if (unsignedOverflow) carry.b |= 1;          // unsigned overflow
    if (signedOverflow)   carry.b |= 2;          // signed overflow
    t->parm[5].q = carry.q & 3;                  // return overflow flags
    return result.q;                             // return result
}
1503
 
1504
static uint64_t div_oc (CThread * t) {
    // Signed divide with overflow check.
    // Returns parm[1] / parm[2]. The overflow flags
    // (bit 0 = unsigned overflow, bit 1 = signed overflow) are OR'ed with the
    // flags read from both input vectors and stored in t->parm[5].
    // Division by zero sets both flags; dividing the most negative value by -1
    // sets the signed flag. In both cases the result is the sign bit pattern.
    // Operand types >= 4 raise INT_WRONG_PARAMETERS and give a zero result.
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    uint8_t rs = t->operands[4];                 // RS is first input vector
    uint8_t rt = t->operands[5];                 // RT is second input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q  = t->readVectorElement(rs, t->vectorOffset + elementSize);  // overflow flags of first input vector
    carry.q |= t->readVectorElement(rt, t->vectorOffset + elementSize);  // overflow flags of second input vector
    SNum result;
    result.q = 0;                                // defined value for unsupported types and for unused upper bits

    // to do: rounding mode!

    switch (t->operandType) {
    case 0:  // int8
        if (b.b == 0) {
            result.i = 0x80; carry.b |= 3;       // signed and unsigned overflow
        }
        else if (a.b == 0x80 && b.bs == -1) {
            result.i = 0x80; carry.b |= 2;       // signed overflow
        }
        else result.i = a.bs / b.bs;
        break;
    case 1:  // int16
        if (b.s == 0) {
            result.i = 0x8000; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.s == 0x8000 && b.ss == -1) {
            result.i = 0x8000; carry.b |= 2;     // signed overflow
        }
        else result.i = a.ss / b.ss;
        break;
    case 2:  // int32
        if (b.i == 0) {
            result.i = sign_f; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.i == sign_f && b.is == -1) {
            result.i = sign_f; carry.b |= 2;     // signed overflow
        }
        else result.i = a.is / b.is;
        break;
    case 3:  // int64
        if (b.q == 0) {
            result.q = sign_d; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.q == sign_d && b.qs == int64_t(-1)) {
            result.q = sign_d; carry.b |= 2;     // signed overflow
        }
        else result.qs = a.qs / b.qs;
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    t->parm[5].q = carry.q & 3;                  // return overflow flags
    return result.q;                             // return result
}
1561
 
1562
static uint64_t read_spev (CThread * t) {
    // Placeholder for: read special register RS into vector register RD with length RT.
    // Not implemented yet (to do). Always returns 0.
    (void)t;                                     // thread state is not used yet
    return 0;
}
1567
 
1568
static uint64_t read_call_stack (CThread * t) {
    // Placeholder for: read internal call stack.
    // RD = vector register destination of length RS, RT-RS = internal address.
    // Not implemented yet (to do). Always returns 0.
    (void)t;                                     // thread state is not used yet
    return 0;
}
1572
 
1573
static uint64_t write_call_stack (CThread * t) {
    // Placeholder for: write internal call stack.
    // RD = vector register source of length RS, RT-RS = internal address.
    // Not implemented yet (to do). Always returns 0.
    (void)t;                                     // thread state is not used yet
    return 0;
}
1577
 
1578
static uint64_t read_memory_map (CThread * t) {
    // Placeholder for: read memory map.
    // RD = vector register destination of length RS, RT-RS = internal address.
    // Not implemented yet (to do). Always returns 0.
    (void)t;                                     // thread state is not used yet
    return 0;
}
1582
 
1583
static uint64_t write_memory_map (CThread * t) {
    // Placeholder for: write memory map. RD = vector register.
    // Not implemented yet (to do). Always returns 0.
    (void)t;                                     // thread state is not used yet
    return 0;
}
1587
 
1588
/* Input ports to match soft core
1589
Note: serial input from stdin in windows and Linux is messy. Emulation will have quirks.
1590
 
1591
Input port 8. Serial input:
1592
Read one byte from RS232 serial input. The value is
1593
bit 0-7: Received data (zero if input buffer empty)
1594
bit   8: Data valid. Will be 0 if the input buffer is empty. It will not wait for data if the system allows polling
1595
bit   9: More data ready: The input buffer contains at least one more byte ready to read
1596
bit  12: Buffer overflow error. Data has been lost due to input buffer overflow
1597
bit  13: Frame error. Error detected in start bit or stop bit. May be due to noise or wrong BAUD rate
1598
 
1599
Input port 9. Serial input status:
1600
bit 0-15: Number of bytes currently in input buffer
1601
bit   16: Buffer overflow error. Data has been lost due to input buffer overflow
1602
bit   17: Frame error. Error detected in start bit or stop bit. May be due to noise or wrong BAUD rate
1603
 
1604
Input port 11. Serial output status:
1605
bit 0-15: Number of bytes currently in output buffer
1606
bit   16: Buffer overflow error. Data has been lost due to output buffer overflow
1607
bit   18: Ready. The output buffer has enough space to receive at least one more byte
1608
 
1609
*/
1610
 
1611
static uint64_t input_ (CThread * t) {
    // Read from input port.
    // vector version: RD = vector register, RS = port address, RT = vector length
    // g.p. version: RD = g.p. register, RS = port address, IM1 = port address
    // The port number is taken from the immediate operand, or from the register
    // operand when the immediate is 255.
    // Ports 8 and 9 (serial input and its status) are emulated only in Windows.
    // Port 11 (serial output status) always reads as 0. Any other port, and the
    // vector version, raise INT_WRONG_PARAMETERS and return 0.
    using namespace std;  // some compilers have getchar and putchar in namespace std, some not

    if (t->vect) {
        // vector version not implemented yet
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }

    uint32_t portNumber = t->parm[2].i;                       // immediate operand contains port number
    if (portNumber == 255) portNumber = t->parm[1].i;         // register operand contains port number

#if defined (__WINDOWS__) || defined (_WIN32) || defined (_WIN64)
    if (portNumber == 8) {
        // port 8: read serial input without waiting
        if (!_kbhit()) return 0;                              // input buffer empty
        int ch = _getch();                                    // read character. does not wait for enter
        if (ch < 0) return 0;                                 // error or end of file (EOF = -1)
        return (ch | 0x100);                                  // bit 8 indicates valid data
    }
    if (portNumber == 9) {
        // port 9: read serial input status. Only in systems that allow polling
        return _kbhit();
    }
#endif
    // Other operating systems: there is no portable way of non-blocking read or
    // polling of stdin, so ports 8 and 9 are treated as unknown ports there.
    // to do: implement for Linux using curses.h or something

    if (portNumber == 11) {
        // port 11: get serial output status
        return 0;
    }

    // unknown port
    t->interrupt(INT_WRONG_PARAMETERS);
    return 0;
}
1648
 
1649
/* Output ports to match soft core
1650
Output port 9. Serial input control:
1651
bit    0: Clear buffer. Delete all data currently in the input buffer, and clear error flags
1652
bit    1: Clear error flags but keep data.
1653
          The error bits remain high after an error condition until reset by this or by system reset
1654
 
1655
Output port 10. Serial output:
1656
Write one byte to RS232 serial output.
1657
bit 0-7: Data to write
1658
Other bits are reserved.
1659
 
1660
Output port 11. Serial output control:
1661
bit    0: Clear buffer. Delete all data currently in the output buffer, and clear error flags
1662
bit    1: Clear error flags but keep data.
1663
          The error bits remain high after an error condition until reset by this or by system reset
1664
*/
1665
 
1666
static uint64_t output_ (CThread * t) {
    // Write to output port.
    // vector version: RD = vector register to write, RS = port address, RT = vector length
    // g.p. version: RD = g.p. register to write, RS = port address, IM1 = port address
    // The port number is taken from the immediate operand, or from the register
    // operand when the immediate is 255. The value to write is parm[0].
    // Unknown ports, and the vector version, raise INT_WRONG_PARAMETERS.
    using namespace std;  // some compilers have getchar and putchar in namespace std::, some not

    if (t->vect) {
        // vector version not implemented yet
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }

    uint32_t portNumber = t->parm[2].i;                       // immediate operand contains port number
    if (portNumber == 255) portNumber = t->parm[1].i;         // register operand contains port number
    uint32_t value = t->parm[0].i;                            // value to output

    if (portNumber == 9) {
        // serial input control: clear input buffer (only possible in Windows)
#if defined (__WINDOWS__) || defined (_WIN32) || defined (_WIN64)
        while (_kbhit()) (void)_getch();
#endif
    }
    else if (portNumber == 10) {
        // serial output: write character
        putchar(value);
    }
    else if (portNumber != 11) {
        // port 11 (serial output control) is silently accepted: not possible in
        // most operating systems. Everything else is an unknown port
        t->interrupt(INT_WRONG_PARAMETERS);
    }

    t->running = 2;  // don't save to register RD
    return 0;
}
1696
 
1697
 
1698
// tables of single format instructions
1699
// Format 1.0 A. Three general purpose registers
1700
// Dispatch table for format 1.0 A. There are currently no instructions with
// this format, so every entry is a null pointer.
PFunc funcTab4[64] = {
    0                                            // 0 - 63: all entries are zero
};
1703
 
1704
// Format 1.1 C. One general purpose register and a 16 bit immediate operand. int64
1705
// Dispatch table for format 1.1 C. Entries not listed are zero-initialized.
PFunc funcTab5[64] = {
    move_16s,     move_16s,     0,            move_16u,       // 0 - 3
    shifti1_move, shifti1_move, f_add,        0,              // 4 - 7
    f_mul,        0,            shifti1_add,  shifti1_add,    // 8 - 11
    shifti1_and,  shifti1_and,  shifti1_or,   shifti1_or,     // 12 - 15
    shifti1_xor,  shifti1_xor,  shift16_add                   // 16 - 18
};
1710
 
1711
 
1712
// Format 1.2 A. Three vector register operands
1713
// Dispatch table for format 1.2 A. A zero entry means no instruction is assigned.
PFunc funcTab6[64] = {
    get_len,         get_len,          set_len,         set_len,           // 0 - 3
    insert_,         extract_,         broad_,          0,                 // 4 - 7
    compress_sparse, expand_sparse,    0,               0,                 // 8 - 11
    bits2bool,       0,                0,               0,                 // 12 - 15
    shift_expand,    shift_reduce,     shift_up,        shift_down,        // 16 - 19
    0,               0,                0,               0,                 // 20 - 23
    div_ex,          div_ex,           f_mul_ex,        f_mul_ex_u,        // 24 - 27
    sqrt_,           0,                0,               0,                 // 28 - 31
    add_ss,          add_us,           sub_ss,          sub_us,            // 32 - 35
    mul_ss,          mul_us,           add_oc,          sub_oc,            // 36 - 39
    mul_oc,          div_oc,           add_c,           sub_b,             // 40 - 43
    0,               0,                0,               0,                 // 44 - 47
    0,               0,                0,               0,                 // 48 - 51
    0,               0,                0,               0,                 // 52 - 55
    read_spev,       0,                read_call_stack, write_call_stack,  // 56 - 59
    read_memory_map, write_memory_map, input_,          output_            // 60 - 63
};
1723
 
1724
 
1725
// Format 1.8 B. Two general purpose registers and an 8-bit immediate operand. int64
1726
// Dispatch table for format 1.8 B. A zero entry means no instruction is assigned.
PFunc funcTab9[64] = {
    abs_64,            shifti_add,         bitscan_,  roundp2,     // 0 - 3
    popcount_,         0,                  0,         0,           // 4 - 7
    0,                 0,                  0,         0,           // 8 - 11
    0,                 0,                  0,         0,           // 12 - 15
    0,                 0,                  0,         0,           // 16 - 19
    0,                 0,                  0,         0,           // 20 - 23
    0,                 0,                  0,         0,           // 24 - 27
    0,                 0,                  0,         0,           // 28 - 31
    read_spec,         write_spec,                                 // 32 - 33
    read_capabilities, write_capabilities,                         // 34 - 35
    read_perf,         read_perf,          read_sys,  write_sys,   // 36 - 39
    0,                 0,                  0,         0,           // 40 - 43
    0,                 0,                  0,         0,           // 44 - 47
    0,                 0,                  0,         0,           // 48 - 51
    0,                 0,                  0,         0,           // 52 - 55
    push_r,            pop_r,              0,         0,           // 56 - 59
    0,                 0,                  input_,    output_      // 60 - 63
};
1736
 
1737
// Format 2.9 A. Three general purpose registers and a 32-bit immediate operand
1738
// Dispatch table for format 2.9 A. Entries not listed are zero-initialized.
PFunc funcTab12[64] = {
    move_hi32, insert_hi32,  add_32u, sub_32u,   // 0 - 3
    add_hi32,  and_hi32,     or_hi32, xor_hi32,  // 4 - 7
    0,         replace_bits, 0,       0,         // 8 - 11
    0,         0,            0,       0,         // 12 - 15
    0,         0,            0,       0,         // 16 - 19
    0,         0,            0,       0,         // 20 - 23
    0,         0,            0,       0,         // 24 - 27
    0,         0,            0,       0,         // 28 - 31
    address_                                     // 32
};

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.