URL https://opencores.org/ocsvn/forwardcom/forwardcom/trunk
Subversion Repositories forwardcom

[/] [forwardcom/] [bintools/] [emulator4.cpp] - Blame information for rev 139

Go to most recent revision | Details | Compare with Previous | View Log

﻿/****************************  emulator4.cpp  ********************************
* Author:        Agner Fog
* date created:  2018-02-18
* Last modified: 2021-08-05
* Version:       1.11
* Project:       Binary tools for ForwardCom instruction set
* Description:
* Emulator: Execution functions for single format instructions, part 1
*
* Copyright 2018-2021 GNU General Public License http://www.gnu.org/licenses
*****************************************************************************/
 
#include "stdafx.h"
 
 
// Format 1.0 A. Three general purpose registers
 
// Currently no instructions with format 1.0
 
 
// Format 1.1 C. One general purpose register and a 16 bit immediate operand. int64
 
static uint64_t move_16s(CThread * t) {
    // Move a 16-bit sign-extended constant to a general purpose register.
    // The result is the full 64-bit view of the immediate operand held in parm[2].
    const uint64_t constant = t->parm[2].q;
    return constant;
}
 
static uint64_t move_16u(CThread * t) {
    // Move a 16-bit zero-extended constant to a general purpose register.
    // Only the 16-bit member of the immediate operand is used; widening it zero-extends.
    const uint64_t constant = t->parm[2].s;
    return constant;
}
 
static uint64_t shift16_add(CThread * t) {
    // Shift the 16-bit unsigned constant left by 16 and add.
    // The shifted constant is written back into parm[2] so that the common
    // add function f_add can be reused for the addition.
    const uint64_t shifted = t->parm[2].q << 16;
    t->parm[2].q = shifted;
    return f_add(t);
}
 
static uint64_t shifti1_move(CThread * t) {
    // RD = IM2 << IM1.
    // The low byte of parm[2] is the unsigned shift count IM1; the bits above it
    // hold IM2, which is recovered (sign-extended) by an arithmetic shift right by 8.
    const int64_t im2 = t->parm[2].qs >> 8;
    const uint8_t im1 = t->parm[2].b;
    return im2 << im1;
}
 
static uint64_t shifti1_add(CThread * t) {
    // RD += IM2 << IM1.
    // The low byte of parm[2] is the unsigned shift count IM1; the bits above it
    // hold the sign-extended IM2. The shifted constant replaces parm[2] and the
    // addition itself is delegated to f_add.
    const int64_t im2 = t->parm[2].qs >> 8;
    const uint8_t im1 = t->parm[2].b;
    t->parm[2].q = im2 << im1;
    return f_add(t);
}
 
static uint64_t shifti1_and(CThread * t) {
    // RD &= IM2 << IM1.
    // IM1 is the low byte of parm[2]; IM2 is the sign-extended value above it.
    const int64_t im2 = t->parm[2].qs >> 8;
    const uint8_t im1 = t->parm[2].b;
    return t->parm[1].q & (im2 << im1);
}
 
static uint64_t shifti1_or(CThread * t) {
    // RD |= IM2 << IM1.
    // IM1 is the low byte of parm[2]; IM2 is the sign-extended value above it.
    const int64_t im2 = t->parm[2].qs >> 8;
    const uint8_t im1 = t->parm[2].b;
    return t->parm[1].q | (im2 << im1);
}
 
static uint64_t shifti1_xor(CThread * t) {
    // RD ^= IM2 << IM1.
    // IM1 is the low byte of parm[2]; IM2 is the sign-extended value above it.
    const int64_t im2 = t->parm[2].qs >> 8;
    const uint8_t im1 = t->parm[2].b;
    return t->parm[1].q ^ (im2 << im1);
}
 
// Format 1.8 B. Two general purpose registers and an 8-bit immediate operand. int64
 
static uint64_t abs_64(CThread * t) {
    // Absolute value of a signed integer of the current operand size.
    // The only input that overflows is the most negative value (sign bit set, all
    // other bits clear). For that input, IM1 with bit 2 cleared selects the result:
    //   0: wrap around (input returned unchanged), 1: saturate, 2: zero;
    // any other value raises INT_WRONG_PARAMETERS.
    // Bit 2 of IM1 additionally raises INT_OVERFL_SIGN on overflow.
    SNum value = t->parm[1];
    const uint64_t mask = dataSizeMask[t->operandType];   // all bits of the operand size
    const uint64_t topBit = (mask >> 1) + 1;              // sign bit of the operand size
    const uint8_t  im1 = t->parm[2].b;

    if ((value.q & mask) == topBit) {
        // most negative value: no positive counterpart exists
        if (im1 & 4) t->interrupt(INT_OVERFL_SIGN);
        const uint8_t mode = im1 & ~4;
        if (mode == 0) return value.q;                    // wrap around
        if (mode == 1) return mask >> 1;                  // saturate to largest positive value
        if (mode == 2) return 0;                          // zero
        t->interrupt(INT_WRONG_PARAMETERS);               // unknown mode; continue below
    }
    if (value.q & topBit) {
        value.qs = - value.qs;                            // negative input: change sign
    }
    return value.q;
}
 
static uint64_t shifti_add(CThread * t) {
    // Shift and add. RD += RS << IM1
    // parm[0] is the value added to, parm[1] the value to shift, parm[2] the shift count.
    // A shift count at or above the operand size contributes zero.
    // Overflow detection (numContr overflow mask bits) is not implemented here.
    SNum a = t->parm[0];
    SNum b = t->parm[1];
    SNum c = t->parm[2];
    uint8_t nbits = dataSizeTableBits[t->operandType];
    uint64_t shifted = 0;                        // out-of-range shift gives zero
    // Check the count BEFORE shifting: shifting a 64-bit value by 64 or more is
    // undefined behavior in C++, so the shift must not be evaluated in that case.
    if (c.q < nbits && c.q < 64) {
        shifted = b.q << c.q;                    // shift left
    }
    return a.q + shifted;                        // add
}
 
uint64_t bitscan_ (CThread * t) {
    // Bit scan forward or reverse: find the index of the first or last set bit in RS.
    // IM1 bit 0:    0 = forward scan, 1 = reverse scan.
    // IM1 bit 4:    result for a zero input: 0 = return 0, 1 = return -1 (all ones).
    const uint8_t options = t->parm[2].b;
    const uint64_t bits = t->parm[1].q & dataSizeMask[t->operandType]; // limit to operand size
    uint64_t index;
    if (bits == 0) {
        // nothing to find
        index = (options & 0x10) ? ~(uint64_t)0 : 0;
    }
    else if (options & 1) {
        index = bitScanReverse(bits);
    }
    else {
        index = bitScanForward(bits);
    }
    return index;
}
 
static uint64_t roundp2(CThread * t) {
    // Round up or down to the nearest power of 2.
    // IM1 bit 0:  0 = round down, 1 = round up.
    // IM1 bit 4:  result for a zero input: 0 = return 0, 1 = return -1.
    // IM1 bit 5:  result when rounding up overflows: 0 = return 0, 1 = return -1.
    // Operand sizes above 8 bytes raise INT_WRONG_PARAMETERS.
    const uint8_t options = t->parm[2].b;
    const uint64_t value = t->parm[1].q & dataSizeMask[t->operandType]; // drop unused bits
    if (dataSizeTable[t->operandType] > 8) t->interrupt(INT_WRONG_PARAMETERS);

    if (value == 0) {
        return (options & 0x10) ? ~(uint64_t)0 : 0;
    }
    if ((value & (value - 1)) == 0) {
        return value;                            // already a power of 2
    }
    const uint32_t top = bitScanReverse(value);  // index of highest set bit
    if (!(options & 1)) {
        return (uint64_t)1 << top;               // round down
    }
    if (top + 1 >= dataSizeTableBits[t->operandType]) {
        // next power of 2 does not fit in the operand size
        return (options & 0x20) ? ~(uint64_t)0 : 0;
    }
    return (uint64_t)1 << (top + 1);             // round up
}
 
static uint32_t popcount32(uint32_t x) {
    // Count the 1-bits in a 32-bit integer. Used by the popcount_ function.
    // Bits are summed in parallel: first per 2-bit pair, then per nibble, then per
    // byte; multiplying by 0x01010101 adds the four byte counts into the top byte.
    uint32_t pairs   = x - ((x >> 1) & 0x55555555);
    uint32_t nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
    uint32_t bytes   = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F;
    return (bytes * 0x01010101u) >> 24;
}
 
uint64_t popcount_ (CThread * t) {
    // Count the number of bits in RS that are 1.
    // The operand is first limited to the current operand size, then the two
    // 32-bit halves are counted separately with popcount32 and added.
    SNum operand = t->parm[1];
    operand.q &= dataSizeMask[t->operandType];
    const uint32_t lowCount  = popcount32(operand.i);
    const uint32_t highCount = popcount32(operand.q >> 32);
    return lowCount + highCount;
}
 
static uint64_t read_spec(CThread * t) {
    // Read special register RS into g. p. register RD.
    // Supported registers: numcontr, threadp and datap. Any other register number
    // raises INT_WRONG_PARAMETERS and reads as zero.
    const uint8_t source = t->operands[4];
    if (source == (REG_NUMCONTR & 0x1F)) {
        return t->numContr;
    }
    if (source == (REG_THREADP & 0x1F)) {
        return t->threadp;
    }
    if (source == (REG_DATAP & 0x1F)) {
        return t->datap;
    }
    t->interrupt(INT_WRONG_PARAMETERS);          // register not implemented
    return 0;
}
 
static uint64_t write_spec(CThread * t) {
    // Write g. p. register RS to special register RD.
    // Supported registers: numcontr, threadp and datap. Any other register number
    // raises INT_WRONG_PARAMETERS. No g. p. result is produced (returnType = 0).
    const uint8_t target = t->operands[0];
    SNum source = t->parm[1];

    if (target == (REG_NUMCONTR & 0x1F)) {
        t->numContr = source.i | 1;              // bit 0 must always be set
        const bool subnormalChanged =
            ((t->numContr ^ t->lastMask) & (1<<MSK_SUBNORMAL)) != 0;
        if (subnormalChanged) {
            // propagate the new subnormal setting
            enableSubnormals(t->numContr & (1<<MSK_SUBNORMAL));
        }
        t->lastMask = t->numContr;               // remember for the next comparison
    }
    else if (target == (REG_THREADP & 0x1F)) {
        t->threadp = source.q;
    }
    else if (target == (REG_DATAP & 0x1F)) {
        t->datap = source.q;
    }
    else {
        t->interrupt(INT_WRONG_PARAMETERS);      // register not implemented
    }

    t->returnType = 0;
    return 0;
}
 
static uint64_t read_capabilities(CThread * t) {
    // Read a capabilities register into g. p. register RD.
    // A register number outside the implemented range raises
    // INT_WRONG_PARAMETERS and reads as zero.
    const uint8_t index = t->operands[4];
    if (index >= number_of_capability_registers) {
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }
    return t->capabilyReg[index];
}
 
static uint64_t write_capabilities(CThread * t) {
    // Write a g. p. register to capabilities register RD.
    // A register number outside the implemented range raises INT_WRONG_PARAMETERS
    // and nothing is stored. No g. p. result is produced (returnType = 0).
    const uint8_t index = t->operands[0];
    const uint64_t newValue = t->parm[1].q;
    if (index >= number_of_capability_registers) {
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    else {
        t->capabilyReg[index] = newValue;
    }
    t->returnType = 0;
    return 0;
}
 
static uint64_t read_perf(CThread * t) {
    // Read performance counter
    uint8_t parfreg = t->operands[4];    // performance register number
    uint8_t par2 = t->parm[2].b;         // second operand
    uint64_t result = 0;
    switch (parfreg) {
    case 0:  // reset all performance counters
        if (par2 & 1) {
            t->perfCounters[perf_cpu_clock_cycles] = 0;
        }
        if (par2 & 2) {
            t->perfCounters[perf_instructions] = 0;
            t->perfCounters[perf_2size_instructions] = 0;
            t->perfCounters[perf_3size_instructions] = 0;
            t->perfCounters[perf_gp_instructions] = 0;
            t->perfCounters[perf_gp_instructions_mask0] = 0;
        }
        if (par2 & 4) {
            t->perfCounters[perf_vector_instructions] = 0;
        }
        if (par2 & 8) {
            t->perfCounters[perf_control_transfer_instructions] = 0;
            t->perfCounters[perf_direct_jumps] = 0;
            t->perfCounters[perf_indirect_jumps] = 0;
            t->perfCounters[perf_cond_jumps] = 0;
        }
        break;
 
    case 1:  // CPU clock cycles
        result = t->perfCounters[perf_cpu_clock_cycles];
        if (par2 == 0) t->perfCounters[perf_cpu_clock_cycles] = 0;
        break;
 
    case 2:  // number of instructions
        switch (par2) {
        case 0:
            result = t->perfCounters[perf_instructions];
            t->perfCounters[perf_instructions] = 0;
            t->perfCounters[perf_2size_instructions] = 0;
            t->perfCounters[perf_3size_instructions] = 0;
            t->perfCounters[perf_gp_instructions] = 0;
            t->perfCounters[perf_gp_instructions_mask0] = 0;
            break;
        case 1:
            result = t->perfCounters[perf_instructions];
            break;
        case 2:
            result = t->perfCounters[perf_2size_instructions];
            break;
        case 3:
            result = t->perfCounters[perf_3size_instructions];
            break;
        case 4:
            result = t->perfCounters[perf_gp_instructions];
            break;
        case 5:
            result = t->perfCounters[perf_gp_instructions_mask0];
            break;
        }
        break;
 
    case 3:  // number of vector instructions
        result = t->perfCounters[perf_vector_instructions];
        if (par2 == 0) t->perfCounters[perf_vector_instructions] = 0;
        break;
 
    case 4:  // vector registers in use
        for (int iv = 0; iv < 32; iv++) {
            if (t->vectorLength[iv] > 0) result |= (uint64_t)1 << iv;
        }
        break;
 
    case 5:  // jumps, calls, and returns
        switch (par2) {
        case 0:
            result = t->perfCounters[perf_control_transfer_instructions];
            t->perfCounters[perf_control_transfer_instructions] = 0;
            t->perfCounters[perf_direct_jumps] = 0;
            t->perfCounters[perf_indirect_jumps] = 0;
            t->perfCounters[perf_cond_jumps] = 0;
            break;
        case 1:    // all jumps, calls, returns
            result = t->perfCounters[perf_control_transfer_instructions];
            break;
        case 2:    // direct unconditional jumps, calls, returns
            result = t->perfCounters[perf_direct_jumps];
            break;
        case 3:
            result = t->perfCounters[perf_indirect_jumps];
            break;
        case 4:
            result = t->perfCounters[perf_cond_jumps];
            break;
        }
        break;
    case 16:  // errors counters
        switch (par2) {
        case 0:
            result = 0;
            t->perfCounters[perf_unknown_instruction] = 0;
            t->perfCounters[perf_wrong_operands] = 0;
            t->perfCounters[perf_array_overflow] = 0;
            t->perfCounters[perf_read_violation] = 0;
            t->perfCounters[perf_write_violation] = 0;
            t->perfCounters[perf_misaligned] = 0;
            t->perfCounters[perf_address_of_first_error] = 0;
            t->perfCounters[perf_type_of_first_error] = 0;
            break;
        case 1:    // unknown instructions
            result = t->perfCounters[perf_unknown_instruction];
            break;
        case 2:    // wrong operands for instruction
            result = t->perfCounters[perf_wrong_operands];
            break;
        case 3:    // array index out of bounds
            result = t->perfCounters[perf_array_overflow];
            break;
        case 4:    // memory read access violation
            result = t->perfCounters[perf_read_violation];
            break;
        case 5:    // memory write access violation
            result = t->perfCounters[perf_write_violation];
            break;
        case 6:    // memory access misaligned
            result = t->perfCounters[perf_misaligned];
            break;
        case 62:   // address of first error
            result = t->perfCounters[perf_address_of_first_error];
            break;
        case 63:   // type of first error
            result = t->perfCounters[perf_type_of_first_error];
            break;
        }
 
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
 
    return result;
}
 
static uint64_t read_sys(CThread * t) {
    // Read system register RS into g. p. register RD.
    // Not supported yet: every call raises INT_WRONG_PARAMETERS and reads as zero.
    const uint64_t noValue = 0;
    t->interrupt(INT_WRONG_PARAMETERS);
    return noValue;
}
 
static uint64_t write_sys(CThread * t) {
    // Write g. p. register RS to system register RD.
    // Not supported yet: every call raises INT_WRONG_PARAMETERS.
    // No g. p. result is produced (returnType = 0).
    const uint64_t noValue = 0;
    t->interrupt(INT_WRONG_PARAMETERS);
    t->returnType = 0;
    return noValue;
}
 
static uint64_t push_r(CThread * t) {
    // Push one or more g.p. registers on a stack pointed to by RD.
    // Registers from operands[4] up to the register number in the low 5 bits of
    // parm[4] are stored in ascending order. The pointer is adjusted by the operand
    // size BEFORE each store: downwards by default, upwards if bit 7 of parm[4] is set.
    // Returns the final pointer value, which is also written back to the pointer register.
    const uint8_t pointerReg = t->operands[0] & 0x1F;
    const uint8_t firstReg   = t->operands[4] & 0x1F;
    const uint8_t lastReg    = t->parm[4].i & 0x1F;

    int32_t delta = dataSizeTable[t->operandType];
    if ((t->parm[4].i & 0x80) == 0) delta = -delta;   // default direction is downwards

    uint64_t stackPointer = t->registers[pointerReg];
    uint8_t current = firstReg;
    while (current <= lastReg) {
        stackPointer += (int64_t)delta;
        const uint64_t pushed = t->registers[current];
        t->writeMemoryOperand(pushed, stackPointer);
        t->listResult(pushed);
        current++;
    }
    t->registers[pointerReg] = stackPointer;
    return stackPointer;
}
 
static uint64_t pop_r(CThread * t) {
    // Pop one or more g.p. registers from a stack pointed to by RD.
    // Registers from the register number in the low 5 bits of parm[4] down to
    // operands[4] are loaded in descending order. The pointer is adjusted by the
    // operand size AFTER each load: upwards by default, downwards if bit 7 of
    // parm[4] is set.
    // Returns the final pointer value, which is also written back to the pointer register.
    int32_t step = dataSizeTable[t->operandType];
    if (t->parm[4].i & 0x80) step = -step;
    uint8_t reg0 = t->operands[0] & 0x1F;   // pointer register
    uint8_t reg1 = t->operands[4] & 0x1F;   // first register of the range (popped last)
    uint8_t reglast = t->parm[4].i & 0x1F;  // last register of the range (popped first)
    uint64_t pointer = t->registers[reg0];
    // Loop through registers to pop in reverse order.
    // The loop counter must be a signed int: with an unsigned 8-bit counter the
    // condition reg >= reg1 is always true when reg1 is 0, so the counter would wrap
    // from 0 to 255 and index far outside the register file.
    for (int reg = reglast; reg >= (int)reg1; reg--) {
        uint64_t value = t->readMemoryOperand(pointer);
        t->registers[reg] = value;
        pointer += (int64_t)step;
        t->listResult(value);
    }
    t->registers[reg0] = pointer;
    return pointer;
}
 
 
// Format 2.9 A. Three general purpose registers and a 32-bit immediate operand
 
static uint64_t move_hi32(CThread * t) {
    // Load a 32-bit constant into the high part of a general purpose register.
    // The low part is zero: RD = IM2 << 32.
    const uint64_t constant = t->parm[2].q;
    return constant << 32;
}
 
static uint64_t insert_hi32(CThread * t) {
    // Insert a 32-bit constant into the high part of a general purpose register,
    // leaving the low part unchanged.
    const uint64_t highPart = t->parm[2].q << 32;   // constant moved to the upper half
    const uint64_t lowPart  = t->parm[1].i;         // lower 32 bits of the register operand
    return highPart | lowPart;
}
 
static uint64_t add_32u(CThread * t) {
    // Add a zero-extended 32-bit constant to a general purpose register.
    // The constant is widened in place in parm[2]; f_add performs the addition.
    const uint64_t zeroExtended = t->parm[2].i;
    t->parm[2].q = zeroExtended;
    return f_add(t);
}
 
static uint64_t sub_32u(CThread * t) {
    // Subtract a zero-extended 32-bit constant from a general purpose register.
    // The constant is widened in place in parm[2]; f_sub performs the subtraction.
    const uint64_t zeroExtended = t->parm[2].i;
    t->parm[2].q = zeroExtended;
    return f_sub(t);
}
 
static uint64_t add_hi32(CThread * t) {
    // Add a 32-bit constant to the high part of a general purpose register.
    // RD = RT + (IM2 << 32). The shifted constant replaces parm[2]; f_add adds it.
    const uint64_t shifted = t->parm[2].q << 32;
    t->parm[2].q = shifted;
    return f_add(t);
}
 
static uint64_t and_hi32(CThread * t) {
    // AND the high part of a general purpose register with a 32-bit constant.
    // RD = RT & (IM2 << 32).
    const uint64_t highConstant = t->parm[2].q << 32;
    return t->parm[1].q & highConstant;
}
 
static uint64_t or_hi32(CThread * t) {
    // OR the high part of a general purpose register with a 32-bit constant.
    // RD = RT | (IM2 << 32).
    const uint64_t highConstant = t->parm[2].q << 32;
    return t->parm[1].q | highConstant;
}
 
static uint64_t xor_hi32(CThread * t) {
    // XOR the high part of a general purpose register with a 32-bit constant.
    // RD = RT ^ (IM2 << 32).
    const uint64_t highConstant = t->parm[2].q << 32;
    return t->parm[1].q ^ highConstant;
}
 
static uint64_t replace_bits(CThread * t) {
    // Replace a group of contiguous bits in RT by a specified constant.
    // The 32-bit immediate in parm[2] is split into three fields:
    //   bits  0-15: value to insert
    //   bits 16-23: start position of the bit field
    //   bits 24-31: number of bits to replace
    // INT_WRONG_PARAMETERS is raised if the field is wider than 32 bits or does not
    // fit inside a 64-bit register. Execution continues after the interrupt call,
    // so the shifts below are guarded: shifting a 64-bit value by 64 or more is
    // undefined behavior in C++.
    SNum a = t->parm[1];
    SNum b = t->parm[2];
    uint64_t val = b.s;                          // value to insert
    uint8_t  pos = uint8_t(b.i >> 16);           // start position
    uint8_t  num = uint8_t(b.i >> 24);           // number of bits to replace
    if (num > 32 || pos + num > 64) t->interrupt(INT_WRONG_PARAMETERS);
    if (pos >= 64) return a.q;                   // field lies entirely outside the register: nothing to replace
    uint64_t mask = (num >= 64) ? ~(uint64_t)0   // all bits selected
        : ((uint64_t)1 << num) - 1;              // mask with 'num' 1-bits
    return (a.q & ~(mask << pos)) | ((val & mask) << pos);
}
 
static uint64_t address_(CThread * t) {
    // RD = RT + IM2, RS can be THREADP (28), DATAP (29) or IP (30).
    // No arithmetic is done here: the result is the memory address already
    // stored in t->memAddress.
    const uint64_t address = t->memAddress;
    t->returnType = 0x13;
    return address;
}
 
// Format 1.2 A. Three vector register operands
 
static uint64_t set_len(CThread * t) {
    // RD = vector register RS with length changed to value of g.p. register RT
    // set_len: the new length is indicated in bytes
    // set_num: the new length is indicated in elements
    // The new length is limited to MaxVectorLength. If the vector grows, the added
    // part is filled with zeroes. The result is stored directly in the vector
    // register file, so the caller is told not to save a return value.
    uint8_t  rd = t->operands[0];
    uint8_t  rs = t->operands[4];
    uint8_t  rt = t->operands[5];
    uint32_t oldLength = t->vectorLength[rs];
    uint64_t newLength = t->registers[rt];
    if (t->op & 1) newLength *= dataSizeTable[t->operandType];  // set_num: multiply by operand size
    if (newLength > t->MaxVectorLength) newLength = t->MaxVectorLength;
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength;  // address of RD data
    int8_t * source = t->vectors.buf() + rs*t->MaxVectorLength;       // address of RS data
    // number of bytes taken over from RS: the smaller of old and new length
    size_t copyLength = (newLength > oldLength) ? size_t(oldLength) : size_t(newLength);
    if (rd != rs) {
        // memcpy must not be called with overlapping buffers. When RD and RS are the
        // same register the data are already in place and no copy is needed.
        memcpy(destination, source, copyLength);
    }
    if (newLength > oldLength) {
        memset(destination + oldLength, 0, size_t(newLength - oldLength));  // set the rest to zero
    }
    t->vectorLength[rd] = (uint32_t)newLength;             // set new length
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    return 0;
}
 
static uint64_t get_len(CThread * t) {
    // Get the length of the vector register in operands[4] into general purpose register RD
    // get_len: get the length in bytes
    // get_num: get the length in elements (bit 0 of the opcode set)
    const uint8_t destReg = t->operands[0];
    const uint8_t vectorReg = t->operands[4];
    uint32_t len = t->vectorLength[vectorReg];
    if (t->op & 1) {
        // get_num: divide by the operand size, rounding down
        len = len >> dataSizeTableLog[t->operandType];
    }
    t->registers[destReg] = len;       // result goes to a g.p. register, not a vector register
    t->running = 2;                    // don't save to vector register RD
    t->vect = 4;                       // stop vector loop
    t->returnType = 0x12;              // debug return output
    return len;
}
 
uint64_t insert_(CThread * t) {
    // Replace one element in vector RD, starting at offset index*OS, with the
    // first element of the source vector. Called once per element position:
    // the current position is t->vectorOffset.
    //   format 1.2A:  v1 = insert(v1, v2, r3)   index taken from g.p. register
    //   format 1.3B:  v1 = insert(v1, v2, imm)  index taken from immediate operand
    // For 128-bit elements the high half of the result is placed in parm[5].
    const uint8_t destReg = t->operands[3];                      // source and destination register
    const uint8_t elemSizeLog = dataSizeTableLog[t->operandType]; // log2(element size)
    const bool    is128 = (elemSizeLog == 4);
    t->vectorLengthR = t->vectorLength[destReg];
    const uint8_t srcReg = t->operands[4];                       // vector supplying the new element

    uint64_t insertPos;                                          // byte offset of the element to replace
    if (t->fInstr->format2 == 0x120) {
        insertPos = t->registers[t->operands[5]] << elemSizeLog;
    }
    else {
        insertPos = t->parm[2].q << elemSizeLog;
    }

    if (insertPos != t->vectorOffset) {
        // not the selected position: keep the existing element of RD
        if (is128) {
            t->parm[5].q = t->readVectorElement(destReg, t->vectorOffset + 8);
        }
        return t->parm[0].q;
    }
    // selected position: take the first element of the source vector
    if (is128) {
        t->parm[5].q = t->readVectorElement(srcReg, 8);
    }
    return t->readVectorElement(srcReg, 0);
}
 
uint64_t extract_(CThread * t) {
    // Extract one element with size OS from the source vector in operands[4], at
    // byte offset index*OS, and broadcast it into vector register RD.
    //   format 0x120: the index is the value of the g.p. register in operands[5]
    //   otherwise (format 0x130): the index is the immediate operand in parm[4]
    // The result is zero if the offset is beyond the end of the source vector.
    // The destination gets the same length as the source vector.
    // For 128-bit elements the high half of the element is placed in parm[5].
    uint8_t  rd = t->operands[0];                          // destination register
    uint8_t  operandType = t->operandType;                 // operand type
    uint8_t  dsizelog = dataSizeTableLog[operandType];     // log2(elementsize)
    uint8_t  rsource = t->operands[4];                     // source vector
    uint64_t pos;                                          // position = index * OS
    if (t->fInstr->format2 == 0x120) {
        uint8_t  rt = t->operands[5];                      // index register
        pos = t->registers[rt] << dsizelog;
    }
    else {  // format 0x130
        pos = t->parm[4].q << dsizelog;
    }
    uint32_t sourceLength = t->vectorLength[rsource];      // length of source vector
    uint64_t result = 0;                                   // zero if beyond end of source vector
    if (pos < sourceLength) {
        const int8_t * source = t->vectors.buf() + (uint64_t)rsource * t->MaxVectorLength; // address of rsource data
        // Always read 8 bytes; the surplus is cut off later if the operand size is < 64 bits.
        // memcpy is used instead of a pointer cast because the position is only
        // aligned to the element size, not necessarily to 8 bytes.
        memcpy(&result, source + pos, sizeof(result));
        if (dsizelog >= 4) {                               // 128 bits
            uint64_t high;
            memcpy(&high, source + pos + 8, sizeof(high));
            t->parm[5].q = high;                           // store high part of 128 bit element
        }
    }
    t->vectorLengthR = sourceLength;                       // length of destination vector
    t->vectorLength[rd] = t->vectorLengthR;
    return result;
}
 
 
 
static uint64_t compress_sparse(CThread * t) {
    // Compress sparse vector elements indicated by mask bits into contiguous vector.
    // Every element of the source vector (operands[5]) whose mask element
    // (operands[1]) has bit 0 set is copied to the next free position in RD.
    // The length of RD becomes the total size of the copied elements.
    // The result is written directly to the vector register file.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[5];         // source vector
    uint8_t  rm = t->operands[1];         // mask vector
    uint32_t sourceLength = t->vectorLength[rt]; // length of source vector
    uint32_t maskLength = t->vectorLength[rm];   // length of mask vector
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
    int8_t * source = t->vectors.buf() + rt*t->MaxVectorLength;      // address of RT data
    int8_t * masksrc = t->vectors.buf() + rm*t->MaxVectorLength;     // address of mask data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    // Number of input bytes to scan: the source length, limited by the maximum
    // vector length and by the mask length. This can never exceed the source
    // length, so no zero-padding of the source is needed.
    uint64_t scanLength = sourceLength;
    if (scanLength > t->MaxVectorLength) scanLength = t->MaxVectorLength;
    if (scanLength > maskLength) scanLength = maskLength;            // no reason to go beyond mask
    uint32_t pos2 = 0;                           // position in destination vector
    // loop through mask register. pos1 = position in source vector
    for (uint32_t pos1 = 0; pos1 < scanLength; pos1 += elementSize) {
        if (*(masksrc + pos1) & 1) {             // check mask bit
            // copy from pos1 in source to pos2 in destination
            switch (elementSize) {
            case 1:  // int8
                *(destination+pos2) = *(source+pos1);
                break;
            case 2:  // int16
                *(uint16_t*)(destination+pos2) = *(uint16_t*)(source+pos1);
                break;
            case 4:  // int32, float
                *(uint32_t*)(destination+pos2) = *(uint32_t*)(source+pos1);
                break;
            case 8:  // int64, double
                *(uint64_t*)(destination+pos2) = *(uint64_t*)(source+pos1);
                break;
            case 16:  // int128, float128
                *(uint64_t*)(destination+pos2)   = *(uint64_t*)(source+pos1);
                *(uint64_t*)(destination+pos2+8) = *(uint64_t*)(source+pos1+8);
                break;
            }
            pos2 += elementSize;
        }
    }
    // set new length of destination vector
    t->vectorLength[rd] = pos2;
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save. result has already been saved
    return 0;
}
 
static uint64_t expand_sparse(CThread * t) {
    // Expand contiguous vector into sparse vector with positions indicated by mask bits.
    // For every element position of the output, bit 0 of the mask element decides:
    //   1: take the next unused element of the source vector
    //   0: insert zero
    // The output length is the value of the g.p. register in operands[5], limited
    // by the maximum vector length and by the mask length.
    // The result is written directly to the vector register file.
    const uint8_t destReg = t->operands[0];      // destination vector
    const uint8_t srcReg  = t->operands[4];      // source vector
    const uint8_t lenReg  = t->operands[5];      // length indicator
    const uint8_t maskReg = t->operands[1];      // mask vector
    const uint32_t srcBytes  = t->vectorLength[srcReg];
    const uint32_t maskBytes = t->vectorLength[maskReg];
    uint64_t outBytes = t->registers[lenReg];    // requested length of destination
    const uint32_t elemBytes = dataSizeTable[t->operandType & 7];
    int8_t * src      = t->vectors.buf() + srcReg*t->MaxVectorLength;
    int8_t * maskData = t->vectors.buf() + maskReg*t->MaxVectorLength;
    int8_t * dest     = t->vectors.buf() + destReg*t->MaxVectorLength;

    if (destReg == srcReg) {
        // expanding in place would overwrite unread input: work from a temporary copy
        memcpy(t->tempBuffer, src, srcBytes);
        src = t->tempBuffer;
    }
    // limit length
    if (outBytes > t->MaxVectorLength) outBytes = t->MaxVectorLength;
    if (outBytes > maskBytes) outBytes = maskBytes;          // no reason to go beyond mask
    if (outBytes > srcBytes) {
        // the loop may read past the end of the source: make sure that part is zero
        memset(src + srcBytes, 0, size_t(outBytes - srcBytes));
    }

    uint32_t readPos = 0;                        // position in source vector
    uint32_t writePos = 0;                       // position in destination vector
    while (writePos < outBytes) {
        const bool selected = (*(maskData + writePos) & 1) != 0;  // mask bit of this element
        int8_t * in  = src + readPos;
        int8_t * out = dest + writePos;
        // store either the next source element or zero
        switch (elemBytes) {
        case 1:  // int8
            *out = selected ? *in : 0;
            break;
        case 2:  // int16
            *(uint16_t*)out = selected ? *(uint16_t*)in : 0;
            break;
        case 4:  // int32, float
            *(uint32_t*)out = selected ? *(uint32_t*)in : 0;
            break;
        case 8:  // int64, double
            *(uint64_t*)out = selected ? *(uint64_t*)in : 0;
            break;
        case 16:  // int128, float128
            *(uint64_t*)out       = selected ? *(uint64_t*)in       : 0;
            *(uint64_t*)(out + 8) = selected ? *(uint64_t*)(in + 8) : 0;
            break;
        }
        if (selected) readPos += elemBytes;      // one source element consumed
        writePos += elemBytes;
    }
    // set new length of destination vector
    t->vectorLength[destReg] = writePos;
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save. result has already been saved
    return 0;
}
 
static uint64_t broad_(CThread * t) {
    // Broadcast a value into all elements of RD with a specified length.
    //   format 0x120: value = first element of the vector in operands[4],
    //                 length = g.p. register in operands[5]
    //   otherwise:    value = immediate operand in parm[2],
    //                 length = g.p. register in operands[4]
    // Only the destination length is set here; the returned value is stored by the
    // vector loop, which continues up to that length.
    const uint8_t destReg = t->operands[0];
    const bool vectorSource = (t->fInstr->format2 == 0x120);

    uint8_t  lengthReg;                          // g.p. register indicating length
    uint64_t broadcastValue;                     // value to broadcast
    if (vectorSource) {
        lengthReg = t->operands[5];
        broadcastValue = t->readVectorElement(t->operands[4], 0);
    }
    else {
        lengthReg = t->operands[4];
        broadcastValue = t->parm[2].q;
    }

    uint64_t length = t->registers[lengthReg];
    if (length > t->MaxVectorLength) length = t->MaxVectorLength;  // limit length
    t->vectorLengthR = (uint32_t)length;
    t->vectorLength[destReg] = t->vectorLengthR;
    return broadcastValue;
}
 
static uint64_t bits2bool(CThread * t) {
    // The lower n bits of RT are unpacked into a boolean vector RD with length RS
    // with one bit in each element, where n = RS / OS.
    // Each destination element receives the mask value with bit 0 replaced by the
    // corresponding source bit. Destination elements that have no corresponding bit
    // in RT (because RT is shorter than n bits) receive bit 0 = 0.
    // The result is written directly to RD. The return value is not used.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[5];         // RT = source vector
    uint8_t  rs = t->operands[4];         // RS indicates length
    SNum mask = t->parm[3];                      // mask
    uint8_t * source = (uint8_t*)t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    uint64_t destinationLength = t->registers[rs]; // value of RS = length of destination
    uint8_t  dsizelog = dataSizeTableLog[t->operandType]; // log2(elementsize)
    if (destinationLength > t->MaxVectorLength) destinationLength = t->MaxVectorLength; // limit length
    // set length of destination register
    t->vectorLength[rd] = (uint32_t)destinationLength;
    uint32_t num = (uint32_t)destinationLength >> dsizelog; // number of whole elements in destination
    // number of bits available in source
    uint32_t srcnum = t->vectorLength[rt] * 8;
    mask.q &= -(int64_t)2;                       // remove lower bit of mask. it will be replaced by source bit
    // loop through destination elements.
    // The loop count must never exceed num, or we would write beyond the destination register
    for (uint32_t i = 0; i < num; i++) {
        uint8_t bit = 0;                         // bits beyond the end of the source are zero
        if (i < srcnum) {
            bit = (source[i / 8] >> (i & 7)) & 1;  // extract single bit from source
        }
        switch (dsizelog) {
        case 0:  // int8
            *destination = mask.b | bit;  break;
        case 1:  // int16
            *(uint16_t*)destination = mask.s | bit;  break;
        case 2:  // int32
            *(uint32_t*)destination = mask.i | bit;  break;
        case 3:  // int64
            *(uint64_t*)destination = mask.q | bit;  break;
        case 4:  // int128
            *(uint64_t*)destination = mask.q | bit;
            *(uint64_t*)(destination+8) = mask.q | bit;
            break;
        }
        destination += (uint64_t)1 << dsizelog;  // advance to next destination element
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD
    if ((t->returnType & 7) >= 5) t->returnType -= 3;      // make return type integer
    return 0;
}
 
 
static uint64_t shift_expand(CThread * t) {
    // Shift vector RS up by RT bytes and extend the vector length by RT.
    // The lower RT bytes of RD will be zero.
    // The new length is limited to MaxVectorLength; bytes shifted beyond that are lost.
    // The result is written directly to RD. The return value is not used.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rs = t->operands[4];         // RS = source vector
    uint8_t  rt = t->operands[5];         // RT indicates length
    uint8_t * source = (uint8_t*)t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    uint64_t shiftCount = t->registers[rt];      // value of RT = shift count
    if (shiftCount > t->MaxVectorLength) shiftCount = t->MaxVectorLength; // limit length
    uint32_t sourceLength = t->vectorLength[rs]; // length of source vector
    uint32_t destinationLength = sourceLength + (uint32_t)shiftCount; // length of destination vector
    if (destinationLength > t->MaxVectorLength) destinationLength = t->MaxVectorLength; // limit length
    // set length of destination vector
    t->vectorLength[rd] = destinationLength;
    // Copy the source data first. The source must be moved before the lower part is
    // cleared because source and destination are the same memory when rd == rs.
    // memmove handles the overlapping regions correctly.
    if (destinationLength > shiftCount) {
        memmove(destination + shiftCount, source, size_t(destinationLength - shiftCount));
    }
    // set lower part of destination to zero
    memset(destination, 0, size_t(shiftCount));
    t->vect = 4;                                 // stop vector loop
    t->running = 2;                              // don't save RD. It has already been saved
    return 0;
}
 
static uint64_t shift_reduce(CThread * t) {
    // Drop the lowest RT bytes of vector RS and move the remaining bytes down,
    // so that RD becomes RT bytes shorter than RS.
    // A shift count larger than the source length gives an empty vector.
    // The result is written directly to RD. The return value is not used.
    const uint8_t destReg  = t->operands[0];     // destination vector
    const uint8_t srcReg   = t->operands[4];     // RS = source vector
    const uint8_t countReg = t->operands[5];     // RT holds the byte count
    uint8_t * srcData = (uint8_t*)t->vectors.buf() + srcReg*t->MaxVectorLength;   // address of RS data
    uint8_t * dstData = (uint8_t*)t->vectors.buf() + destReg*t->MaxVectorLength;  // address of RD data
    const uint32_t srcBytes = t->vectorLength[srcReg];  // length of source vector
    uint64_t dropBytes = t->registers[countReg]; // number of bytes to remove
    if (dropBytes > srcBytes) {
        dropBytes = srcBytes;                    // cannot remove more than we have
    }
    const uint32_t keptBytes = srcBytes - (uint32_t)dropBytes;  // length of destination vector
    t->vectorLength[destReg] = keptBytes;
    if (keptBytes != 0) {
        // memmove because source and destination may be the same register
        memmove(dstData, srcData + dropBytes, keptBytes);
    }
    t->running = 2;                              // don't save RD. It has already been saved
    t->vect = 4;                                 // stop vector loop
    return 0;
}
 
static uint64_t shift_up(CThread * t) {
    // Shift elements of vector RS up RT elements.
    // The lower RT elements of RD will be zero, the upper RT elements of RS are lost.
    // RD gets the same length as RS.
    // The result is written directly to RD. The return value is not used.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rs = t->operands[4];         // RS = source vector
    uint8_t  rt = t->operands[5];         // RT indicates shift count
    uint8_t * source = (uint8_t*)t->vectors.buf() + rs * t->MaxVectorLength; // address of RS data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd * t->MaxVectorLength; // address of RD data
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    uint64_t elementCount = t->registers[rt];    // value of RT = shift count, elements
    uint64_t shiftCount;                         // shift count, bytes
    // Compare before scaling: elementCount << dsizelog could wrap around for a huge
    // count and give a small shift instead of shifting everything out
    if (elementCount > (t->MaxVectorLength >> dsizelog)) {
        shiftCount = t->MaxVectorLength;         // limit length
    }
    else {
        shiftCount = elementCount << dsizelog;
    }
    uint32_t sourceLength = t->vectorLength[rs]; // length of source vector
    t->vectorLength[rd] = sourceLength;          // set length of destination vector to the same as source vector
    // copy from source. This must come before the zero fill in case rd == rs
    if (sourceLength > shiftCount) {
        memmove(destination + shiftCount, source, size_t(sourceLength - shiftCount));
    }
    // set lower part of destination to zero
    memset(destination, 0, size_t(shiftCount));
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    return 0;
}
 
static uint64_t shift_down(CThread * t) {
    // Shift elements of vector RS down RT elements.
    // The upper RT elements of RD will be zero, the lower RT elements of RS are lost.
    // RD gets the same length as RS.
    // The result is written directly to RD. The return value is not used.
    uint8_t  rd = t->operands[0];                   // destination vector
    uint8_t  rs = t->operands[4];                   // RS = source vector
    uint8_t  rt = t->operands[5];                   // RT indicates shift count
    uint8_t * source = (uint8_t*)t->vectors.buf() + rs*t->MaxVectorLength; // address of RS data
    uint8_t * destination = (uint8_t*)t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    uint32_t sourceLength = t->vectorLength[rs];           // length of source vector
    uint8_t  dsizelog = dataSizeTableLog[t->operandType];  // log2(elementsize)
    uint64_t elementCount = t->registers[rt];              // value of RT = shift count, elements
    uint64_t shiftCount;                                   // shift count, bytes
    // Compare before scaling: elementCount << dsizelog could wrap around for a huge
    // count and give a small shift instead of shifting everything out
    if (elementCount > (sourceLength >> dsizelog)) {
        shiftCount = sourceLength;                         // limit length
    }
    else {
        shiftCount = elementCount << dsizelog;
    }
    t->vectorLength[rd] = sourceLength;                    // set length of destination vector
    if (sourceLength > shiftCount) {                       // copy data from source
        memmove(destination, source + shiftCount, size_t(sourceLength - shiftCount));
    }
    if (shiftCount > 0) {                                  // set the rest to zero
        memset(destination + sourceLength - shiftCount, 0, size_t(shiftCount));
    }
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    return 0;
}
 
/*
static uint64_t rotate_up (CThread * t) {
    // Rotate vector RT up one element.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[5];         // RT = source vector
    //uint8_t  rs = t->operands[4];         // RS indicates length
    int8_t * source = t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    //uint64_t length = t->registers[rs];          // value of RS = vector length
    //if (length > t->MaxVectorLength) length = t->MaxVectorLength; // limit length
    uint32_t sourceLength = t->vectorLength[rt]; // length of source vector
    uint32_t length = sourceLength;
    if (rd == rt) {
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
        memcpy(t->tempBuffer, source, length);
        source = t->tempBuffer;
    }
    if (length > sourceLength) {                 // reading beyond the end of the source vector. make sure the rest is zero
        memset(source + sourceLength, 0, size_t(length - sourceLength));
    }
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
    if (elementSize > length) elementSize = (uint32_t)length;
    t->vectorLength[rd] = (uint32_t)length;                // set length of destination vector
    memcpy(destination, source + length - elementSize, elementSize); // copy top element to bottom
    memcpy(destination + elementSize, source, size_t(length - elementSize)); // copy the rest
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    return 0;
}
 
static uint64_t rotate_down (CThread * t) {
    // Rotate vector RT down one element.
    uint8_t  rd = t->operands[0];         // destination vector
    uint8_t  rt = t->operands[5];         // RT = source vector
    //uint8_t  rs = t->operands[4];         // RS indicates length
    int8_t * source = t->vectors.buf() + rt*t->MaxVectorLength; // address of RT data
    int8_t * destination = t->vectors.buf() + rd*t->MaxVectorLength; // address of RD data
    //uint64_t length = t->registers[rs];          // value of RS = vector length
    uint32_t sourceLength = t->vectorLength[rt]; // length of source vector
    uint32_t length = sourceLength;
    //if (length > t->MaxVectorLength) length = t->MaxVectorLength; // limit length
    if (rd == rt) {
        // source and destination are the same. Make a temporary copy of source to avoid overwriting
        memcpy(t->tempBuffer, source, length);
        source = t->tempBuffer;
    }
    if (length > sourceLength) {                 // reading beyond the end of the source vector. make sure the rest is zero
        memset(source + sourceLength, 0, size_t(length - sourceLength));
    }
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
    if (elementSize > length) elementSize = (uint32_t)length;
    t->vectorLength[rd] = (uint32_t)length;      // set length of destination vector
    memcpy(destination, source + elementSize, size_t(length - elementSize)); // copy down
    memcpy(destination + length - elementSize, source, elementSize); // copy the bottom element to top
    t->vect = 4;                                           // stop vector loop
    t->running = 2;                                        // don't save RD. It has already been saved
    return 0;
}*/
 
static uint64_t div_ex (CThread * t) {
    // Divide vector of double-size integers RS by integers RT.
    // RS has element size 2·OS. These are divided by the even numbered elements of RT with size OS.
    // The truncated results are stored in the even-numbered elements of RD.
    // The remainders are stored in the odd-numbered elements of RD
    // op = 24: signed, 25: unsigned
    // Returns the quotient. The remainder is handed back in parm[5].
    // On overflow or division by zero the quotient is saturated and the remainder is zero.
    // Operand size int64 is not implemented and raises INT_WRONG_PARAMETERS.
    SNum result;                                 // quotient
    SNum remainder;                              // remainder
    SNum a_lo = t->parm[1];                      // low part of dividend
    SNum b = t->parm[2];                         // divisor
    uint8_t rs = t->operands[4];          // RS = dividend vector. The high part is read from it below
    uint32_t elementSize = dataSizeTable[t->operandType];            // size of each element
    SNum a_hi;
    a_hi.q = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of dividend = next element of RS
    uint64_t sizemask = dataSizeMask[t->operandType]; // mask for operand size
    uint64_t signbit = (sizemask >> 1) + 1;      // mask indicating sign bit
    //SNum mask = t->parm[3];                      // mask register value or NUMCONTR
    bool isUnsigned = t->op & 1;                 // 24: signed, 25: unsigned
    bool overflow = false;
    int sign = 0;                                // 1 if result is negative
 
    if (!isUnsigned) {                           // convert signed division to unsigned
        if (b.q & signbit) {                     // b is negative. make it positive
            b.qs = -b.qs;  sign = 1;
        }
        if (a_hi.q & signbit) {                  // a is negative. make it positive
            // two's complement negation of the double-size value a_hi:a_lo
            a_lo.qs = - a_lo.qs;
            a_hi.q  = ~ a_hi.q;
            if ((a_lo.q & sizemask) == 0) a_hi.q++; // carry from low to high part
            sign ^= 1;                           // invert sign
        }
    }
    // limit data size
    b.q    &= sizemask;
    a_hi.q &= sizemask;
    a_lo.q &= sizemask;
    result.q = 0;
    remainder.q = 0;
    // check for overflow:
    // the quotient does not fit into OS bits if the high part is not smaller than the divisor.
    // This test also catches division by zero
    if (a_hi.q >= b.q || b.q == 0) {
        overflow = true;
    }
    else {
        // combine high and low part into one integer of double size and divide unsigned
        switch (t->operandType) {
        case 0: // int8
            a_lo.s |= a_hi.s << 8;
            result.s = a_lo.s / b.s;
            remainder.s = a_lo.s % b.s;
            break;
        case 1: // int16
            a_lo.i |= a_hi.i << 16;
            result.i = a_lo.i / b.i;
            remainder.i = a_lo.i % b.i;
            break;
        case 2: // int32
            a_lo.q |= a_hi.q << 32;
            result.q = a_lo.q / b.q;
            remainder.q = a_lo.q % b.q;
            break;
        case 3: // int64
            // to do: implement 128/64 -> 64 division by intrinsic or inline assembly
            // or bit shift method (other methods are too complex)
            // not implemented: falls through to the error case below
        default:
            t->interrupt(INT_WRONG_PARAMETERS);
        }
    }
    // check sign
    // NOTE(review): a quotient above INT_MAX with sign == 0 in signed mode is not
    // flagged as overflow here - confirm whether this is intended
    if (sign) {
        // negate quotient and remainder. A magnitude equal to signbit is treated as overflow
        if (result.q == signbit) overflow = true;
        result.qs = - result.qs;
        if (remainder.q == signbit) overflow = true;
        remainder.qs = - remainder.qs;
    }
    if (overflow) {
        if (isUnsigned) {   // unsigned overflow
            //if (mask.i & MSK_OVERFL_UNSIGN) t->interrupt(INT_OVERFL_UNSIGN);  // unsigned overflow
            result.q = sizemask;                 // all bits set in quotient
            remainder.q = 0;
        }
        else {       // signed overflow
            //if (mask.i & MSK_OVERFL_SIGN) t->interrupt(INT_OVERFL_SIGN);      // signed overflow
            result.q = signbit;                  // only sign bit set in quotient
            remainder.q = 0;
        }
    }
    t->parm[5].q = remainder.q;                  // save remainder
    return result.q;
}
 
static uint64_t f_mul_ex(CThread * t) {
    // Extended signed multiplication giving a product of double size.
    // The low part of the product is returned and the high part is placed in parm[5],
    // so that the result uses two consecutive array elements.
    // Only allowed with vector registers. Operand types other than int8 - int64
    // raise INT_WRONG_PARAMETERS.
    if (t->vect == 0) {
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }
    const SNum x = t->parm[1];                   // first factor
    const SNum y = t->parm[2];                   // second factor
    SNum product;                                // low part of product
    if (t->operandType == 0) {                   // int8
        product.is = (int32_t)x.bs * (int32_t)y.bs;
        t->parm[5].is = product.is >> 8;         // high part
    }
    else if (t->operandType == 1) {              // int16
        product.is = (int32_t)x.ss * (int32_t)y.ss;
        t->parm[5].is = product.is >> 16;        // high part
    }
    else if (t->operandType == 2) {              // int32
        product.qs = (int64_t)x.is * (int64_t)y.is;
        t->parm[5].qs = product.qs >> 32;        // high part
    }
    else if (t->operandType == 3) {              // int64
        // helper stores the high part through the pointer and returns the low part
        product.qs = mul64_128s(&t->parm[5].q, x.qs, y.qs);
    }
    else {                                       // unsupported operand type
        t->interrupt(INT_WRONG_PARAMETERS);
        product.i = 0;
    }
    return product.q;
}
 
static uint64_t f_mul_ex_u(CThread * t) {
    // Extended unsigned multiplication giving a product of double size.
    // The low part of the product is returned and the high part is placed in parm[5],
    // so that the result uses two consecutive array elements.
    // Only allowed with vector registers. Operand types other than int8 - int64
    // raise INT_WRONG_PARAMETERS.
    if (t->vect == 0) {
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }
    const SNum x = t->parm[1];                   // first factor
    const SNum y = t->parm[2];                   // second factor
    SNum product;                                // low part of product
    if (t->operandType == 0) {                   // int8
        product.i = (uint32_t)x.b * (uint32_t)y.b;
        t->parm[5].i = product.i >> 8;           // high part
    }
    else if (t->operandType == 1) {              // int16
        product.i = (uint32_t)x.s * (uint32_t)y.s;
        t->parm[5].i = product.i >> 16;          // high part
    }
    else if (t->operandType == 2) {              // int32
        product.q = (uint64_t)x.i * (uint64_t)y.i;
        t->parm[5].q = product.q >> 32;          // high part
    }
    else if (t->operandType == 3) {              // int64
        // helper stores the high part through the pointer and returns the low part
        product.q = mul64_128u(&t->parm[5].q, x.q, y.q);
    }
    else {                                       // unsupported operand type
        t->interrupt(INT_WRONG_PARAMETERS);
        product.i = 0;
    }
    return product.q;
}
 
static uint64_t sqrt_ (CThread * t) {
    // Square root.
    // Integer operands: returns the square root rounded down. Negative input gives 0.
    // Floating point operands: negative input gives a NAN with code nan_invalid_sqrt.
    // The rounding mode and exception detection are controlled by the mask in parm[3].
    // Unsupported operand types raise INT_WRONG_PARAMETERS and give 0.
    SNum a = t->parm[2];                         // input operand
    SNum result;  result.q = 0;
    uint32_t mask = t->parm[3].i;
    uint8_t operandType = t->operandType;
    bool detectExceptions = (mask & (0xF << MSKI_EXCEPTIONS)) != 0;  // make NAN if exceptions
    bool roundingMode = (mask & (3 << MSKI_ROUNDING)) != 0;  // non-standard rounding mode
    switch (operandType) {
    case 0:   // int8
        if (a.bs >= 0) result.b = (uint8_t)sqrtf((float)a.bs);
        break;
    case 1:   // int16
        // float has 24 bits of precision which is sufficient for 16-bit integers
        if (a.ss >= 0) result.s = (uint16_t)sqrtf((float)a.ss);
        break;
    case 2:   // int32
        // double has 53 bits of precision which is sufficient for 32-bit integers
        if (a.is >= 0) result.i = (uint32_t)sqrt((double)a.is);
        break;
    case 3:   // int64
        if (a.qs >= 0) {
            // double cannot represent all 64-bit integers exactly.
            // Use the floating point result as an estimate and correct it.
            // root <= 3037000500 here, so the squares below cannot overflow 64 bits
            uint64_t root = (uint64_t)sqrt((double)a.qs);
            while (root * root > a.q) root--;
            while ((root + 1) * (root + 1) <= a.q) root++;
            result.q = root;
        }
        break;
    case 5:   // float
        if (a.f < 0) {
            result.q = t->makeNan(nan_invalid_sqrt, operandType);
        }
        else {
            if (detectExceptions) clearExceptionFlags();   // clear previous exceptions
            if (roundingMode) setRoundingMode(mask >> MSKI_ROUNDING);
            result.f = sqrtf(a.f);                         // calculate square root
            if (roundingMode) setRoundingMode(0);
            if (detectExceptions) {
                uint32_t x = getExceptionFlags();          // read exceptions
                if ((mask & (1<<MSK_UNDERFLOW)) && (x & 0x10)) result.q = t->makeNan(nan_underflow, operandType);
                else if ((mask & (1<<MSK_INEXACT)) && (x & 0x20)) result.q = t->makeNan(nan_inexact, operandType);
            }
        }
        break;
    case 6:   // double
        if (a.d < 0) {
            result.q = t->makeNan(nan_invalid_sqrt, operandType);
        }
        else {
            if (detectExceptions) clearExceptionFlags();   // clear previous exceptions
            if (roundingMode) setRoundingMode(mask >> MSKI_ROUNDING);
            result.d = sqrt(a.d);                          // calculate square root
            if (roundingMode) setRoundingMode(0);
            if (detectExceptions) {
                uint32_t x = getExceptionFlags();          // read exceptions
                if ((mask & (1<<MSK_UNDERFLOW)) && (x & 0x10)) result.q = t->makeNan(nan_underflow, operandType);
                else if ((mask & (1<<MSK_INEXACT)) && (x & 0x20)) result.q = t->makeNan(nan_inexact, operandType);
            }
        }
        break;
    default:
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    return result.q;
}
 
static uint64_t add_c (CThread * t) {
    // Add with carry. Vector has two elements.
    // The upper element is used as carry on input and output.
    // Carry in is bit 0 of the element following the current element in RS.
    // Returns the sum. The carry out (0 or 1) is handed back in parm[5].
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    uint8_t rs = t->operands[4];          // RS is first input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of first input vector
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    result.q = a.q + b.q;                        // add
    uint8_t newCarry = (result.q & sizeMask) < (a.q & sizeMask); // carry from a + b
    if (carry.q & 1) {
        result.q++;                              // add carry in
        // Adding the carry wraps around only if the sum becomes zero.
        // This test must not be made without carry in: 0 + 0 gives zero without carry
        if ((result.q & sizeMask) == 0) newCarry = 1;
    }
    t->parm[5].q = newCarry;                     // save new carry
    return result.q;
}
 
static uint64_t sub_b (CThread * t) {
    // Subtract with borrow. Vector has two elements.
    // The upper element is used as borrow on input and output.
    // Borrow in is bit 0 of the element following the current element in RS.
    // Returns the difference. The borrow out (0 or 1) is handed back in parm[5].
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    uint8_t rs = t->operands[4];          // RS is first input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of first input vector
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    result.q = a.q - b.q;                        // subtract
    uint8_t newCarry = (result.q & sizeMask) > (a.q & sizeMask); // borrow from a - b
    if (carry.q & 1) {
        result.q--;                              // subtract borrow in
        // Subtracting the borrow wraps around only if the difference becomes all ones.
        // This test must not be made without borrow in: e.g. 0xFF - 0 gives all ones without borrow
        if ((result.q & sizeMask) == sizeMask) newCarry = 1;
    }
    t->parm[5].q = newCarry;                     // save new borrow
    return result.q;
}
 
static uint64_t add_ss (CThread * t) {
    // Signed integer addition with saturation.
    // The sum is replaced by the largest positive or the most negative value
    // of the operand size if the addition overflows.
    const uint64_t x = t->parm[1].q;             // first operand
    const uint64_t y = t->parm[2].q;             // second operand
    const uint64_t sizeMask = dataSizeMask[t->operandType];  // mask for data size
    const uint64_t intMax = sizeMask >> 1;       // largest positive value
    const uint64_t topBit = intMax + 1;          // sign bit
    uint64_t sum = x + y;
    // overflow when both operands have the same sign and the sum has the opposite sign
    const bool overflowed = ((~(x ^ y) & (x ^ sum)) & topBit) != 0;
    if (overflowed) {
        // saturate in the direction of the operands' sign: INT_MIN if negative, else INT_MAX
        sum = intMax + ((x & topBit) != 0);
    }
    return sum;
}
 
static uint64_t sub_ss (CThread * t) {
    // Signed integer subtraction with saturation.
    // The difference is replaced by the largest positive or the most negative value
    // of the operand size if the subtraction overflows.
    const uint64_t x = t->parm[1].q;             // minuend
    const uint64_t y = t->parm[2].q;             // subtrahend
    const uint64_t sizeMask = dataSizeMask[t->operandType];  // mask for data size
    const uint64_t intMax = sizeMask >> 1;       // largest positive value
    const uint64_t topBit = intMax + 1;          // sign bit
    uint64_t difference = x - y;
    // overflow when the operands have different sign and the sign of the difference differs from x
    const bool overflowed = (((x ^ y) & (x ^ difference)) & topBit) != 0;
    if (overflowed) {
        // saturate in the direction of the sign of x: INT_MIN if negative, else INT_MAX
        difference = intMax + ((x & topBit) != 0);
    }
    return difference;
}
 
static uint64_t add_us (CThread * t) {
    // Unsigned integer addition with saturation.
    // Returns the maximum unsigned value of the operand size if the sum wraps around.
    const uint64_t x = t->parm[1].q;             // first operand
    const uint64_t y = t->parm[2].q;             // second operand
    const uint64_t sizeMask = dataSizeMask[t->operandType];  // mask for data size
    const uint64_t sum = x + y;
    // the sum has wrapped around if it is smaller than an operand
    const bool wrapped = (sum & sizeMask) < (x & sizeMask);
    return wrapped ? sizeMask : sum;             // UINT_MAX if overflow
}
 
static uint64_t sub_us (CThread * t) {
    // Unsigned integer subtraction with saturation.
    // Returns zero if the difference would be negative.
    const uint64_t x = t->parm[1].q;             // minuend
    const uint64_t y = t->parm[2].q;             // subtrahend
    const uint64_t sizeMask = dataSizeMask[t->operandType];  // mask for data size
    const uint64_t difference = x - y;
    // the difference has wrapped around if it is bigger than the minuend
    const bool wrapped = (difference & sizeMask) > (x & sizeMask);
    return wrapped ? 0 : difference;
}
 
static uint64_t mul_ss (CThread * t) {
    // Signed integer multiplication with saturation.
    // The product is replaced by the largest positive or the most negative value
    // of the operand size if it does not fit into the operand size.
    // Operand types other than int8 - int64 raise INT_WRONG_PARAMETERS.
    const SNum x = t->parm[1];                   // first factor
    const SNum y = t->parm[2];                   // second factor
    const uint64_t sizeMask = dataSizeMask[t->operandType];  // mask for data size
    const uint64_t topBit = (sizeMask >> 1) + 1; // sign bit
    SNum product;
    bool tooBig = false;                         // product does not fit

    // Sizes up to 32 bits: multiply with double size and check
    // whether the truncated product equals the full product
    if (t->operandType == 0) {                   // int8
        product.is = (int32_t)x.bs * (int32_t)y.bs;
        tooBig = product.bs != product.is;
    }
    else if (t->operandType == 1) {              // int16
        product.is = (int32_t)x.ss * (int32_t)y.ss;
        tooBig = product.ss != product.is;
    }
    else if (t->operandType == 2) {              // int32
        product.qs = (int64_t)x.is * (int64_t)y.is;
        tooBig = product.is != product.qs;
    }
    else if (t->operandType == 3) {              // int64
        // no wider integer type: compare with a floating point product instead.
        // A wrapped product differs by far more than the tolerance
        product.qs = x.qs * y.qs;
        tooBig = fabs((double)x.qs * (double)y.qs - (double)product.qs) > 1.E8;
    }
    else {
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    if (tooBig) {
        // negative product if the factors have different sign: INT_MIN, else INT_MAX
        product.q = (sizeMask >> 1) + (((x.q ^ y.q) & topBit) != 0);
    }
    return product.q;
}
 
static uint64_t mul_us (CThread * t) {
    // Unsigned integer multiplication with saturation.
    // Returns the maximum unsigned value of the operand size if the product
    // does not fit into the operand size.
    // Operand types other than int8 - int64 raise INT_WRONG_PARAMETERS.
    const SNum x = t->parm[1];                   // first factor
    const SNum y = t->parm[2];                   // second factor
    const uint64_t sizeMask = dataSizeMask[t->operandType];  // mask for data size
    SNum product;
    bool tooBig = false;                         // product does not fit

    // Sizes up to 32 bits: multiply with double size and check
    // whether the truncated product equals the full product
    if (t->operandType == 0) {                   // int8
        product.i = (uint32_t)x.b * (uint32_t)y.b;
        tooBig = product.b != product.i;
    }
    else if (t->operandType == 1) {              // int16
        product.i = (uint32_t)x.s * (uint32_t)y.s;
        tooBig = product.s != product.i;
    }
    else if (t->operandType == 2) {              // int32
        product.q = (uint64_t)x.i * (uint64_t)y.i;
        tooBig = product.i != product.q;
    }
    else if (t->operandType == 3) {              // int64
        // no wider integer type: compare with a floating point product instead.
        // A wrapped product differs by far more than the tolerance
        product.q = x.q * y.q;
        tooBig = fabs((double)x.q * (double)y.q - (double)product.q) > 1.E8;
    }
    else {
        t->interrupt(INT_WRONG_PARAMETERS);
    }
    if (tooBig) {
        product.q = sizeMask;                    // UINT_MAX
    }
    return product.q;
}
 
/*
static uint64_t shift_ss (CThread * t) {
    // Shift left integer vectors, signed with saturation
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    result.q = a.q << b.i;                       // shift left
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    uint64_t signBit = (sizeMask >> 1) + 1;      // sign bit
    uint32_t bits1 = bitScanReverse(a.q & sizeMask) + 1;  // number of bits in a
    uint32_t bitsMax = dataSizeTable[t->operandType]; // maximum number of bits if negative
    uint8_t negative = (a.q & signBit) != 0;     // a is negative
    if (!negative) bitsMax--;                    // maximum number of bits if positive
    if ((a.q & sizeMask) != 0 && bits1 + (b.q & sizeMask) > bitsMax) { // overflow
        result.q = (sizeMask >> 1) + negative;   // INT_MAX or INT_MIN
    }
    return result.q;
}
 
static uint64_t shift_us (CThread * t) {
    // Shift left integer vectors, unsigned with saturation
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    SNum result;
    result.q = a.q << b.i;                       // shift left
    uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    uint32_t bits1 = bitScanReverse(a.q & sizeMask) + 1;  // number of bits in a
    uint32_t bitsMax = dataSizeTable[t->operandType]; // maximum number of bits
    if ((a.q & sizeMask) != 0 && bits1 + (b.q & sizeMask) > bitsMax) { // overflow
        result.q = sizeMask;                     // UINT_MAX
    }
    return result.q;
} */
 
/*
Instructions with overflow check use the even-numbered vector elements for arithmetic instructions.
Each following odd-numbered vector element is used for overflow detection. If the first source operand
is a scalar then the result operand will be a vector with two elements.
Overflow conditions are indicated with the following bits:
bit 0. Unsigned integer overflow (carry).
bit 1. Signed integer overflow.
The values are propagated so that the overflow result of the operation is OR’ed with the corresponding
values of both input operands. */
 
static uint64_t add_oc (CThread * t) {
    // Integer addition with overflow check.
    // The overflow flags of both input vectors are read from the element following
    // the current element and OR'ed with the flags generated by this addition:
    // bit 0 = unsigned overflow (carry), bit 1 = signed overflow.
    // Returns the sum. The combined flags are handed back in parm[5].
    // Only integer operand types (< 4) are supported; others raise INT_WRONG_PARAMETERS.
    const uint64_t x = t->parm[1].q;             // first operand
    const uint64_t y = t->parm[2].q;             // second operand
    const uint8_t rs = t->operands[4];           // RS is first input vector
    const uint8_t rt = t->operands[5];           // RT is second input vector
    const uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum flags;                                  // accumulated overflow flags
    flags.q  = t->readVectorElement(rs, t->vectorOffset + elementSize); // flags of first input vector
    flags.q |= t->readVectorElement(rt, t->vectorOffset + elementSize); // flags of second input vector

    if (t->operandType >= 4) {
        // unsupported operand type
        t->interrupt(INT_WRONG_PARAMETERS);
        t->parm[5].q = flags.q & 3;              // return the propagated flags
        return 0;
    }
    const uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    const uint64_t topBit = (sizeMask >> 1) + 1; // sign bit
    const uint64_t sum = x + y;
    if ((sum & sizeMask) < (x & sizeMask)) {
        flags.b |= 1;                            // sum wrapped around: unsigned overflow
    }
    // signed overflow if both operands have the same sign and the sum has the opposite sign
    if ((~(x ^ y) & (x ^ sum)) & topBit) {
        flags.b |= 2;
    }
    t->parm[5].q = flags.q & 3;                  // return flags
    return sum;                                  // return result
}
 
static uint64_t sub_oc (CThread * t) {
    // Integer subtraction with overflow check.
    // The overflow flags of both input vectors are read from the element following
    // the current element and OR'ed with the flags generated by this subtraction:
    // bit 0 = unsigned overflow (borrow), bit 1 = signed overflow.
    // Returns the difference. The combined flags are handed back in parm[5].
    // Only integer operand types (< 4) are supported; others raise INT_WRONG_PARAMETERS.
    const uint64_t x = t->parm[1].q;             // minuend
    const uint64_t y = t->parm[2].q;             // subtrahend
    const uint8_t rs = t->operands[4];           // RS is first input vector
    const uint8_t rt = t->operands[5];           // RT is second input vector
    const uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum flags;                                  // accumulated overflow flags
    flags.q  = t->readVectorElement(rs, t->vectorOffset + elementSize); // flags of first input vector
    flags.q |= t->readVectorElement(rt, t->vectorOffset + elementSize); // flags of second input vector

    if (t->operandType >= 4) {
        // unsupported operand type
        t->interrupt(INT_WRONG_PARAMETERS);
        t->parm[5].q = flags.q & 3;              // return the propagated flags
        return 0;
    }
    const uint64_t sizeMask = dataSizeMask[t->operandType]; // mask for data size
    const uint64_t topBit = (sizeMask >> 1) + 1; // sign bit
    const uint64_t difference = x - y;
    if ((difference & sizeMask) > (x & sizeMask)) {
        flags.b |= 1;                            // difference wrapped around: unsigned overflow
    }
    // signed overflow if the operands have opposite sign and the difference has the opposite sign of x
    if (((x ^ y) & (x ^ difference)) & topBit) {
        flags.b |= 2;
    }
    t->parm[5].q = flags.q & 3;                  // return flags
    return difference;                           // return result
}
 
static uint64_t mul_oc (CThread * t) {
    // Multiply with overflow check: computes parm[1] * parm[2].
    // The overflow flags are returned in parm[5]:
    //   bit 0 = unsigned overflow, bit 1 = signed overflow.
    // Flags found in the element following the current one in the RS and RT
    // vectors are OR'ed into the flags produced here.
    // Only operand types 0 - 3 (int8 - int64) are supported; other types raise
    // INT_WRONG_PARAMETERS and give a zero result.
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    uint8_t rs = t->operands[4];          // RS is first input vector
    uint8_t rt = t->operands[5];          // RT is second input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q  = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of first input vector
    carry.q |= t->readVectorElement(rt, t->vectorOffset + elementSize);  // high part of second input vector
    SNum result;
    result.q = 0;                                // defined value for unsupported types and for unused upper bits
    bool signedOverflow = false;
    bool unsignedOverflow = false;

    // multiply and check for signed and unsigned overflow
    switch (t->operandType) {
    case 0:  // int8: multiply in 32 bits and compare with the truncated value
        result.is = (int32_t)a.bs * (int32_t)b.bs;
        unsignedOverflow = result.b != result.i;
        signedOverflow = result.bs != result.is;
        break;
    case 1:  // int16: multiply in 32 bits and compare with the truncated value
        result.is = (int32_t)a.ss * (int32_t)b.ss;
        unsignedOverflow = result.s != result.i;
        signedOverflow = result.ss != result.is;
        break;
    case 2:  // int32: multiply in 64 bits and compare with the truncated value
        result.qs = (int64_t)a.is * (int64_t)b.is;
        unsignedOverflow = result.q != result.i;
        signedOverflow = result.qs != result.is;
        break;
    case 3:  // int64
        // Multiply as unsigned: the low 64 bits are the same as for a signed
        // multiplication, but wrap-around is well defined, whereas signed
        // overflow is undefined behavior.
        result.q = a.q * b.q;
        // No wider integer type is assumed to be available. Compare with an
        // approximate floating point product instead. An overflow makes the
        // difference at least 2^64, far above the rounding error threshold.
        unsignedOverflow = fabs((double)a.q * (double)b.q - (double)result.q) > 1.E8;
        signedOverflow   = fabs((double)a.qs * (double)b.qs - (double)result.qs) > 1.E8;
        break;
    default:
        // unsupported operand type. result remains zero
        t->interrupt(INT_WRONG_PARAMETERS);
        break;
    }
    if (unsignedOverflow) carry.b |= 1;     // unsigned overflow
    if (signedOverflow)   carry.b |= 2;     // signed overflow
    t->parm[5].q = carry.q & 3;                  // return carry
    return result.q;                             // return result
}
 
static uint64_t div_oc (CThread * t) {
    // Signed divide with overflow check: computes parm[1] / parm[2].
    // The overflow flags are returned in parm[5]:
    //   bit 0 = unsigned overflow, bit 1 = signed overflow.
    // Division by zero sets both flags; dividing the most negative value by -1
    // sets the signed overflow flag. In both cases the result is the most
    // negative value of the operand size.
    // Flags found in the element following the current one in the RS and RT
    // vectors are OR'ed into the flags produced here.
    // Only operand types 0 - 3 (int8 - int64) are supported; other types raise
    // INT_WRONG_PARAMETERS and give a zero result.
    SNum a = t->parm[1];                         // input operand
    SNum b = t->parm[2];                         // input operand
    uint8_t rs = t->operands[4];          // RS is first input vector
    uint8_t rt = t->operands[5];          // RT is second input vector
    uint32_t elementSize = dataSizeTable[t->operandType]; // size of each element
    SNum carry;
    carry.q  = t->readVectorElement(rs, t->vectorOffset + elementSize);  // high part of first input vector
    carry.q |= t->readVectorElement(rt, t->vectorOffset + elementSize);  // high part of second input vector
    SNum result;
    result.q = 0;                                // defined value for unsupported types and for unused upper bits

    // to do: rounding mode!

    switch (t->operandType) {
    case 0:  // int8
        if (b.b == 0) {
            result.i = 0x80; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.b == 0x80 && b.bs == -1) {
            result.i = 0x80; carry.b |= 2;     // signed overflow
        }
        else result.i = a.bs / b.bs;
        break;
    case 1:  // int16
        if (b.s == 0) {
            result.i = 0x8000; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.s == 0x8000 && b.ss == -1) {
            result.i = 0x8000; carry.b |= 2;     // signed overflow
        }
        else result.i = a.ss / b.ss;
        break;
    case 2:  // int32
        if (b.i == 0) {
            result.i = sign_f; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.i == sign_f && b.is == -1) {
            result.i = sign_f; carry.b |= 2;     // signed overflow
        }
        else result.i = a.is / b.is;
        break;
    case 3:  // int64
        if (b.q == 0) {
            result.q = sign_d; carry.b |= 3;     // signed and unsigned overflow
        }
        else if (a.q == sign_d && b.qs == int64_t(-1)) {
            result.q = sign_d; carry.b |= 2;     // signed overflow
        }
        else result.qs = a.qs / b.qs;
        break;
    default:
        // unsupported operand type. result remains zero
        t->interrupt(INT_WRONG_PARAMETERS);
        break;
    }
    t->parm[5].q = carry.q & 3;                  // return carry
    return result.q;                             // return result
}
 
static uint64_t read_spev (CThread * t) {
    // Intended operation: read special register RS into vector register RD
    // with length RT.
    // Not implemented yet (to do). The thread state is not touched and the
    // return value is always zero.
    (void)t;                                     // unused until implemented
    return 0;
}
 
static uint64_t read_call_stack (CThread * t) {
    // Intended operation: read internal call stack.
    // RD = vector register destination of length RS, RT-RS = internal address.
    // Not implemented yet (to do). Always returns zero.
    (void)t;                                     // unused until implemented
    return 0;
}
 
static uint64_t write_call_stack (CThread * t) {
    // Intended operation: write internal call stack.
    // RD = vector register source of length RS, RT-RS = internal address.
    // Not implemented yet (to do). Always returns zero.
    (void)t;                                     // unused until implemented
    return 0;
}
 
static uint64_t read_memory_map (CThread * t) {
    // Intended operation: read memory map.
    // RD = vector register destination of length RS, RT-RS = internal address.
    // Not implemented yet (to do). Always returns zero.
    (void)t;                                     // unused until implemented
    return 0;
}
 
static uint64_t write_memory_map (CThread * t) {
    // Intended operation: write memory map. RD = vector register.
    // Not implemented yet (to do). Always returns zero.
    (void)t;                                     // unused until implemented
    return 0;
}
 
/* Input ports to match soft core
Note: serial input from stdin in windows and Linux is messy. Emulation will have quirks.
 
Input port 8. Serial input:
Read one byte from RS232 serial input. The value is
bit 0-7: Received data (zero if input buffer empty)
bit   8: Data valid. Will be 0 if the input buffer is empty. It will not wait for data if the system allows polling
bit   9: More data ready: The input buffer contains at least one more byte ready to read
bit  12: Buffer overflow error. Data has been lost due to input buffer overflow
bit  13: Frame error. Error detected in start bit or stop bit. May be due to noise or wrong BAUD rate
 
Input port 9. Serial input status:
bit 0-15: Number of bytes currently in input buffer
bit   16: Buffer overflow error. Data has been lost due to input buffer overflow
bit   17: Frame error. Error detected in start bit or stop bit. May be due to noise or wrong BAUD rate
 
Input port 11. Serial output status:
bit 0-15: Number of bytes currently in output buffer
bit   16: Buffer overflow error. Data has been lost due to output buffer overflow
bit   18: Ready. The output buffer has enough space to receive at least one more byte
 
*/
 
static uint64_t input_ (CThread * t) {
    // Read from input port.
    // vector version: RD = vector register, RS = port address, RT = vector length
    // g.p. version: RD = g.p. register, RS = port address, IM1 = port address
    // Returns the value read from the port, or 0 if nothing is available.
    // The vector version and unknown port numbers raise INT_WRONG_PARAMETERS
    // and return 0.
    using namespace std;  // some compilers have getchar and putchar in namespace std, some not

    if (t->vect) {
        // vector version not implemented yet
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }

    // The immediate operand contains the port number, except that the value
    // 255 means that the port number is taken from the register operand
    const uint32_t immediatePort = t->parm[2].i;
    const uint32_t port = (immediatePort == 255) ? t->parm[1].i : immediatePort;

#if defined (__WINDOWS__) || defined (_WIN32) || defined (_WIN64)
    if (port == 8) {
        // port 8: read serial input
        if (!_kbhit()) return 0;                 // input buffer empty
        // _getch reads a character without waiting for enter (getchar would wait)
        const int ch = _getch();
        if (ch < 0) return 0;                    // error or end of file (EOF = -1)
        return (ch | 0x100);                     // set the data valid bit
    }
    if (port == 9) {
        // port 9: read serial input status. Only in systems that allow polling
        return _kbhit();
    }
#else
    // Other operating systems:
    // There is no portable way of non-blocking read or polling of a serial
    // input. Ports 8 and 9 are treated as unknown ports here.
    // to do: implement for Linux using curses.h or something
#endif

    if (port == 11) {
        // port 11: get serial output status
        return 0;
    }

    // unknown port
    t->interrupt(INT_WRONG_PARAMETERS);
    return 0;
}
 
/* Output ports to match soft core
Output port 9. Serial input control:
bit    0: Clear buffer. Delete all data currently in the input buffer, and clear error flags
bit    1: Clear error flags but keep data.
          The error bits remain high after an error condition until reset by this or by system reset
 
Output port 10. Serial output:
Write one byte to RS232 serial output.
bit 0-7: Data to write
Other bits are reserved.
 
Output port 11. Serial output control:
bit    0: Clear buffer. Delete all data currently in the output buffer, and clear error flags
bit    1: Clear error flags but keep data.
          The error bits remain high after an error condition until reset by this or by system reset
*/
 
static uint64_t output_ (CThread * t) {
    // Write to output port.
    // vector version: RD = vector register to write, RS = port address, RT = vector length
    // g.p. version: RD = g.p. register to write, RS = port address, IM1 = port address
    // The vector version and unknown port numbers raise INT_WRONG_PARAMETERS.
    // Always returns 0.
    using namespace std;  // some compilers have getchar and putchar in namespace std::, some not

    if (t->vect) {
        // vector version not implemented yet
        t->interrupt(INT_WRONG_PARAMETERS);
        return 0;
    }

    // The immediate operand contains the port number, except that the value
    // 255 means that the port number is taken from the register operand
    const uint32_t immediatePort = t->parm[2].i;
    const uint32_t port = (immediatePort == 255) ? t->parm[1].i : immediatePort;
    const uint32_t value = t->parm[0].i;         // value to output

    if (port == 10) {
        // port 10: write character
        putchar(value);
    }
    else if (port == 9) {
        // port 9: clear input buffer
#if defined (__WINDOWS__) || defined (_WIN32) || defined (_WIN64)
        while (_kbhit()) (void)_getch();
#endif
    }
    else if (port != 11) {
        // unknown port.
        // (port 11 = serial output control is accepted but does nothing
        // because it is not possible in most operating systems)
        t->interrupt(INT_WRONG_PARAMETERS);
    }

    t->running = 2;  // don't save to register RD
    return 0;
}
 
 
// tables of single format instructions
// Format 1.0 A. Three general purpose registers
// Table of 64 function pointers. There are currently no instructions with
// this format: the listed entries are all 0, and the entries that are not
// listed are zero-initialized as well.
PFunc funcTab4[64] = {
    0, 0, 0, 0, 0, 0, 0, 0
};
 
// Format 1.1 C. One general purpose register and a 16 bit immediate operand. int64
// Table of 64 function pointers. The position in the table selects the
// function (presumably indexed by the operation code of the instruction -
// confirm against the dispatcher). A 0 entry means that no function is
// assigned. Entries that are not listed are zero-initialized.
PFunc funcTab5[64] = {
    move_16s, move_16s, 0, move_16u, shifti1_move, shifti1_move, f_add, 0,   // 0 - 7
    f_mul, 0, shifti1_add, shifti1_add, shifti1_and, shifti1_and, shifti1_or, shifti1_or,  // 8 - 15 
    shifti1_xor, shifti1_xor, shift16_add, 0, 0, 0, 0, // 16 - 22 (entries 23 - 63 are implicitly 0)
};
 
 
// Format 1.2 A. Three vector register operands
// Table of 64 function pointers, all of them listed explicitly. The position
// in the table selects the function (presumably indexed by the operation code
// of the instruction - confirm against the dispatcher). A 0 entry means that
// no function is assigned.
PFunc funcTab6[64] = {
    get_len, get_len, set_len, set_len, insert_, extract_, broad_, 0,               // 0  - 7
    compress_sparse, expand_sparse, 0, 0, bits2bool, 0, 0, 0,                       // 8 - 15
    shift_expand, shift_reduce, shift_up, shift_down, 0, 0, 0, 0, // 16 - 23
    div_ex, div_ex, f_mul_ex, f_mul_ex_u, sqrt_, 0, 0, 0,                           // 24 - 31
    add_ss, add_us, sub_ss, sub_us, mul_ss, mul_us, add_oc, sub_oc,                 // 32 - 39
    mul_oc, div_oc, add_c, sub_b, 0, 0, 0, 0,                                       // 40 - 47
    0, 0, 0, 0, 0, 0, 0, 0,                                                         // 48 - 55
    read_spev, 0, read_call_stack, write_call_stack, read_memory_map, write_memory_map, input_, output_ // 56 - 63
};
 
 
// Format 1.8 B. Two general purpose registers and an 8-bit immediate operand. int64
// Table of 64 function pointers, all of them listed explicitly. The position
// in the table selects the function (presumably indexed by the operation code
// of the instruction - confirm against the dispatcher). A 0 entry means that
// no function is assigned.
// input_ and output_ occupy positions 62 and 63, the same positions as in
// funcTab6.
PFunc funcTab9[64] = {
    abs_64, shifti_add, bitscan_, roundp2, popcount_, 0, 0, 0,   // 0  - 7
    0, 0, 0, 0, 0, 0, 0, 0,                                      // 8 - 15
    0, 0, 0, 0, 0, 0, 0, 0,                                      // 16 - 23
    0, 0, 0, 0, 0, 0, 0, 0,                                      // 24 - 31
    read_spec, write_spec, read_capabilities, write_capabilities, read_perf, read_perf, read_sys, write_sys, // 32 - 39
    0, 0, 0, 0, 0, 0, 0, 0,                                      // 40 - 47
    0, 0, 0, 0, 0, 0, 0, 0,                                      // 48 - 55
    push_r, pop_r, 0, 0, 0, 0, input_, output_                   // 56 - 63
};
 
// Format 2.9 A. Three general purpose registers and a 32-bit immediate operand
// Table of 64 function pointers. The position in the table selects the
// function (presumably indexed by the operation code of the instruction -
// confirm against the dispatcher). A 0 entry means that no function is
// assigned. Only entries 0 - 47 are listed; entries 48 - 63 are
// zero-initialized.
PFunc funcTab12[64] = {
    move_hi32, insert_hi32, add_32u, sub_32u, add_hi32, and_hi32, or_hi32, xor_hi32,  // 0  - 7
    0, replace_bits, 0, 0, 0, 0, 0, 0,                                                // 8 - 15
    0, 0, 0, 0, 0, 0, 0, 0,                                                           // 16 - 23
    0, 0, 0, 0, 0, 0, 0, 0,                                                           // 24 - 31
    address_, 0, 0, 0, 0, 0, 0, 0,                                                    // 32 - 39
    0, 0, 0, 0, 0, 0, 0, 0,                                                           // 40 - 47
};
Browse

Tools

Subversion Repositories forwardcom

[/] [forwardcom/] [bintools/] [emulator4.cpp] - Blame information for rev 139

Line No.	Rev	Author	Line
1	55	Agner	`/************************** emulator4.cpp ******************************`
2			`* Author: Agner Fog`
3			`* date created: 2018-02-18`
4			`* Last modified: 2021-08-05`
5			`* Version: 1.11`
6			`* Project: Binary tools for ForwardCom instruction set`
7			`* Description:`
8			`* Emulator: Execution functions for single format instructions, part 1`
9			`*`
10			`* Copyright 2018-2021 GNU General Public License http://www.gnu.org/licenses`
11			`*****************************************************************************/`
12
13			`#include "stdafx.h"`
14
15
16			`// Format 1.0 A. Three general purpose registers`
17
18			`// Currently no instructions with format 1.0`
19
20
21			`// Format 1.1 C. One general purpose register and a 16 bit immediate operand. int64`
22
23			`static uint64_t move_16s(CThread * t) {`
24			`// Move 16-bit sign-extended constant to general purpose register.`
25			`return t->parm[2].q;`
26			`}`
27
28			`static uint64_t move_16u(CThread * t) {`
29			`// Move 16-bit zero-extended constant to general purpose register.`
30			`return t->parm[2].s;`
31			`}`
32
33			`static uint64_t shift16_add(CThread * t) {`
34			`// Shift 16-bit unsigned constant left by 16 and add.`
35			`t->parm[2].q <<= 16;`
36			`return f_add(t);`
37			`}`
38
39			`static uint64_t shifti1_move(CThread * t) {`
40			`// RD = IM2 << IM1. Sign-extend IM2 to 32/64 bits and shift left by the unsigned value IM1`