URL https://opencores.org/ocsvn/an-fpga-implementation-of-low-latency-noc-based-mpsoc/an-fpga-implementation-of-low-latency-noc-based-mpsoc/trunk
Subversion Repositories an-fpga-implementation-of-low-latency-noc-based-mpsoc

[/] [an-fpga-implementation-of-low-latency-noc-based-mpsoc/] [trunk/] [mpsoc/] [src_processor/] [mor1kx-5.0/] [rtl/] [verilog/] [pfpu32/] [pfpu32_rnd.v] - Rev 48

Compare with Previous | Blame | View Log
/////////////////////////////////////////////////////////////////////
//                                                                 //
//    pfpu32_rnd                                                   //
//    32-bit common rounding module for FPU                        //
//                                                                 //
//    This file is part of the mor1kx project                      //
//    https://github.com/openrisc/mor1kx                           //
//                                                                 //
//    Author: Andrey Bacherov                                      //
//            avbacherov@opencores.org                             //
//                                                                 //
/////////////////////////////////////////////////////////////////////
//                                                                 //
//   Copyright (C) 2014 Andrey Bacherov                            //
//                      avbacherov@opencores.org                   //
//                                                                 //
//   This source file may be used and distributed without          //
//   restriction provided that this copyright statement is not     //
//   removed from the file and that any derivative work contains   //
//   the original copyright notice and the associated disclaimer.  //
//                                                                 //
//       THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY       //
//   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED     //
//   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS     //
//   FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL THE AUTHOR        //
//   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,           //
//   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES      //
//   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE     //
//   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR          //
//   BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    //
//   LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY, OR TORT    //
//   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT    //
//   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           //
//   POSSIBILITY OF SUCH DAMAGE.                                   //
//                                                                 //
/////////////////////////////////////////////////////////////////////
 
`include "mor1kx-defines.v"
 
// pfpu32_rnd: common align + rounding back-end shared by the FPU units.
//
// Pipeline (every stage advances only while adv_i is high):
//   Stage #1 - selects the data of whichever unit reports ready
//              (add/sub, mul/div, i2f, f2i), aligns the fraction by a right
//              or a left shift, collects the sticky bit and picks the exponent.
//   Stage #2 - rounds according to rmode_i, builds the float or the integer
//              result together with the exception flags and registers them.
// Comparator results skip stage #1 and are only registered at the output.
//
// NOTE(review): the input multiplexers are plain AND-OR structures, so they
// presumably rely on at most one *_rdy_i being asserted per cycle - confirm
// against the units that drive this module.
module pfpu32_rnd
(
  // clocks, resets and other controls
  input        clk,
  input        rst,
  input        flush_i,  // flush pipe
  input        adv_i,    // advance pipe
  input  [1:0] rmode_i,  // rounding mode
  // input from add/sub
  input        add_rdy_i,       // add/sub is ready
  input        add_sign_i,      // add/sub signum
  input        add_sub_0_i,     // flag that actual subtraction is performed and result is zero
  input  [4:0] add_shl_i,       // do left shift in align stage
  input  [9:0] add_exp10shl_i,  // exponent for left shift align
  input  [9:0] add_exp10sh0_i,  // exponent for no shift in align
  input [27:0] add_fract28_i,   // fractional with appended {r,s} bits
  input        add_inv_i,       // add/sub invalid operation flag
  input        add_inf_i,       // add/sub infinity input
  input        add_snan_i,      // add/sub signaling NaN input
  input        add_qnan_i,      // add/sub quiet NaN input
  input        add_anan_sign_i, // add/sub signum for output nan
  // input from mul
  input        mul_rdy_i,       // mul is ready
  input        mul_sign_i,      // mul signum
  input  [4:0] mul_shr_i,       // do right shift in align stage
  input  [9:0] mul_exp10shr_i,  // exponent for right shift align
  input        mul_shl_i,       // do left shift in align stage
  input  [9:0] mul_exp10shl_i,  // exponent for left shift align
  input  [9:0] mul_exp10sh0_i,  // exponent for no shift in align
  input [27:0] mul_fract28_i,   // fractional with appended {r,s} bits
  input        mul_inv_i,       // mul invalid operation flag
  input        mul_inf_i,       // mul infinity input
  input        mul_snan_i,      // mul signaling NaN input
  input        mul_qnan_i,      // mul quiet NaN input
  input        mul_anan_sign_i, // mul signum for output nan
  // input from div
  input        div_op_i,        // MUL/DIV output is division
  input        div_sign_rmnd_i, // signum of remainder for IEEE compliant rounding
  input        div_dbz_i,       // division by zero flag
  // input from i2f
  input        i2f_rdy_i,       // i2f is ready
  input        i2f_sign_i,      // i2f signum
  input  [3:0] i2f_shr_i,       // i2f right shift value for align stage
  input  [7:0] i2f_exp8shr_i,   // i2f exponent for right shift align
  input  [4:0] i2f_shl_i,       // i2f left shift value for align stage
  input  [7:0] i2f_exp8shl_i,   // i2f exponent for left shift align
  input  [7:0] i2f_exp8sh0_i,   // i2f exponent for no shift in align
  input [31:0] i2f_fract32_i,   // i2f fractional (r,s bits are appended in this module)
  // input from f2i
  input        f2i_rdy_i,       // f2i is ready
  input        f2i_sign_i,      // f2i signum
  input [23:0] f2i_int24_i,     // f2i fractional
  input  [4:0] f2i_shr_i,       // f2i required shift right value
  input  [3:0] f2i_shl_i,       // f2i required shift left value
  input        f2i_ovf_i,       // f2i overflow flag
  input        f2i_snan_i,      // f2i signaling NaN input
  // input from cmp
  input        cmp_rdy_i,       // cmp is ready
  input        cmp_res_i,       // cmp result
  input        cmp_inv_i,       // cmp invalid flag
  input        cmp_inf_i,       // cmp infinity flag
  // outputs
  //  arithmetic part's outputs
  output reg                  [31:0] fpu_result_o,
  output reg                         fpu_arith_valid_o,
  //  comparator's outputs
  output reg                         fpu_cmp_flag_o,
  output reg                         fpu_cmp_valid_o,
  //  common output
  output reg [`OR1K_FPCSR_WIDTH-1:0] fpcsr_o
);

  // Special values without the sign bit: {exponent[7:0], fraction[22:0]}.
  // The sign is prepended where the constants are used.
  localparam INF  = 31'b1111111100000000000000000000000;
  localparam QNAN = 31'b1111111110000000000000000000000;
  localparam SNAN = 31'b1111111101111111111111111111111;

  // The rounding mode does not need to be pipelined:
  // it is decoded directly from the input port and used in stage #2.
  wire rm_nearest = (rmode_i==2'b00);
  wire rm_to_zero = (rmode_i==2'b01);
  wire rm_to_infp = (rmode_i==2'b10);
  wire rm_to_infm = (rmode_i==2'b11);

  /*
     Any stage's output is registered.
     Definitions:
       s??o_name - "S"tage number "??", "O"utput
       s??t_name - "S"tage number "??", "T"emporary (internally)
  */

  /* Stage #1: common align */

  wire        s1t_sign;
  wire [34:0] s1t_fract35;
  wire        s1t_inv;
  wire        s1t_inf;
  wire        s1t_snan;
  wire        s1t_qnan;
  wire        s1t_anan_sign;
  wire  [4:0] s1t_shr;
  wire  [4:0] s1t_shl;

  // multiplexer for signums and flags
  // an exact-zero difference is negative only when rounding toward -inf
  wire s1t_add_sign = add_sub_0_i ? rm_to_infm : add_sign_i;

  assign {s1t_sign,s1t_inv,s1t_inf,s1t_snan,s1t_qnan,s1t_anan_sign} =
    ({6{add_rdy_i}} & {s1t_add_sign,add_inv_i,add_inf_i,add_snan_i,add_qnan_i,add_anan_sign_i}) |
    ({6{mul_rdy_i}} & {mul_sign_i,mul_inv_i,mul_inf_i,mul_snan_i,mul_qnan_i,mul_anan_sign_i}) |
    ({6{f2i_rdy_i}} & {f2i_sign_i,1'b0,1'b0,f2i_snan_i,1'b0,f2i_sign_i}) |
    ({6{i2f_rdy_i}} & {i2f_sign_i,1'b0,1'b0,1'b0,1'b0,1'b0});

  // multiplexer for fractionals
  // all sources are placed so that bits [2:0] sit below the kept fraction
  assign s1t_fract35 =
    ({35{add_rdy_i}} & {7'd0, add_fract28_i}) |
    ({35{mul_rdy_i}} & {7'd0, mul_fract28_i}) |
    ({35{f2i_rdy_i}} & {8'd0, f2i_int24_i, 3'd0}) |
    ({35{i2f_rdy_i}} & {i2f_fract32_i,3'd0});

  // overflow bit for add/mul
  wire s1t_addmul_carry = (add_rdy_i & add_fract28_i[27]) |
                          (mul_rdy_i & mul_fract28_i[27]);

  // multiplexer for shift values
  // narrower shift inputs are zero-extended to 5 bits
  wire [4:0] s1t_shr_t;
  assign {s1t_shr_t, s1t_shl} =
    ({10{add_rdy_i}} & {5'd0, add_shl_i}) |
    ({10{mul_rdy_i}} & {mul_shr_i, {4'd0,mul_shl_i}}) |
    ({10{f2i_rdy_i}} & {f2i_shr_i, {1'b0,f2i_shl_i}}) |
    ({10{i2f_rdy_i}} & {{1'b0,i2f_shr_i}, i2f_shl_i});

  // without an explicit right shift request the fraction is still moved
  // right by one position when the add/mul carry bit is set
  assign s1t_shr = (|s1t_shr_t) ? s1t_shr_t : {4'd0,s1t_addmul_carry};

  // align: a non-zero right shift takes priority over the left shift
  wire [34:0] s1t_fract35sh =
    (|s1t_shr) ? (s1t_fract35 >> s1t_shr) :
                 (s1t_fract35 << s1t_shl);

  // update sticky bit for right shift case.
  // maximum right shift value is :
  //    27 for mul/div
  //     8 for i2f
  // For a shift by N the round bit becomes original bit [N+2], so the
  // sticky bit collects everything below it, i.e. bits [N+1:0].
  reg s1r_sticky;
  always @(s1t_fract35 or s1t_shr) begin
    case (s1t_shr)
      5'd0   : s1r_sticky = |s1t_fract35[ 1:0];
      5'd1   : s1r_sticky = |s1t_fract35[ 2:0];
      5'd2   : s1r_sticky = |s1t_fract35[ 3:0];
      5'd3   : s1r_sticky = |s1t_fract35[ 4:0];
      5'd4   : s1r_sticky = |s1t_fract35[ 5:0];
      5'd5   : s1r_sticky = |s1t_fract35[ 6:0];
      5'd6   : s1r_sticky = |s1t_fract35[ 7:0];
      5'd7   : s1r_sticky = |s1t_fract35[ 8:0];
      5'd8   : s1r_sticky = |s1t_fract35[ 9:0];
      5'd9   : s1r_sticky = |s1t_fract35[10:0];
      5'd10  : s1r_sticky = |s1t_fract35[11:0];
      5'd11  : s1r_sticky = |s1t_fract35[12:0];
      5'd12  : s1r_sticky = |s1t_fract35[13:0];
      5'd13  : s1r_sticky = |s1t_fract35[14:0];
      5'd14  : s1r_sticky = |s1t_fract35[15:0];
      5'd15  : s1r_sticky = |s1t_fract35[16:0];
      5'd16  : s1r_sticky = |s1t_fract35[17:0];
      5'd17  : s1r_sticky = |s1t_fract35[18:0];
      5'd18  : s1r_sticky = |s1t_fract35[19:0];
      5'd19  : s1r_sticky = |s1t_fract35[20:0];
      5'd20  : s1r_sticky = |s1t_fract35[21:0];
      5'd21  : s1r_sticky = |s1t_fract35[22:0];
      5'd22  : s1r_sticky = |s1t_fract35[23:0];
      5'd23  : s1r_sticky = |s1t_fract35[24:0];
      5'd24  : s1r_sticky = |s1t_fract35[25:0];
      5'd25  : s1r_sticky = |s1t_fract35[26:0];
      default: s1r_sticky = |s1t_fract35[27:0];
    endcase
  end // always

  // update sticky bit for left shift case.
  // A left shift by two or more moves all of bits [1:0] to or above the
  // round position, so nothing is left for the sticky bit.
  reg s1l_sticky;
  always @(s1t_fract35 or s1t_shl) begin
    case (s1t_shl)
      5'd0   : s1l_sticky = |s1t_fract35[1:0];
      5'd1   : s1l_sticky =  s1t_fract35[0];
      default: s1l_sticky = 1'b0;
    endcase
  end // always

  wire s1t_sticky = (|s1t_shr) ? s1r_sticky : s1l_sticky;

  // two stage multiplexer for exponents
  // first stage: select the source unit
  // (add/sub has no right-shift exponent, its no-shift one is reused;
  //  f2i produces an integer and carries no exponent)
  wire [9:0] s1t_exp10shr;
  wire [9:0] s1t_exp10shl;
  wire [9:0] s1t_exp10sh0;
  assign {s1t_exp10shr, s1t_exp10shl, s1t_exp10sh0} =
    ({30{add_rdy_i}} & {add_exp10sh0_i, add_exp10shl_i, add_exp10sh0_i}) |
    ({30{mul_rdy_i}} & {mul_exp10shr_i, mul_exp10shl_i, mul_exp10sh0_i}) |
    ({30{f2i_rdy_i}} & {10'd0, 10'd0, 10'd0}) |
    ({30{i2f_rdy_i}} & {{2'd0,i2f_exp8shr_i},{2'd0,i2f_exp8shl_i},{2'd0,i2f_exp8sh0_i}});

  // second stage: select by the kind of shift performed;
  // the no-shift exponent is incremented when the add/mul carry is set
  wire [9:0] s1t_exp10 =
    (|s1t_shr_t)  ? s1t_exp10shr :
    (~(|s1t_shl)) ? (s1t_exp10sh0 + {9'd0,s1t_addmul_carry}) :
                    s1t_exp10shl;

  // output of align stage
  reg        s1o_sign;
  reg  [9:0] s1o_exp10;
  reg [31:0] s1o_fract32;
  reg  [1:0] s1o_rs;       // {round, sticky}
  reg        s1o_inv;
  reg        s1o_inf;
  reg        s1o_snan_i;
  reg        s1o_qnan_i;
  reg        s1o_anan_sign_i;
  reg        s1o_div_op, s1o_div_sign_rmnd, s1o_div_dbz;
  reg        s1o_f2i_ovf, s1o_f2i;
  // registering (data path only: no reset, no flush)
  always @(posedge clk) begin
    if(adv_i) begin
      s1o_sign    <= s1t_sign;
      s1o_exp10   <= s1t_exp10;
      s1o_fract32 <= s1t_fract35sh[34:3];
      s1o_rs      <= {s1t_fract35sh[2],s1t_sticky};
      // various flags:
      s1o_inv         <= s1t_inv;
      s1o_inf         <= s1t_inf;
      s1o_snan_i      <= s1t_snan;
      s1o_qnan_i      <= s1t_qnan;
      s1o_anan_sign_i <= s1t_anan_sign;
      // DIV specials
      s1o_div_op        <= mul_rdy_i & div_op_i;
      s1o_div_sign_rmnd <= div_sign_rmnd_i;
      s1o_div_dbz       <= div_dbz_i;
      // F2I specials
      s1o_f2i_ovf <= f2i_ovf_i;
      s1o_f2i     <= f2i_rdy_i;
    end // advance
  end // posedge clock

  // ready is special case: it is the only stage #1 register
  // that is cleared by reset and flush
  reg s1o_ready;
  always @(posedge clk `OR_ASYNC_RST) begin
    if (rst)
      s1o_ready <= 1'b0;
    else if(flush_i)
      s1o_ready <= 1'b0;
    else if(adv_i)
      s1o_ready <= (add_rdy_i | mul_rdy_i | f2i_rdy_i | i2f_rdy_i);
  end // posedge clock


  /* Stage #2: rounding */


  wire s2t_dbz  = s1o_div_dbz;

  // guard (LSB of the kept fraction), round and sticky bits
  wire s2t_g    = s1o_fract32[0];
  wire s2t_r    = s1o_rs[1];
  wire s2t_s    = s1o_rs[0];
  wire s2t_lost = s2t_r | s2t_s;

  // common rule: nearest rounds up above the half and, on an exact half,
  // only when the LSB is odd; directed modes round the magnitude up when
  // the sign matches the direction and any bit has been lost
  wire s2t_rnd_up = (rm_nearest & s2t_r & s2t_s) |
                    (rm_nearest & s2t_g & s2t_r & (~s2t_s)) |
                    (rm_to_infp & (~s1o_sign) & s2t_lost) |
                    (rm_to_infm &   s1o_sign  & s2t_lost);

  // IEEE compliance rounding for quotient
  wire s2t_div_rnd_up =
    (rm_nearest & s2t_r & s2t_s & (~s1o_div_sign_rmnd)) |
    ( ((rm_to_infp & (~s1o_sign)) | (rm_to_infm & s1o_sign)) &
      ((s2t_r & s2t_s) | ((~s2t_r) & s2t_s & (~s1o_div_sign_rmnd))) );
  wire s2t_div_rnd_dn = (~s2t_r) & s2t_s & s1o_div_sign_rmnd &
    ( (rm_to_infp &   s1o_sign)  |
      (rm_to_infm & (~s1o_sign)) |
       rm_to_zero );

  // set resulting direction of rounding
  //  a) normalized quotient is rounded by quotient related rules
  //  b) de-normalized quotient is rounded by common rules
  wire s2t_rnd_n_qtnt = s1o_div_op & s1o_fract32[23]; // normalized quotient
  wire s2t_set_rnd_up = s2t_rnd_n_qtnt ? s2t_div_rnd_up : s2t_rnd_up;
  wire s2t_set_rnd_dn = s2t_rnd_n_qtnt ? s2t_div_rnd_dn : 1'b0;

  // define value for rounding adder
  wire [31:0] s2t_rnd_v32 =
    s2t_set_rnd_up ? 32'd1        : // +1
    s2t_set_rnd_dn ? 32'hFFFFFFFF : // -1
                     32'd0;         // no rounding
  // rounded fractional
  wire [31:0] s2t_fract32_rnd = s1o_fract32 + s2t_rnd_v32;


  // floating point output
  // rounding may carry into bit 24: renormalize by one position
  wire s2t_f32_shr = s2t_fract32_rnd[24];
  // update exponent and fraction
  wire [9:0]  s2t_f32_exp10   = s1o_exp10 + {9'd0,s2t_f32_shr};
  wire [23:0] s2t_f32_fract24 = s2t_f32_shr ? s2t_fract32_rnd[24:1] :
                                              s2t_fract32_rnd[23:0];
   // denormalized or zero
  wire s2t_f32_fract24_dn = ~s2t_f32_fract24[23];


  // integer output (f2i)
  // The integer is built from the ROUNDED magnitude so that the conversion
  // follows rmode_i, consistently with the inexact flag that f2i raises
  // from the same round/sticky bits.
  wire s2t_i32_carry_rnd = s2t_fract32_rnd[31];
  // a positive magnitude that reaches bit 31 does not fit a signed int32
  wire s2t_i32_inv = ((~s1o_sign) & s2t_i32_carry_rnd) | s1o_f2i_ovf;
  // two's complement for negative number
  wire [31:0] s2t_i32_int32 = (s2t_fract32_rnd ^ {32{s1o_sign}}) + {31'd0,s1o_sign};
  // zero
  wire s2t_i32_int32_00 = (~s2t_i32_inv) & (~(|s2t_i32_int32));
  // int32 output: saturate to 0x7fffffff / 0x80000000 on invalid conversion
  wire [31:0] s2t_i32_opc;
  assign s2t_i32_opc =
    s2t_i32_inv ? (32'h7fffffff ^ {32{s1o_sign}}) : s2t_i32_int32;


   // Generate result and flags (priority ordered, first match wins)
  wire s2t_ine, s2t_ovf, s2t_inf, s2t_unf, s2t_zer;
  wire [31:0] s2t_opc;
  assign {s2t_opc,s2t_ine,s2t_ovf,s2t_inf,s2t_unf,s2t_zer} =
    // f2i
    s1o_f2i ?       //  ine  ovf  inf  unf              zer
      {s2t_i32_opc,s2t_lost,1'b0,1'b0,1'b0,s2t_i32_int32_00} :
    // qnan output
    (s1o_snan_i | s1o_qnan_i) ? // ine  ovf  inf  unf  zer
      {{s1o_anan_sign_i,QNAN},    1'b0,1'b0,1'b0,1'b0,1'b0} :
    // snan output
    s1o_inv ?        // ine  ovf  inf  unf  zer
      {{s1o_sign,SNAN},1'b0,1'b0,1'b0,1'b0,1'b0} :
    // overflow and infinity
    ((s2t_f32_exp10 > 10'd254) | s1o_inf | s2t_dbz) ? // ine                       ovf  inf  unf  zer
      {{s1o_sign,INF},((s2t_lost | (~s1o_inf)) & (~s2t_dbz)),((~s1o_inf) & (~s2t_dbz)),1'b1,1'b0,1'b0} :
    // denormalized or zero
    (s2t_f32_fract24_dn) ?                     // ine  ovf  inf
      {{s1o_sign,8'd0,s2t_f32_fract24[22:0]},s2t_lost,1'b0,1'b0,
                                // unf        zer
       (s2t_lost & s2t_f32_fract24_dn),~(|s2t_f32_fract24)} :
    // normal result                                          ine  ovf  inf  unf  zer
    {{s1o_sign,s2t_f32_exp10[7:0],s2t_f32_fract24[22:0]},s2t_lost,1'b0,1'b0,1'b0,1'b0};


  // Output Register
  // Reset and flush clear everything; otherwise the results are captured
  // while adv_i is high. FPCSR bits that are not assigned in the adv_i
  // branch are only ever written by reset/flush.
  always @(posedge clk `OR_ASYNC_RST) begin
    if (rst) begin
        // arithmetic results
      fpu_result_o      <= 32'd0;
      fpu_arith_valid_o <=  1'b0;
        // comparison specials
      fpu_cmp_flag_o  <= 1'b0;
      fpu_cmp_valid_o <= 1'b0;
        // exceptions
      fpcsr_o         <= {`OR1K_FPCSR_WIDTH{1'b0}};
    end
    else if(flush_i) begin
        // arithmetic results
      fpu_result_o      <= 32'd0;
      fpu_arith_valid_o <=  1'b0;
        // comparison specials
      fpu_cmp_flag_o  <= 1'b0;
      fpu_cmp_valid_o <= 1'b0;
        // exceptions
      fpcsr_o         <= {`OR1K_FPCSR_WIDTH{1'b0}};
    end
    else if(adv_i) begin
        // arithmetic results
      fpu_result_o      <= s2t_opc;
      fpu_arith_valid_o <= s1o_ready;
        // comparison specials (taken directly from the comparator inputs)
      fpu_cmp_flag_o  <= cmp_res_i;
      fpu_cmp_valid_o <= cmp_rdy_i;
        // exceptions
      fpcsr_o[`OR1K_FPCSR_OVF] <= s2t_ovf;
      fpcsr_o[`OR1K_FPCSR_UNF] <= s2t_unf;
      fpcsr_o[`OR1K_FPCSR_SNF] <= s1o_inv | (s1o_snan_i & s1o_f2i);
      fpcsr_o[`OR1K_FPCSR_QNF] <= s1o_qnan_i;
      fpcsr_o[`OR1K_FPCSR_ZF]  <= s2t_zer;
      fpcsr_o[`OR1K_FPCSR_IXF] <= s2t_ine;
      fpcsr_o[`OR1K_FPCSR_IVF] <= (s1o_inv | (s2t_i32_inv & s1o_f2i) | s1o_snan_i) |
                                  (cmp_inv_i & cmp_rdy_i);
      fpcsr_o[`OR1K_FPCSR_INF] <= s2t_inf |
                                  (cmp_inf_i & cmp_rdy_i);
      fpcsr_o[`OR1K_FPCSR_DZF] <= s2t_dbz;
    end
  end // posedge clock

endmodule // pfpu32_rnd
Compare with Previous | Blame | View Log
Browse

Tools

Subversion Repositories an-fpga-implementation-of-low-latency-noc-based-mpsoc

[/] [an-fpga-implementation-of-low-latency-noc-based-mpsoc/] [trunk/] [mpsoc/] [src_processor/] [mor1kx-5.0/] [rtl/] [verilog/] [pfpu32/] [pfpu32_rnd.v] - Rev 48