OpenCores
URL https://opencores.org/ocsvn/openarty/openarty/trunk

Subversion Repositories openarty

[/] [openarty/] [trunk/] [rtl/] [cpu/] [fastops.v] - Blame information for rev 3

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 3 dgisselq
///////////////////////////////////////////////////////////////////////////
2
//
3
// Filename:    fastops.v
4
//
5
// Project:     Zip CPU -- a small, lightweight, RISC CPU soft core
6
//
7
// Purpose:     This supports the instruction set reordering of operations
8
//              created by the second generation instruction set, as well as
9
//      the new operations of POPC (population count) and BREV (bit reversal).
10
//
11
//
12
// Creator:     Dan Gisselquist, Ph.D.
13
//              Gisselquist Technology, LLC
14
//
15
///////////////////////////////////////////////////////////////////////////
16
//
17
// Copyright (C) 2015-2016, Gisselquist Technology, LLC
18
//
19
// This program is free software (firmware): you can redistribute it and/or
20
// modify it under the terms of  the GNU General Public License as published
21
// by the Free Software Foundation, either version 3 of the License, or (at
22
// your option) any later version.
23
//
24
// This program is distributed in the hope that it will be useful, but WITHOUT
25
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
26
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27
// for more details.
28
//
29
// License:     GPL, v3, as defined and found on www.gnu.org,
30
//              http://www.gnu.org/licenses/gpl.html
31
//
32
//
33
///////////////////////////////////////////////////////////////////////////
34
//
35
module  fastops(i_clk,i_rst, i_ce, i_valid, i_op, i_a, i_b, o_c, o_f, o_valid,
36
                        o_illegal, o_busy);
37
        input           i_clk, i_rst, i_ce;
38
        input           [3:0]    i_op;
39
        input           [31:0]   i_a, i_b;
40
        input                   i_valid;
41
        output  reg     [31:0]   o_c;
42
        output  wire    [3:0]    o_f;
43
        output  wire            o_valid;
44
        output  wire            o_illegal;
45
        output  wire            o_busy;
46
 
47
        // Rotate-left logic
48
        wire    [63:0]   w_rol_tmp;
49
        assign  w_rol_tmp = { i_a, i_a } << i_b[4:0];
50
        reg     [31:0]   r_rol_result;
51
        always @(posedge i_clk)
52
                r_rol_result <= w_rol_tmp[63:32]; // Won't set flags
53
 
54
        // Shift register logic
55
        reg     [32:0]           r_lsr_result, r_asr_result, r_lsl_result;
56
        // Registered shift results, each 33 bits wide so that the last bit
        // shifted out travels with the value (it becomes the carry flag).
        // Any shift amount of 32 or more (i_b[31:5] nonzero) saturates: ASR
        // yields all sign bits, LSR and LSL yield zero.
        //
        // Both arms of the ASR conditional are explicitly signed.  If either
        // arm of a ?: is unsigned the whole expression is typed unsigned and
        // ">>>" is then evaluated as a logical (zero-filling) shift.
        always @(posedge i_clk)
57
        begin
58
                r_asr_result <= (|i_b[31:5])? $signed({(33){i_a[31]}})
59
                                : ( $signed({i_a, 1'b0 })>>> (i_b[4:0]) );// ASR
60
                r_lsr_result <= (|i_b[31:5])? 33'h00
61
                                : ( { i_a, 1'b0 } >> (i_b[4:0]) );// LSR
62
                r_lsl_result <= (|i_b[31:5])? 33'h00 : {1'b0, i_a } << i_b[4:0]; // LSL
63
        end
64
 
65
        // Bit reversal pre-logic
66
        wire    [31:0]   w_brev_result;
67
        reg     [31:0]   r_brev_result;
68
        genvar  k;
69
        generate
70
        for(k=0; k<32; k=k+1)
71
        begin : bit_reversal_cpuop
72
                assign w_brev_result[k] = i_b[31-k];
73
        end endgenerate
74
        always @(posedge i_clk)
75
                r_brev_result <= w_brev_result;
76
 
77
        // Popcount logic
78
        wire    [31:0]   w_popc_result;
79
        reg     [5:0]    r_popc_result;
80
        // Registered population count of i_b: every bit is zero-extended to
        // six bits and the 32 terms are summed (result range 0..32).
        // A nonblocking assignment is used because r_popc_result is read
        // (through w_popc_result) by another block clocked on the same edge;
        // a blocking assignment here would make that read order-dependent
        // in simulation.
        always @(posedge i_clk)
81
                r_popc_result <=
82
                 ({5'h0,i_b[ 0]}+{5'h0,i_b[ 1]}+{5'h0,i_b[ 2]}+{5'h0,i_b[ 3]})
83
                +({5'h0,i_b[ 4]}+{5'h0,i_b[ 5]}+{5'h0,i_b[ 6]}+{5'h0,i_b[ 7]})
84
                +({5'h0,i_b[ 8]}+{5'h0,i_b[ 9]}+{5'h0,i_b[10]}+{5'h0,i_b[11]})
85
                +({5'h0,i_b[12]}+{5'h0,i_b[13]}+{5'h0,i_b[14]}+{5'h0,i_b[15]})
86
                +({5'h0,i_b[16]}+{5'h0,i_b[17]}+{5'h0,i_b[18]}+{5'h0,i_b[19]})
87
                +({5'h0,i_b[20]}+{5'h0,i_b[21]}+{5'h0,i_b[22]}+{5'h0,i_b[23]})
88
                +({5'h0,i_b[24]}+{5'h0,i_b[25]}+{5'h0,i_b[26]}+{5'h0,i_b[27]})
89
                +({5'h0,i_b[28]}+{5'h0,i_b[29]}+{5'h0,i_b[30]}+{5'h0,i_b[31]});
90
        assign  w_popc_result = { 26'h00, r_popc_result };
91
 
92
        // Prelogic for our flags registers
93
        wire    z, n, v;
94
        reg     c, pre_sign, set_ovfl;
95
        // Latch, whenever i_ce is high, whether the incoming operation is
        // one for which the overflow flag may be raised: SUB/CMP with
        // operands of differing sign, ADD with operands of equal sign, or
        // either LSL or LSR.  Written with a nonblocking assignment, as is
        // appropriate for a clocked register.
        always @(posedge i_clk)
96
                if (i_ce) // 1 LUT
97
                        set_ovfl <=(((i_op==4'h0)&&(i_a[31] != i_b[31]))//SUB&CMP
98
                                ||((i_op==4'h2)&&(i_a[31] == i_b[31])) // ADD
99
                                ||(i_op == 4'h6) // LSL
100
                                ||(i_op == 4'h5)); // LSR
101
 
102
        reg     [31:0]   r_logical;
103
        // Registered AND/OR result, consumed by the 4'b00?1 arm of the ALU
        // case statement.  Both opcodes matched by that arm have i_op[0]
        // set, so selecting on i_op[0] could only ever produce the AND; the
        // two are told apart by i_op[1] instead.
        // NOTE(review): AND=4'b0001 / OR=4'b0011 is taken from the order of
        // the "BTST/And/Or" comment on the case arm -- confirm against the
        // instruction set definition.
        always @(posedge i_clk)
103
                r_logical <= (i_op[1]) ? (i_a | i_b) : (i_a & i_b);
105
 
106
        reg     [32:0]   r_sum, r_diff;
107
        reg     [31:0]   r_ldilo, r_bypass, r_xor;
108
        always @(posedge i_clk)
109
                r_sum <= i_a + i_b;                     // Add
110
        always @(posedge i_clk)
111
                r_diff <= {1'b0, i_a } - { 1'b0, i_b }; // SUB
112
        always @(posedge i_clk)
113
                r_xor    <= i_a ^ i_b;                  // XOR
114
        always @(posedge i_clk)
115
                r_ldilo  <= { i_a[31:16], i_b[15:0] };   // LDILO
116
        always @(posedge i_clk)
117
                r_bypass <= i_b;                        // LOD/MOV,ETC
118
 
119
        reg     mpyhi;
120
        wire    mpybusy;
121
 
122
        //
123
        // Multiply logic
124
        //
125
        reg     [63:0]   r_mpy_result;   // Our final goal
126
 
127
        // The three clock option
128
        reg     [31:0]   r_mpy_a_input, r_mpy_b_input;
129
        reg             r_mpy_signed;
130
        reg     [1:0]    mpypipe;
131
 
132
        wire    mpy;
133
        assign  mpy = (i_op[3:1] == 3'h5)||(i_op[3:0] == 4'h8); // MPYHU/MPYHS (4'b101x) or MPY (4'b1000)
134
 
135
        // First clock, latch in the inputs
136
        always @(posedge i_clk)
137
        begin
138
                if (i_op[0]) // i.e. if signed multiply
139
                begin
140
                        r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
141
                        r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
142
                end else begin
143
                        r_mpy_a_input <= i_a[31:0];
144
                        r_mpy_b_input <= i_b[31:0];
145
                end
146
                // The signed bit really only matters in the case of 64 bit
147
                // multiply.  We'll keep track of it, though, and pretend in
148
                // all other cases.
149
                r_mpy_signed  <= i_op[0];
150
 
151
                mpyhi  <= i_op[1];      // nonblocking, like every other register in this clocked block
152
        end
153
 
154
        // Second clock, do the multiplies, get the "partial products".  Here,
155
        // we break our input up into two halves, 
156
        //
157
        //   A  = (2^16 ah + al)
158
        //   B  = (2^16 bh + bl)
159
        //
160
        // and use these to compute partial products.
161
        //
162
        //   AB = 2^32 (ah*bh) + 2^16 (ah*bl + al*bh) + (al*bl)
163
        //
164
        // Since we're following the FOIL algorithm to get here,
165
        // we'll name these partial products according to FOIL.
166
        //
167
        // The trick is what happens if A or B is signed.  In
168
        // those cases, the real value of A will not be given by
169
        //      A = (2^16 ah + al)
170
        // but rather
171
        //      A = (2^16 ah[31^] + al) - 2^31
172
        //  (where we have flipped the sign bit of A) and so ...
173
        //
174
        // AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)
175
        //      = 2^32(ah*bh)
176
        //              +2^16 (ah*bl+al*bh)
177
        //              +(al*bl)
178
        //              - 2^31 (2^16 bh+bl + 2^16 ah+al)
179
        //              + 2^62
180
        //      = 2^32(ah*bh)
181
        //              +2^16 (ah*bl+al*bh)
182
        //              +(al*bl)
183
        //              + 2^31 (2^31 - (2^16 bh+bl + 2^16 ah+al))
184
        //
185
        reg     [31:0]   pp_f, pp_o, pp_i, pp_l; // F, O, I and L from FOIL
186
        reg     [32:0]   pp_s;
187
        always @(posedge i_clk)
188
        begin
189
                pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
190
                pp_o<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0];
191
                pp_i<=r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
192
                pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
193
                // And a special one for the sign
194
                if (r_mpy_signed)
195
                        pp_s <= 32'h8000_0000-( r_mpy_a_input[31:0]
196
                                                + r_mpy_b_input[31:0]);
197
                else
198
                        pp_s <= 33'h0;
199
        end
200
 
201
        // Third clock, add the results and produce a product
202
        //              r_mpy_result[63:16] <=
203
        //                      { 32'h00, pp_l[31:16] }
204
        //                      + { 16'h00, pp_o }
205
        //                      + { 16'h00, pp_i }
206
        //                      + { pp_s, 15'h00 }
207
        //                      + { pp_f, 16'h00 };
208
        //
209
        //              16'h00          16'h00          pp_l[31:16]     ppl[15:]
210
        //              16'h00          pp_o[31:16]     pp_o[15:0]      16'h00
211
        //              16'h00          pp_i[31:16]     pp_i[15:0]      16'h00
212
        //              pp_s[32:17]     pp_s[16:1]      pp_s[0],15'h0   16'h00
213
        //              pp_f[31:16]     pp_f[15:0]      16'h00          16'h00
214
        //
215
        //              16'h0           15'h0,lo[32]    lo[31:16]       lo[15:]
216
        //              15'h0,oi[32]    oi[31:16]       oi[15:0]        16'h00
217
        //              hi[31:16]       hi[15:0]        16'h00          16'h00
218
        //
219
        //
220
        reg     [32:0]   partial_mpy_oi, partial_mpy_lo;
221
        reg     [31:0]   partial_mpy_hi;
222
        always @(posedge i_clk)
223
                begin
224
                        partial_mpy_lo[30:0]<= pp_l[30:0];
225
                        partial_mpy_lo[32:31]<= pp_s[0]+pp_l[31];
226
                        partial_mpy_oi[32:0]<= pp_o + pp_i;
227
                        partial_mpy_hi[31:0]<= pp_s[32:1] + pp_f;
228
                end
229
        reg     partial_mpy_2cl, partial_mpy_2ch;
230
        reg     [31:0]   partial_mpy_2lo, partial_mpy_2hi;
231
        // Fourth clock -- Finish adding our partial results, 16 bits at a time.  Bits [15:0] of the product are partial_mpy_lo[15:0]; bits [31:16] are oi[15:0]+lo[31:16], whose carry (partial_mpy_2cl) has weight 2^32; bits [47:32] are oi[31:16]+hi[15:0]+lo[32], whose carry (partial_mpy_2ch) has weight 2^48; bits [63:48] are hi[31:16]+oi[32].  Both carries are folded in on the following clock.
232
        always @(posedge i_clk)
233
                begin
234
                        partial_mpy_2lo[15:0] <= partial_mpy_lo[15:0];
235
                        { partial_mpy_2cl, partial_mpy_2lo[31:16] }
236
                                <= partial_mpy_oi[15:0] + partial_mpy_lo[31:16];
237
                        { partial_mpy_2ch, partial_mpy_2hi[15:0] }
238
                                <= partial_mpy_oi[31:16] + partial_mpy_hi[15:0] + partial_mpy_lo[32];
239
                        partial_mpy_2hi[31:16] <= partial_mpy_hi[31:16] + partial_mpy_oi[32];
240
                end
241
        // Fifth clock -- deal with final carries
242
        always @(posedge i_clk)
243
                begin
244
                        r_mpy_result[31:0] <= partial_mpy_2lo[31:0];
245
                        r_mpy_result[63:32] <= partial_mpy_2hi+
246
                                { 14'h0,partial_mpy_2ch,15'h0, partial_mpy_2cl};
247
                end
248
        // Fifth clock -- results are available for writeback.
249
 
250
        //
251
        // The master ALU case statement
252
        //
253
        reg     [3:0]    r_op;
254
        // Output selection.  On every clock the incoming opcode is copied
        // into r_op, and the case statement below selects on the value r_op
        // held *before* this edge, i.e. the opcode presented one clock
        // earlier.  The r_* operands chosen from are themselves registers
        // loaded from i_a/i_b on every clock.
        //
        // The carry c defaults to zero; the arms that assign {c,o_c} or
        // {o_c,c} override that default, since the last nonblocking
        // assignment to a register within a block is the one that takes
        // effect.  pre_sign records i_a[31] and is used in the overflow flag.
        always @(posedge i_clk)
255
        begin
256
                r_op <= i_op;
257
                pre_sign <= (i_a[31]);
258
                c <= 1'b0;
259
                casez(r_op)
260
                4'b0000:{c,o_c } <= r_diff;             // CMP/SUB
261
                4'b00?1:   o_c   <= r_logical;          // BTST/And/Or
262
                4'b0010:{c,o_c } <= r_sum;              // Add
263
                4'b0100:   o_c   <= r_xor;              // Xor
264
                4'b0101:{o_c,c } <= r_lsr_result;       // LSR
265
                4'b0110:{c,o_c } <= r_lsl_result;       // LSL
266
                4'b0111:{o_c,c } <= r_asr_result;       // ASR
267
                4'b1000:   o_c   <= r_mpy_result[31:0]; // MPY
268
                4'b1001:   o_c   <= r_ldilo;            // LODILO
269
                4'b1010:   o_c   <= r_mpy_result[63:32]; // MPYHU
270
                4'b1011:   o_c   <= r_mpy_result[63:32]; // MPYHS
271
                4'b1100:   o_c   <= r_brev_result;      // BREV
272
                4'b1101:   o_c   <= w_popc_result;      // POPC
273
                4'b1110:   o_c   <= r_rol_result;       // ROL
274
                default:   o_c   <= r_bypass;           // MOV, LDI
275
                endcase
276
        end
277
 
278
        // With the multiply implemented (as above), there are no illegal
279
        // results.
280
        assign o_illegal = 1'b0;
281
 
282
        assign  z = (o_c == 32'h0000); // This really costs us a clock ...
283
        assign  n = (o_c[31]);
284
        assign  v = (set_ovfl)&&(pre_sign != o_c[31]);
285
 
286
        assign  o_f = { v, n, c, z };
287
 
288
        reg     [2:0]    alu_pipe;
289
        // Three-bit completion tracker, cleared while i_rst is high:
        //   alu_pipe[0] -- set when i_ce is high and mpy is high
        //   alu_pipe[1] -- set when i_ce is high and mpy is low, or one
        //                  clock after alu_pipe[0] was set
        //   alu_pipe[2] -- alu_pipe[1] delayed by one clock
        // Note that "&" binds tighter than "|" in the middle term.
        always @(posedge i_clk)
289
                if (i_rst)
290
                        alu_pipe <= 3'h0;
291
                else
292
                        alu_pipe <= { alu_pipe[1], (i_ce)&(~mpy)|alu_pipe[0],
293
                                (i_ce)&(mpy) };
295
        //
296
        // A longer pipeline would look like:
297
        //
298
        // alu_pipe <= { alu_pipe[2:1], (i_ce)&(~mpy)|alu_pipe[1], alu_pipe[0],
299
        //                      (i_ce)&mpy;
300
        // o_busy <= (|alu_pipe[1:0])
301
 
302
        assign  o_valid = alu_pipe[2];
303
        assign  o_busy  = alu_pipe[0];
304
endmodule

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.