URL
https://opencores.org/ocsvn/lxp32/lxp32/trunk
Subversion Repositories lxp32
Compare Revisions
- This comparison shows the changes necessary to convert path /lxp32/trunk/rtl from Rev 6 to Rev 9
- ↔ Reverse comparison
Rev 6 → Rev 9
/lxp32_alu.vhd
1,250 → 1,250
---------------------------------------------------------------------
-- Arithmetic logic unit
--
-- Part of the LXP32 CPU
--
-- Copyright (c) 2016 by Alex I. Kuznetsov
--
-- Performs arithmetic and logic operations.
---------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- ALU interface. A command is accepted when valid_i is high together
-- with one of the cmd_*_i strobes; results are reported on result_o
-- qualified by we_o, comparison flags on the cmp_*_o outputs.
entity lxp32_alu is
    generic(
        -- true: instantiate lxp32_divider; false: division requests
        -- complete immediately with an all-zero result
        DIVIDER_EN: boolean;
        -- Multiplier implementation: "dsp", "opt" or "seq"
        MUL_ARCH: string
    );
    port(
        clk_i: in std_logic;
        -- Synchronous reset: clears the busy flag and is forwarded to
        -- the multiplier, divider and shifter
        rst_i: in std_logic;

        -- Qualifies the cmd_*_i strobes below
        valid_i: in std_logic;

        -- Forwarded to the divider (signed_i) and shifter (sig_i)
        cmd_signed_i: in std_logic;
        -- Select the adder result
        cmd_addsub_i: in std_logic;
        -- Start a multiplication
        cmd_mul_i: in std_logic;
        -- Start a division
        cmd_div_i: in std_logic;
        -- Forwarded to the divider's rem_i input
        cmd_div_mod_i: in std_logic;
        -- Latch the comparison flags
        cmd_cmp_i: in std_logic;
        -- Invert op2 and add 1 in the adder (i.e. compute op1 - op2)
        cmd_negate_op2_i: in std_logic;
        -- Enable the AND term / XOR term of the logic unit; asserting
        -- both yields OR
        cmd_and_i: in std_logic;
        cmd_xor_i: in std_logic;
        -- Start a shift; cmd_shift_right_i is forwarded to the shifter
        cmd_shift_i: in std_logic;
        cmd_shift_right_i: in std_logic;

        op1_i: in std_logic_vector(31 downto 0);
        op2_i: in std_logic_vector(31 downto 0);

        -- OR of all unit results, each masked by its write enable
        result_o: out std_logic_vector(31 downto 0);

        -- Registered comparison flags: equal, unsigned greater,
        -- signed greater (op1 relative to op2)
        cmp_eq_o: out std_logic;
        cmp_ug_o: out std_logic;
        cmp_sg_o: out std_logic;

        -- High while any unit presents a result on result_o
        we_o: out std_logic;
        -- Registered flag: set when a shift, multiply or divide is
        -- started, cleared by reset or when a result is written
        busy_o: out std_logic
    );
end entity;

-- RTL implementation of the ALU: a combinational adder and logic unit,
-- a registered comparator, and multi-cycle multiplier, divider and
-- shifter sub-units whose results are merged by an AND-OR multiplexer.
architecture rtl of lxp32_alu is

    -- Adder operands and 33-bit sum (bit 32 is the carry out)
    signal addend1: unsigned(31 downto 0);
    signal addend2: unsigned(31 downto 0);
    signal adder_result: unsigned(32 downto 0);
    signal adder_we: std_logic;

    -- Registered comparator state: equality, adder carry and operand signs
    signal cmp_eq: std_logic;
    signal cmp_carry: std_logic;
    signal cmp_s1: std_logic;
    signal cmp_s2: std_logic;

    signal logic_result: std_logic_vector(31 downto 0);
    signal logic_we: std_logic;

    -- For each sub-unit: result, start strobe (ce) and result strobe (we)
    signal mul_result: std_logic_vector(31 downto 0);
    signal mul_ce: std_logic;
    signal mul_we: std_logic;

    signal div_result: std_logic_vector(31 downto 0);
    signal div_ce: std_logic;
    signal div_we: std_logic;

    signal shift_result: std_logic_vector(31 downto 0);
    signal shift_ce: std_logic;
    signal shift_we: std_logic;

    signal result_mux: std_logic_vector(31 downto 0);
    signal result_we: std_logic;

    signal busy: std_logic:='0';

begin

    -- MUL_ARCH must name one of the three supported multiplier variants
    assert MUL_ARCH="dsp" or MUL_ARCH="seq" or MUL_ARCH="opt"
        report "Invalid MUL_ARCH generic value: dsp, opt or seq expected"
        severity failure;

-- Add/subtract

    addend1<=unsigned(op1_i);

    -- Conditionally invert every bit of op2 (first half of a two's
    -- complement negation)
    addend2_gen: for i in addend2'range generate
        addend2(i)<=op2_i(i) xor cmd_negate_op2_i;
    end generate;

    -- 33-bit sum; cmd_negate_op2_i doubles as the carry-in that
    -- completes the negation of op2
    adder_result<=("0"&addend1)+("0"&addend2)+(to_unsigned(0,adder_result'length-1)&cmd_negate_op2_i);
    adder_we<=cmd_addsub_i and valid_i;

-- Comparator (needs cmd_negate_op2_i to work correctly)

    -- Latches equality, the adder carry out and both operand sign bits
    -- whenever a compare command is accepted
    process (clk_i) is
    begin
        if rising_edge(clk_i) then
            if valid_i='1' and cmd_cmp_i='1' then
                if op1_i=op2_i then
                    cmp_eq<='1';
                else
                    cmp_eq<='0';
                end if;

                cmp_carry<=adder_result(adder_result'high);
                cmp_s1<=op1_i(op1_i'high);
                cmp_s2<=op2_i(op2_i'high);
            end if;
        end if;
    end process;

    cmp_eq_o<=cmp_eq;
    -- With op2 negated, the carry out means op1 >= op2 (unsigned);
    -- excluding equality gives strictly greater
    cmp_ug_o<=cmp_carry and not cmp_eq;
    -- Signed greater: for equal signs the carry decides; op1
    -- non-negative with op2 negative is always greater
    cmp_sg_o<=((cmp_s1 and cmp_s2 and cmp_carry) or
        (not cmp_s1 and not cmp_s2 and cmp_carry) or
        (not cmp_s1 and cmp_s2)) and not cmp_eq;

-- Bitwise operations (and, or, xor)
-- Note: (a or b) = (a and b) or (a xor b)

    logic_result_gen: for i in logic_result'range generate
        logic_result(i)<=((op1_i(i) and op2_i(i)) and cmd_and_i) or
            ((op1_i(i) xor op2_i(i)) and cmd_xor_i);
    end generate;

    logic_we<=(cmd_and_i or cmd_xor_i) and valid_i;

-- Multiplier

    mul_ce<=cmd_mul_i and valid_i;

    -- Exactly one of the three generate blocks below is elaborated,
    -- depending on MUL_ARCH; all share the same port interface

    gen_mul_dsp: if MUL_ARCH="dsp" generate
        mul_inst: entity work.lxp32_mul_dsp(rtl)
            port map(
                clk_i=>clk_i,
                rst_i=>rst_i,
                ce_i=>mul_ce,
                op1_i=>op1_i,
                op2_i=>op2_i,
                ce_o=>mul_we,
                result_o=>mul_result
            );
    end generate;

    gen_mul_opt: if MUL_ARCH="opt" generate
        mul_inst: entity work.lxp32_mul_opt(rtl)
            port map(
                clk_i=>clk_i,
                rst_i=>rst_i,
                ce_i=>mul_ce,
                op1_i=>op1_i,
                op2_i=>op2_i,
                ce_o=>mul_we,
                result_o=>mul_result
            );
    end generate;

    gen_mul_seq: if MUL_ARCH="seq" generate
        mul_inst: entity work.lxp32_mul_seq(rtl)
            port map(
                clk_i=>clk_i,
                rst_i=>rst_i,
                ce_i=>mul_ce,
                op1_i=>op1_i,
                op2_i=>op2_i,
                ce_o=>mul_we,
                result_o=>mul_result
            );
    end generate;

-- Divider

    div_ce<=cmd_div_i and valid_i;

    gen_divider: if DIVIDER_EN generate
        divider_inst: entity work.lxp32_divider(rtl)
            port map(
                clk_i=>clk_i,
                rst_i=>rst_i,
                ce_i=>div_ce,
                op1_i=>op1_i,
                op2_i=>op2_i,
                signed_i=>cmd_signed_i,
                rem_i=>cmd_div_mod_i,
                ce_o=>div_we,
                result_o=>div_result
            );
    end generate;

    -- Without a divider, a division request completes in the same
    -- cycle and produces zero
    gen_no_divider: if not DIVIDER_EN generate
        div_we<=div_ce;
        div_result<=(others=>'0');
    end generate;

-- Shifter

    shift_ce<=cmd_shift_i and valid_i;

    -- Only the five low bits of op2 are used as the shift amount
    shifter_inst: entity work.lxp32_shifter(rtl)
        port map(
            clk_i=>clk_i,
            rst_i=>rst_i,
            ce_i=>shift_ce,
            d_i=>op1_i,
            s_i=>op2_i(4 downto 0),
            right_i=>cmd_shift_right_i,
            sig_i=>cmd_signed_i,
            ce_o=>shift_we,
            d_o=>shift_result
        );

-- Result multiplexer

    -- AND-OR multiplexer: each unit's result is masked by its write
    -- enable and the masked values are OR-ed together
    result_mux_gen: for i in result_mux'range generate
        result_mux(i)<=(adder_result(i) and adder_we) or
            (logic_result(i) and logic_we) or
            (mul_result(i) and mul_we) or
            (div_result(i) and div_we) or
            (shift_result(i) and shift_we);
    end generate;

    result_o<=result_mux;

    result_we<=adder_we or logic_we or mul_we or div_we or shift_we;
    we_o<=result_we;

-- Pipeline control

    -- busy is set when a shift, multiply or divide is started and
    -- cleared by reset or by any result strobe; clearing has priority
    process (clk_i) is
    begin
        if rising_edge(clk_i) then
            if rst_i='1' or result_we='1' then
                busy<='0';
            elsif shift_ce='1' or mul_ce='1' or div_ce='1' then
                busy<='1';
            end if;
        end if;
    end process;

    busy_o<=busy;

end architecture;
--------------------------------------------------------------------- |
-- Arithmetic logic unit |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Performs arithmetic and logic operations. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- ALU interface. A command is accepted when valid_i is high together
-- with one of the cmd_*_i strobes; results are reported on result_o
-- qualified by we_o, comparison flags on the cmp_*_o outputs.
entity lxp32_alu is
generic(
-- true: instantiate lxp32_divider; false: division requests complete
-- immediately with an all-zero result
DIVIDER_EN: boolean;
-- Multiplier implementation: "dsp", "opt" or "seq"
MUL_ARCH: string
);
port(
clk_i: in std_logic;
-- Synchronous reset: clears the busy flag and is forwarded to the
-- multiplier, divider and shifter
rst_i: in std_logic;

-- Qualifies the cmd_*_i strobes below
valid_i: in std_logic;

-- Forwarded to the divider (signed_i) and shifter (sig_i)
cmd_signed_i: in std_logic;
-- Select the adder result
cmd_addsub_i: in std_logic;
-- Start a multiplication / division
cmd_mul_i: in std_logic;
cmd_div_i: in std_logic;
-- Forwarded to the divider's rem_i input
cmd_div_mod_i: in std_logic;
-- Latch the comparison flags
cmd_cmp_i: in std_logic;
-- Invert op2 and add 1 in the adder (i.e. compute op1 - op2)
cmd_negate_op2_i: in std_logic;
-- Enable the AND term / XOR term of the logic unit; both yields OR
cmd_and_i: in std_logic;
cmd_xor_i: in std_logic;
-- Start a shift; cmd_shift_right_i is forwarded to the shifter
cmd_shift_i: in std_logic;
cmd_shift_right_i: in std_logic;

op1_i: in std_logic_vector(31 downto 0);
op2_i: in std_logic_vector(31 downto 0);

-- OR of all unit results, each masked by its write enable
result_o: out std_logic_vector(31 downto 0);

-- Registered comparison flags: equal, unsigned greater, signed greater
-- (op1 relative to op2)
cmp_eq_o: out std_logic;
cmp_ug_o: out std_logic;
cmp_sg_o: out std_logic;

-- High while any unit presents a result on result_o
we_o: out std_logic;
-- Registered flag: set when a shift, multiply or divide is started,
-- cleared by reset or when a result is written
busy_o: out std_logic
);
end entity;
|
-- RTL implementation of the ALU: a combinational adder and logic unit,
-- a registered comparator, and multiplier, divider and shifter
-- sub-units whose results are merged by an AND-OR multiplexer.
architecture rtl of lxp32_alu is

-- Adder operands and 33-bit sum (bit 32 is the carry out)
signal addend1: unsigned(31 downto 0);
signal addend2: unsigned(31 downto 0);
signal adder_result: unsigned(32 downto 0);
signal adder_we: std_logic;

-- Registered comparator state: equality, adder carry and operand signs
signal cmp_eq: std_logic;
signal cmp_carry: std_logic;
signal cmp_s1: std_logic;
signal cmp_s2: std_logic;

signal logic_result: std_logic_vector(31 downto 0);
signal logic_we: std_logic;

-- For each sub-unit: result, start strobe (ce) and result strobe (we)
signal mul_result: std_logic_vector(31 downto 0);
signal mul_ce: std_logic;
signal mul_we: std_logic;

signal div_result: std_logic_vector(31 downto 0);
signal div_ce: std_logic;
signal div_we: std_logic;

signal shift_result: std_logic_vector(31 downto 0);
signal shift_ce: std_logic;
signal shift_we: std_logic;

signal result_mux: std_logic_vector(31 downto 0);
signal result_we: std_logic;

signal busy: std_logic:='0';

begin

-- MUL_ARCH must name one of the three supported multiplier variants
assert MUL_ARCH="dsp" or MUL_ARCH="seq" or MUL_ARCH="opt"
report "Invalid MUL_ARCH generic value: dsp, opt or seq expected"
severity failure;

-- Add/subtract

addend1<=unsigned(op1_i);

-- Conditionally invert every bit of op2 (first half of a two's
-- complement negation)
addend2_gen: for i in addend2'range generate
addend2(i)<=op2_i(i) xor cmd_negate_op2_i;
end generate;

-- 33-bit sum; cmd_negate_op2_i doubles as the carry-in that completes
-- the negation of op2
adder_result<=("0"&addend1)+("0"&addend2)+(to_unsigned(0,adder_result'length-1)&cmd_negate_op2_i);
adder_we<=cmd_addsub_i and valid_i;

-- Comparator (needs cmd_negate_op2_i to work correctly)

-- Latches equality, the adder carry out and both operand sign bits
-- whenever a compare command is accepted
process (clk_i) is
begin
if rising_edge(clk_i) then
if valid_i='1' and cmd_cmp_i='1' then
if op1_i=op2_i then
cmp_eq<='1';
else
cmp_eq<='0';
end if;

cmp_carry<=adder_result(adder_result'high);
cmp_s1<=op1_i(op1_i'high);
cmp_s2<=op2_i(op2_i'high);
end if;
end if;
end process;

cmp_eq_o<=cmp_eq;
-- With op2 negated, the carry out means op1 >= op2 (unsigned);
-- excluding equality gives strictly greater
cmp_ug_o<=cmp_carry and not cmp_eq;
-- Signed greater: for equal signs the carry decides; op1 non-negative
-- with op2 negative is always greater
cmp_sg_o<=((cmp_s1 and cmp_s2 and cmp_carry) or
(not cmp_s1 and not cmp_s2 and cmp_carry) or
(not cmp_s1 and cmp_s2)) and not cmp_eq;

-- Bitwise operations (and, or, xor)
-- Note: (a or b) = (a and b) or (a xor b)

logic_result_gen: for i in logic_result'range generate
logic_result(i)<=((op1_i(i) and op2_i(i)) and cmd_and_i) or
((op1_i(i) xor op2_i(i)) and cmd_xor_i);
end generate;

logic_we<=(cmd_and_i or cmd_xor_i) and valid_i;

-- Multiplier

mul_ce<=cmd_mul_i and valid_i;

-- Exactly one of the three generate blocks below is elaborated,
-- depending on MUL_ARCH; all share the same port interface

gen_mul_dsp: if MUL_ARCH="dsp" generate
mul_inst: entity work.lxp32_mul_dsp(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,
ce_i=>mul_ce,
op1_i=>op1_i,
op2_i=>op2_i,
ce_o=>mul_we,
result_o=>mul_result
);
end generate;

gen_mul_opt: if MUL_ARCH="opt" generate
mul_inst: entity work.lxp32_mul_opt(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,
ce_i=>mul_ce,
op1_i=>op1_i,
op2_i=>op2_i,
ce_o=>mul_we,
result_o=>mul_result
);
end generate;

gen_mul_seq: if MUL_ARCH="seq" generate
mul_inst: entity work.lxp32_mul_seq(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,
ce_i=>mul_ce,
op1_i=>op1_i,
op2_i=>op2_i,
ce_o=>mul_we,
result_o=>mul_result
);
end generate;

-- Divider

div_ce<=cmd_div_i and valid_i;

gen_divider: if DIVIDER_EN generate
divider_inst: entity work.lxp32_divider(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,
ce_i=>div_ce,
op1_i=>op1_i,
op2_i=>op2_i,
signed_i=>cmd_signed_i,
rem_i=>cmd_div_mod_i,
ce_o=>div_we,
result_o=>div_result
);
end generate;

-- Without a divider, a division request completes in the same cycle
-- and produces zero
gen_no_divider: if not DIVIDER_EN generate
div_we<=div_ce;
div_result<=(others=>'0');
end generate;

-- Shifter

shift_ce<=cmd_shift_i and valid_i;

-- Only the five low bits of op2 are used as the shift amount
shifter_inst: entity work.lxp32_shifter(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,
ce_i=>shift_ce,
d_i=>op1_i,
s_i=>op2_i(4 downto 0),
right_i=>cmd_shift_right_i,
sig_i=>cmd_signed_i,
ce_o=>shift_we,
d_o=>shift_result
);

-- Result multiplexer

-- AND-OR multiplexer: each unit's result is masked by its write enable
-- and the masked values are OR-ed together
result_mux_gen: for i in result_mux'range generate
result_mux(i)<=(adder_result(i) and adder_we) or
(logic_result(i) and logic_we) or
(mul_result(i) and mul_we) or
(div_result(i) and div_we) or
(shift_result(i) and shift_we);
end generate;

result_o<=result_mux;

result_we<=adder_we or logic_we or mul_we or div_we or shift_we;
we_o<=result_we;

-- Pipeline control

-- busy is set when a shift, multiply or divide is started and cleared
-- by reset or by any result strobe; clearing has priority
process (clk_i) is
begin
if rising_edge(clk_i) then
if rst_i='1' or result_we='1' then
busy<='0';
elsif shift_ce='1' or mul_ce='1' or div_ce='1' then
busy<='1';
end if;
end if;
end process;

busy_o<=busy;

end architecture;
/lxp32_compl.vhd
1,50 → 1,50
---------------------------------------------------------------------
-- Complementor
--
-- Part of the LXP32 CPU
--
-- Copyright (c) 2016 by Alex I. Kuznetsov
--
-- Computes a 2's complement of its input. Used as an auxiliary
-- unit in the divider.
---------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- d_o follows d_i with one clock cycle of latency. When compl_i is
-- high the value is negated (two's complement); when low it is passed
-- through unchanged.
entity lxp32_compl is
    port(
        clk_i: in std_logic;
        compl_i: in std_logic;
        d_i: in std_logic_vector(31 downto 0);
        d_o: out std_logic_vector(31 downto 0)
    );
end entity;

-- The 32-bit increment is split into two 16-bit halves: the low half
-- is added before the register, the high half after it, using the
-- registered carry out of the low half.
architecture rtl of lxp32_compl is

    -- d_i with every bit conditionally inverted
    signal d_prepared: unsigned(d_i'range);
    -- Registered low-half sum; bit 16 is the carry into the high half
    signal sum_low: unsigned(16 downto 0);
    -- Registered (still un-incremented) high half
    signal d_high: unsigned(15 downto 0);
    signal sum_high: unsigned(15 downto 0);

begin

    d_prepared_gen: for i in d_prepared'range generate
        d_prepared(i)<=d_i(i) xor compl_i;
    end generate;

    process (clk_i) is
    begin
        if rising_edge(clk_i) then
            -- compl_i is the "+1" that completes the negation
            sum_low<=("0"&d_prepared(15 downto 0))+(to_unsigned(0,16)&compl_i);
            d_high<=d_prepared(31 downto 16);
        end if;
    end process;

    -- Propagate the low-half carry into the high half (combinational)
    sum_high<=d_high+(to_unsigned(0,15)&sum_low(sum_low'high));

    d_o<=std_logic_vector(sum_high&sum_low(15 downto 0));

end architecture;
--------------------------------------------------------------------- |
-- Complementor |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Computes a 2's complement of its input. Used as an auxiliary |
-- unit in the divider. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- Conditional two's complementor with one clock cycle of latency:
-- d_o = -d_i when compl_i is high, d_o = d_i otherwise.
entity lxp32_compl is
    port(
        clk_i: in std_logic;
        compl_i: in std_logic;
        d_i: in std_logic_vector(31 downto 0);
        d_o: out std_logic_vector(31 downto 0)
    );
end entity;

-- The increment is performed in two 16-bit steps placed on either
-- side of a register stage.
architecture rtl of lxp32_compl is

    -- Input after the optional bitwise inversion
    signal inverted: unsigned(d_i'range);
    -- Registered low half plus carry-in; the MSB is the carry out
    signal low_sum: unsigned(16 downto 0);
    -- Registered upper half, before the carry is applied
    signal high_half: unsigned(15 downto 0);
    -- Upper half after absorbing the carry from the lower half
    signal high_sum: unsigned(15 downto 0);

begin

    -- Flip every input bit when a complement is requested
    invert: process (d_i, compl_i) is
    begin
        for bit_idx in d_i'range loop
            inverted(bit_idx)<=d_i(bit_idx) xor compl_i;
        end loop;
    end process;

    -- Register stage: add the "+1" to the low half, hold the high half
    stage: process (clk_i) is
        variable carry_in: unsigned(low_sum'range);
    begin
        if rising_edge(clk_i) then
            carry_in:=(others=>'0');
            carry_in(0):=compl_i;
            low_sum<=resize(inverted(15 downto 0),low_sum'length)+carry_in;
            high_half<=inverted(31 downto 16);
        end if;
    end process;

    -- Ripple the registered carry into the upper half
    high_sum<=high_half+resize(low_sum(low_sum'high downto low_sum'high),high_half'length);

    d_o(31 downto 16)<=std_logic_vector(high_sum);
    d_o(15 downto 0)<=std_logic_vector(low_sum(15 downto 0));

end architecture;
/lxp32_cpu.vhd
1,256 → 1,256
---------------------------------------------------------------------
-- LXP32 CPU Core
--
-- Part of the LXP32 CPU
--
-- Copyright (c) 2016 by Alex I. Kuznetsov
---------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;

-- Top level of the CPU core: wires together the fetch, decode and
-- execute stages, the scratchpad and the interrupt multiplexer.
entity lxp32_cpu is
    generic(
        -- DBUS_RMW, DIVIDER_EN and MUL_ARCH are passed unchanged to
        -- the execute stage; START_ADDR is passed to the fetch stage
        DBUS_RMW: boolean;
        DIVIDER_EN: boolean;
        MUL_ARCH: string;
        START_ADDR: std_logic_vector(31 downto 0)
    );
    port(
        -- Clock and reset, distributed to all stages (the scratchpad
        -- receives the clock only)
        clk_i: in std_logic;
        rst_i: in std_logic;

        -- Instruction-side interface, connected to the fetch stage
        lli_re_o: out std_logic;
        lli_adr_o: out std_logic_vector(29 downto 0);
        lli_dat_i: in std_logic_vector(31 downto 0);
        lli_busy_i: in std_logic;

        -- Data bus, connected to the execute stage
        dbus_cyc_o: out std_logic;
        dbus_stb_o: out std_logic;
        dbus_we_o: out std_logic;
        dbus_sel_o: out std_logic_vector(3 downto 0);
        dbus_ack_i: in std_logic;
        dbus_adr_o: out std_logic_vector(31 downto 2);
        dbus_dat_o: out std_logic_vector(31 downto 0);
        dbus_dat_i: in std_logic_vector(31 downto 0);

        -- Interrupt request lines, connected to the interrupt multiplexer
        irq_i: in std_logic_vector(7 downto 0)
    );
end entity;

-- Purely structural architecture: instantiates the pipeline stages
-- and connects them with the signals declared below.
architecture rtl of lxp32_cpu is

    -- Fetch stage outputs
    signal fetch_word: std_logic_vector(31 downto 0);
    signal fetch_next_ip: std_logic_vector(29 downto 0);
    signal fetch_current_ip: std_logic_vector(29 downto 0);
    signal fetch_valid: std_logic;
    signal fetch_jump_ready: std_logic;

    -- Decode stage handshake (ready goes back to fetch, valid forward
    -- to execute)
    signal decode_ready: std_logic;
    signal decode_valid: std_logic;

    -- Decoded command strobes, decode -> execute
    signal decode_cmd_loadop3: std_logic;
    signal decode_cmd_signed: std_logic;
    signal decode_cmd_dbus: std_logic;
    signal decode_cmd_dbus_store: std_logic;
    signal decode_cmd_dbus_byte: std_logic;
    signal decode_cmd_addsub: std_logic;
    signal decode_cmd_mul: std_logic;
    signal decode_cmd_div: std_logic;
    signal decode_cmd_div_mod: std_logic;
    signal decode_cmd_cmp: std_logic;
    signal decode_cmd_jump: std_logic;
    signal decode_cmd_negate_op2: std_logic;
    signal decode_cmd_and: std_logic;
    signal decode_cmd_xor: std_logic;
    signal decode_cmd_shift: std_logic;
    signal decode_cmd_shift_right: std_logic;

    signal decode_jump_type: std_logic_vector(3 downto 0);

    -- Operands and destination register, decode -> execute
    signal decode_op1: std_logic_vector(31 downto 0);
    signal decode_op2: std_logic_vector(31 downto 0);
    signal decode_op3: std_logic_vector(31 downto 0);
    signal decode_dst: std_logic_vector(7 downto 0);

    -- Execute stage handshake and jump request (to fetch and decode)
    signal execute_ready: std_logic;
    signal execute_jump_valid: std_logic;
    signal execute_jump_dst: std_logic_vector(29 downto 0);

    -- Scratchpad ports: two read ports driven by decode, one write
    -- port driven by execute
    signal sp_raddr1: std_logic_vector(7 downto 0);
    signal sp_rdata1: std_logic_vector(31 downto 0);
    signal sp_raddr2: std_logic_vector(7 downto 0);
    signal sp_rdata2: std_logic_vector(31 downto 0);
    signal sp_waddr: std_logic_vector(7 downto 0);
    signal sp_we: std_logic;
    signal sp_wdata: std_logic_vector(31 downto 0);

    -- Interrupt multiplexer <-> decode/execute
    signal interrupt_valid: std_logic;
    signal interrupt_vector: std_logic_vector(2 downto 0);
    signal interrupt_ready: std_logic;
    signal interrupt_return: std_logic;

begin

    -- Fetch stage: owns the instruction-side (lli_*) interface and
    -- accepts jump requests from the execute stage
    fetch_inst: entity work.lxp32_fetch(rtl)
        generic map(
            START_ADDR=>START_ADDR
        )
        port map(
            clk_i=>clk_i,
            rst_i=>rst_i,

            lli_re_o=>lli_re_o,
            lli_adr_o=>lli_adr_o,
            lli_dat_i=>lli_dat_i,
            lli_busy_i=>lli_busy_i,

            word_o=>fetch_word,
            next_ip_o=>fetch_next_ip,
            current_ip_o=>fetch_current_ip,
            valid_o=>fetch_valid,
            ready_i=>decode_ready,

            jump_valid_i=>execute_jump_valid,
            jump_dst_i=>execute_jump_dst,
            jump_ready_o=>fetch_jump_ready
        );

    -- Decode stage: consumes fetched words, reads the scratchpad and
    -- produces command strobes and operands for the execute stage
    decode_inst: entity work.lxp32_decode(rtl)
        port map(
            clk_i=>clk_i,
            rst_i=>rst_i,

            word_i=>fetch_word,
            next_ip_i=>fetch_next_ip,
            current_ip_i=>fetch_current_ip,
            valid_i=>fetch_valid,
            jump_valid_i=>execute_jump_valid,
            ready_o=>decode_ready,

            interrupt_valid_i=>interrupt_valid,
            interrupt_vector_i=>interrupt_vector,
            interrupt_ready_o=>interrupt_ready,

            sp_raddr1_o=>sp_raddr1,
            sp_rdata1_i=>sp_rdata1,
            sp_raddr2_o=>sp_raddr2,
            sp_rdata2_i=>sp_rdata2,

            ready_i=>execute_ready,
            valid_o=>decode_valid,

            cmd_loadop3_o=>decode_cmd_loadop3,
            cmd_signed_o=>decode_cmd_signed,
            cmd_dbus_o=>decode_cmd_dbus,
            cmd_dbus_store_o=>decode_cmd_dbus_store,
            cmd_dbus_byte_o=>decode_cmd_dbus_byte,
            cmd_addsub_o=>decode_cmd_addsub,
            cmd_mul_o=>decode_cmd_mul,
            cmd_div_o=>decode_cmd_div,
            cmd_div_mod_o=>decode_cmd_div_mod,
            cmd_cmp_o=>decode_cmd_cmp,
            cmd_jump_o=>decode_cmd_jump,
            cmd_negate_op2_o=>decode_cmd_negate_op2,
            cmd_and_o=>decode_cmd_and,
            cmd_xor_o=>decode_cmd_xor,
            cmd_shift_o=>decode_cmd_shift,
            cmd_shift_right_o=>decode_cmd_shift_right,

            jump_type_o=>decode_jump_type,

            op1_o=>decode_op1,
            op2_o=>decode_op2,
            op3_o=>decode_op3,
            dst_o=>decode_dst
        );

    -- Execute stage: owns the data bus (dbus_*) interface, writes the
    -- scratchpad and issues jump requests to the fetch stage
    execute_inst: entity work.lxp32_execute(rtl)
        generic map(
            DBUS_RMW=>DBUS_RMW,
            DIVIDER_EN=>DIVIDER_EN,
            MUL_ARCH=>MUL_ARCH
        )
        port map(
            clk_i=>clk_i,
            rst_i=>rst_i,

            cmd_loadop3_i=>decode_cmd_loadop3,
            cmd_signed_i=>decode_cmd_signed,
            cmd_dbus_i=>decode_cmd_dbus,
            cmd_dbus_store_i=>decode_cmd_dbus_store,
            cmd_dbus_byte_i=>decode_cmd_dbus_byte,
            cmd_addsub_i=>decode_cmd_addsub,
            cmd_mul_i=>decode_cmd_mul,
            cmd_div_i=>decode_cmd_div,
            cmd_div_mod_i=>decode_cmd_div_mod,
            cmd_cmp_i=>decode_cmd_cmp,
            cmd_jump_i=>decode_cmd_jump,
            cmd_negate_op2_i=>decode_cmd_negate_op2,
            cmd_and_i=>decode_cmd_and,
            cmd_xor_i=>decode_cmd_xor,
            cmd_shift_i=>decode_cmd_shift,
            cmd_shift_right_i=>decode_cmd_shift_right,

            jump_type_i=>decode_jump_type,

            op1_i=>decode_op1,
            op2_i=>decode_op2,
            op3_i=>decode_op3,
            dst_i=>decode_dst,

            sp_waddr_o=>sp_waddr,
            sp_we_o=>sp_we,
            sp_wdata_o=>sp_wdata,

            valid_i=>decode_valid,
            ready_o=>execute_ready,

            dbus_cyc_o=>dbus_cyc_o,
            dbus_stb_o=>dbus_stb_o,
            dbus_we_o=>dbus_we_o,
            dbus_sel_o=>dbus_sel_o,
            dbus_ack_i=>dbus_ack_i,
            dbus_adr_o=>dbus_adr_o,
            dbus_dat_o=>dbus_dat_o,
            dbus_dat_i=>dbus_dat_i,

            jump_valid_o=>execute_jump_valid,
            jump_dst_o=>execute_jump_dst,
            jump_ready_i=>fetch_jump_ready,

            interrupt_return_o=>interrupt_return
        );

    -- Scratchpad: two read ports and one write port; no reset input
    scratchpad_inst: entity work.lxp32_scratchpad(rtl)
        port map(
            clk_i=>clk_i,

            raddr1_i=>sp_raddr1,
            rdata1_o=>sp_rdata1,
            raddr2_i=>sp_raddr2,
            rdata2_o=>sp_rdata2,

            waddr_i=>sp_waddr,
            we_i=>sp_we,
            wdata_i=>sp_wdata
        );

    -- Interrupt multiplexer: turns irq_i into a vector for the decode
    -- stage; it also observes the scratchpad write port
    interrupt_mux_inst: entity work.lxp32_interrupt_mux(rtl)
        port map(
            clk_i=>clk_i,
            rst_i=>rst_i,

            irq_i=>irq_i,

            interrupt_valid_o=>interrupt_valid,
            interrupt_vector_o=>interrupt_vector,
            interrupt_ready_i=>interrupt_ready,
            interrupt_return_i=>interrupt_return,

            sp_waddr_i=>sp_waddr,
            sp_we_i=>sp_we,
            sp_wdata_i=>sp_wdata
        );

end architecture;
--------------------------------------------------------------------- |
-- LXP32 CPU Core |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
-- Top level of the CPU core: wires together the fetch, decode and
-- execute stages, the scratchpad and the interrupt multiplexer.
entity lxp32_cpu is
generic(
-- DBUS_RMW, DIVIDER_EN and MUL_ARCH are passed unchanged to the
-- execute stage; START_ADDR is passed to the fetch stage
DBUS_RMW: boolean;
DIVIDER_EN: boolean;
MUL_ARCH: string;
START_ADDR: std_logic_vector(31 downto 0)
);
port(
-- Clock and reset, distributed to all stages (the scratchpad receives
-- the clock only)
clk_i: in std_logic;
rst_i: in std_logic;

-- Instruction-side interface, connected to the fetch stage
lli_re_o: out std_logic;
lli_adr_o: out std_logic_vector(29 downto 0);
lli_dat_i: in std_logic_vector(31 downto 0);
lli_busy_i: in std_logic;

-- Data bus, connected to the execute stage
dbus_cyc_o: out std_logic;
dbus_stb_o: out std_logic;
dbus_we_o: out std_logic;
dbus_sel_o: out std_logic_vector(3 downto 0);
dbus_ack_i: in std_logic;
dbus_adr_o: out std_logic_vector(31 downto 2);
dbus_dat_o: out std_logic_vector(31 downto 0);
dbus_dat_i: in std_logic_vector(31 downto 0);

-- Interrupt request lines, connected to the interrupt multiplexer
irq_i: in std_logic_vector(7 downto 0)
);
end entity;
|
-- Purely structural architecture: instantiates the pipeline stages and
-- connects them with the signals declared below.
architecture rtl of lxp32_cpu is

-- Fetch stage outputs
signal fetch_word: std_logic_vector(31 downto 0);
signal fetch_next_ip: std_logic_vector(29 downto 0);
signal fetch_current_ip: std_logic_vector(29 downto 0);
signal fetch_valid: std_logic;
signal fetch_jump_ready: std_logic;

-- Decode stage handshake (ready goes back to fetch, valid forward to
-- execute)
signal decode_ready: std_logic;
signal decode_valid: std_logic;

-- Decoded command strobes, decode -> execute
signal decode_cmd_loadop3: std_logic;
signal decode_cmd_signed: std_logic;
signal decode_cmd_dbus: std_logic;
signal decode_cmd_dbus_store: std_logic;
signal decode_cmd_dbus_byte: std_logic;
signal decode_cmd_addsub: std_logic;
signal decode_cmd_mul: std_logic;
signal decode_cmd_div: std_logic;
signal decode_cmd_div_mod: std_logic;
signal decode_cmd_cmp: std_logic;
signal decode_cmd_jump: std_logic;
signal decode_cmd_negate_op2: std_logic;
signal decode_cmd_and: std_logic;
signal decode_cmd_xor: std_logic;
signal decode_cmd_shift: std_logic;
signal decode_cmd_shift_right: std_logic;

signal decode_jump_type: std_logic_vector(3 downto 0);

-- Operands and destination register, decode -> execute
signal decode_op1: std_logic_vector(31 downto 0);
signal decode_op2: std_logic_vector(31 downto 0);
signal decode_op3: std_logic_vector(31 downto 0);
signal decode_dst: std_logic_vector(7 downto 0);

-- Execute stage handshake and jump request (to fetch and decode)
signal execute_ready: std_logic;
signal execute_jump_valid: std_logic;
signal execute_jump_dst: std_logic_vector(29 downto 0);

-- Scratchpad ports: two read ports driven by decode, one write port
-- driven by execute
signal sp_raddr1: std_logic_vector(7 downto 0);
signal sp_rdata1: std_logic_vector(31 downto 0);
signal sp_raddr2: std_logic_vector(7 downto 0);
signal sp_rdata2: std_logic_vector(31 downto 0);
signal sp_waddr: std_logic_vector(7 downto 0);
signal sp_we: std_logic;
signal sp_wdata: std_logic_vector(31 downto 0);

-- Interrupt multiplexer <-> decode/execute
signal interrupt_valid: std_logic;
signal interrupt_vector: std_logic_vector(2 downto 0);
signal interrupt_ready: std_logic;
signal interrupt_return: std_logic;

begin

-- Fetch stage: owns the instruction-side (lli_*) interface and accepts
-- jump requests from the execute stage
fetch_inst: entity work.lxp32_fetch(rtl)
generic map(
START_ADDR=>START_ADDR
)
port map(
clk_i=>clk_i,
rst_i=>rst_i,

lli_re_o=>lli_re_o,
lli_adr_o=>lli_adr_o,
lli_dat_i=>lli_dat_i,
lli_busy_i=>lli_busy_i,

word_o=>fetch_word,
next_ip_o=>fetch_next_ip,
current_ip_o=>fetch_current_ip,
valid_o=>fetch_valid,
ready_i=>decode_ready,

jump_valid_i=>execute_jump_valid,
jump_dst_i=>execute_jump_dst,
jump_ready_o=>fetch_jump_ready
);

-- Decode stage: consumes fetched words, reads the scratchpad and
-- produces command strobes and operands for the execute stage
decode_inst: entity work.lxp32_decode(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,

word_i=>fetch_word,
next_ip_i=>fetch_next_ip,
current_ip_i=>fetch_current_ip,
valid_i=>fetch_valid,
jump_valid_i=>execute_jump_valid,
ready_o=>decode_ready,

interrupt_valid_i=>interrupt_valid,
interrupt_vector_i=>interrupt_vector,
interrupt_ready_o=>interrupt_ready,

sp_raddr1_o=>sp_raddr1,
sp_rdata1_i=>sp_rdata1,
sp_raddr2_o=>sp_raddr2,
sp_rdata2_i=>sp_rdata2,

ready_i=>execute_ready,
valid_o=>decode_valid,

cmd_loadop3_o=>decode_cmd_loadop3,
cmd_signed_o=>decode_cmd_signed,
cmd_dbus_o=>decode_cmd_dbus,
cmd_dbus_store_o=>decode_cmd_dbus_store,
cmd_dbus_byte_o=>decode_cmd_dbus_byte,
cmd_addsub_o=>decode_cmd_addsub,
cmd_mul_o=>decode_cmd_mul,
cmd_div_o=>decode_cmd_div,
cmd_div_mod_o=>decode_cmd_div_mod,
cmd_cmp_o=>decode_cmd_cmp,
cmd_jump_o=>decode_cmd_jump,
cmd_negate_op2_o=>decode_cmd_negate_op2,
cmd_and_o=>decode_cmd_and,
cmd_xor_o=>decode_cmd_xor,
cmd_shift_o=>decode_cmd_shift,
cmd_shift_right_o=>decode_cmd_shift_right,

jump_type_o=>decode_jump_type,

op1_o=>decode_op1,
op2_o=>decode_op2,
op3_o=>decode_op3,
dst_o=>decode_dst
);

-- Execute stage: owns the data bus (dbus_*) interface, writes the
-- scratchpad and issues jump requests to the fetch stage
execute_inst: entity work.lxp32_execute(rtl)
generic map(
DBUS_RMW=>DBUS_RMW,
DIVIDER_EN=>DIVIDER_EN,
MUL_ARCH=>MUL_ARCH
)
port map(
clk_i=>clk_i,
rst_i=>rst_i,

cmd_loadop3_i=>decode_cmd_loadop3,
cmd_signed_i=>decode_cmd_signed,
cmd_dbus_i=>decode_cmd_dbus,
cmd_dbus_store_i=>decode_cmd_dbus_store,
cmd_dbus_byte_i=>decode_cmd_dbus_byte,
cmd_addsub_i=>decode_cmd_addsub,
cmd_mul_i=>decode_cmd_mul,
cmd_div_i=>decode_cmd_div,
cmd_div_mod_i=>decode_cmd_div_mod,
cmd_cmp_i=>decode_cmd_cmp,
cmd_jump_i=>decode_cmd_jump,
cmd_negate_op2_i=>decode_cmd_negate_op2,
cmd_and_i=>decode_cmd_and,
cmd_xor_i=>decode_cmd_xor,
cmd_shift_i=>decode_cmd_shift,
cmd_shift_right_i=>decode_cmd_shift_right,

jump_type_i=>decode_jump_type,

op1_i=>decode_op1,
op2_i=>decode_op2,
op3_i=>decode_op3,
dst_i=>decode_dst,

sp_waddr_o=>sp_waddr,
sp_we_o=>sp_we,
sp_wdata_o=>sp_wdata,

valid_i=>decode_valid,
ready_o=>execute_ready,

dbus_cyc_o=>dbus_cyc_o,
dbus_stb_o=>dbus_stb_o,
dbus_we_o=>dbus_we_o,
dbus_sel_o=>dbus_sel_o,
dbus_ack_i=>dbus_ack_i,
dbus_adr_o=>dbus_adr_o,
dbus_dat_o=>dbus_dat_o,
dbus_dat_i=>dbus_dat_i,

jump_valid_o=>execute_jump_valid,
jump_dst_o=>execute_jump_dst,
jump_ready_i=>fetch_jump_ready,

interrupt_return_o=>interrupt_return
);

-- Scratchpad: two read ports and one write port; no reset input
scratchpad_inst: entity work.lxp32_scratchpad(rtl)
port map(
clk_i=>clk_i,

raddr1_i=>sp_raddr1,
rdata1_o=>sp_rdata1,
raddr2_i=>sp_raddr2,
rdata2_o=>sp_rdata2,

waddr_i=>sp_waddr,
we_i=>sp_we,
wdata_i=>sp_wdata
);

-- Interrupt multiplexer: turns irq_i into a vector for the decode
-- stage; it also observes the scratchpad write port
interrupt_mux_inst: entity work.lxp32_interrupt_mux(rtl)
port map(
clk_i=>clk_i,
rst_i=>rst_i,

irq_i=>irq_i,

interrupt_valid_o=>interrupt_valid,
interrupt_vector_o=>interrupt_vector,
interrupt_ready_i=>interrupt_ready,
interrupt_return_i=>interrupt_return,

sp_waddr_i=>sp_waddr,
sp_we_i=>sp_we,
sp_wdata_i=>sp_wdata
);

end architecture;
/lxp32_dbus.vhd
1,171 → 1,171
---------------------------------------------------------------------
-- DBUS master
--
-- Part of the LXP32 CPU
--
-- Copyright (c) 2016 by Alex I. Kuznetsov
--
-- Manages data bus (DBUS) access.
---------------------------------------------------------------------

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- Data bus master. Accepts a load/store command (valid_i and
-- cmd_dbus_i both high) while idle, runs one bus cycle (two for a
-- read-modify-write byte store) and returns load data on rdata_o.
entity lxp32_dbus is
    generic(
        -- true: byte stores are performed as a word read followed by
        -- a word write and dbus_sel_o is held at "1111";
        -- false: byte stores are single writes using byte selects
        RMW: boolean
    );
    port(
        clk_i: in std_logic;
        -- Synchronous reset: ends any bus cycle in progress
        rst_i: in std_logic;

        -- Command inputs, sampled only while no bus cycle is active
        valid_i: in std_logic;

        cmd_dbus_i: in std_logic;
        -- '1' = store, '0' = load
        cmd_dbus_store_i: in std_logic;
        -- '1' = byte access, '0' = word access
        cmd_dbus_byte_i: in std_logic;
        -- Sign-extend (rather than zero-extend) loaded bytes
        cmd_signed_i: in std_logic;
        -- Byte address; bits 1..0 select the byte lane for byte accesses
        addr_i: in std_logic_vector(31 downto 0);
        wdata_i: in std_logic_vector(31 downto 0);

        -- Load result; we_o pulses for one cycle when a load completes
        rdata_o: out std_logic_vector(31 downto 0);
        we_o: out std_logic;
        -- High during a bus cycle and during the we_o pulse
        busy_o: out std_logic;

        -- Bus signals; cyc and stb are driven identically
        dbus_cyc_o: out std_logic;
        dbus_stb_o: out std_logic;
        dbus_we_o: out std_logic;
        dbus_sel_o: out std_logic_vector(3 downto 0);
        dbus_ack_i: in std_logic;
        dbus_adr_o: out std_logic_vector(31 downto 2);
        dbus_dat_o: out std_logic_vector(31 downto 0);
        dbus_dat_i: in std_logic_vector(31 downto 0)
    );
end entity;

-- Bus master state machine. strobe='0' is the idle state, in which a
-- new command may be latched; strobe='1' means a bus cycle is in
-- progress and the unit waits for dbus_ack_i.
architecture rtl of lxp32_dbus is

    -- Bus cycle active (drives both dbus_cyc_o and dbus_stb_o)
    signal strobe: std_logic:='0';
    -- One-cycle pulse reporting a completed load
    signal we_out: std_logic:='0';
    -- Latched command attributes for the cycle in progress
    signal we: std_logic;
    signal byte_mode: std_logic;
    signal sel: std_logic_vector(3 downto 0);
    signal sig: std_logic;
    -- Set during the read phase of a read-modify-write byte store
    signal rmw_mode: std_logic;

    -- Registered copy of dbus_dat_i and the byte lane picked by sel
    signal dbus_rdata: std_logic_vector(31 downto 0);
    signal selected_byte: std_logic_vector(7 downto 0);

begin

    process (clk_i) is
    begin
        if rising_edge(clk_i) then
            if rst_i='1' then
                -- Only the control flags need defined reset values;
                -- everything else is marked don't-care
                we_out<='0';
                strobe<='0';
                sig<='-';
                byte_mode<='-';
                sel<=(others=>'-');
                we<='-';
                rmw_mode<='-';
                dbus_adr_o<=(others=>'-');
                dbus_dat_o<=(others=>'-');
            else
                -- Default: we_out is a single-cycle pulse
                we_out<='0';
                if strobe='0' then
                    -- Idle: latch a new command and start a bus cycle
                    if valid_i='1' and cmd_dbus_i='1' then
                        strobe<='1';
                        sig<=cmd_signed_i;

                        dbus_adr_o<=addr_i(31 downto 2);

                        if cmd_dbus_byte_i='0' then
                            -- Word access: all four byte lanes
                            byte_mode<='0';
                            dbus_dat_o<=wdata_i;
                            sel<="1111";

                            -- synthesis translate_off
                            assert addr_i(1 downto 0)="00"
                                report "Misaligned word-granular access on data bus"
                                severity warning;
                            -- synthesis translate_on
                        else
                            -- Byte access: replicate the low byte onto
                            -- every lane and select one lane by address
                            byte_mode<='1';
                            dbus_dat_o<=wdata_i(7 downto 0)&wdata_i(7 downto 0)&
                                wdata_i(7 downto 0)&wdata_i(7 downto 0);

                            case addr_i(1 downto 0) is
                                when "00" => sel<="0001";
                                when "01" => sel<="0010";
                                when "10" => sel<="0100";
                                when "11" => sel<="1000";
                                when others =>
                            end case;
                        end if;

                        if not RMW then
                            we<=cmd_dbus_store_i;
                            rmw_mode<='0';
                        else
                            -- A byte store starts as a read (we='0')
                            -- with rmw_mode set; word stores write
                            -- directly
                            we<=cmd_dbus_store_i and not cmd_dbus_byte_i;
                            rmw_mode<=cmd_dbus_store_i and cmd_dbus_byte_i;
                        end if;
                    end if;
                else
                    -- Bus cycle in progress: wait for acknowledge
                    if dbus_ack_i='1' then
                        if rmw_mode='1' and we='0' and RMW then
                            -- Read phase of a read-modify-write done:
                            -- switch to writing and fill the
                            -- unselected lanes with the data just read
                            we<='1';
                            for i in sel'range loop
                                if sel(i)='0' then
                                    dbus_dat_o(i*8+7 downto i*8)<=
                                        dbus_dat_i(i*8+7 downto i*8);
                                end if;
                            end loop;
                        else
                            -- Cycle finished; report a result only
                            -- when it was a read
                            strobe<='0';
                            if we='0' then
                                we_out<='1';
                            end if;
                        end if;
                    end if;
                end if;
            end if;
        end if;
    end process;

    dbus_cyc_o<=strobe;
    dbus_stb_o<=strobe;
    dbus_we_o<=we;

    -- Without RMW the byte selects go out on the bus; with RMW every
    -- transfer is a full word
    sel_no_rmw_gen: if not RMW generate
        dbus_sel_o<=sel;
    end generate;

    sel_rmw_gen: if RMW generate
        dbus_sel_o<=(others=>'1');
    end generate;

    -- Read data is registered on every clock edge
    process (clk_i) is
    begin
        if rising_edge(clk_i) then
            dbus_rdata<=dbus_dat_i;
        end if;
    end process;

    -- AND-OR multiplexer picking the byte lane indicated by sel
    selected_byte_gen: for i in selected_byte'range generate
        selected_byte(i)<=(dbus_rdata(i) and sel(0)) or
            (dbus_rdata(i+8) and sel(1)) or
            (dbus_rdata(i+16) and sel(2)) or
            (dbus_rdata(i+24) and sel(3));
    end generate;

    -- Word loads return the registered word; byte loads are zero- or
    -- sign-extended according to the latched signedness
    rdata_o<=dbus_rdata when byte_mode='0' else
        X"000000"&selected_byte when selected_byte(selected_byte'high)='0' or sig='0' else
        X"FFFFFF"&selected_byte;

    we_o<=we_out;
    busy_o<=strobe or we_out;

end architecture;
--------------------------------------------------------------------- |
-- DBUS master |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Manages data bus (DBUS) access. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- Data bus master: converts load/store commands into transfers on the |
-- dbus_* ports and returns load results through rdata_o/we_o. |
entity lxp32_dbus is |
generic( |
-- When true, a byte store is performed as a full-word read followed by a |
-- full-word write with the new byte merged in, and dbus_sel_o is held at |
-- all ones. When false, byte accesses use a one-hot dbus_sel_o mask. |
RMW: boolean |
); |
port( |
clk_i: in std_logic; |
-- Synchronous reset, sampled on the rising edge of clk_i. |
rst_i: in std_logic; |
|
-- A transfer starts when valid_i and cmd_dbus_i are both '1' and no |
-- transfer is currently in progress. |
valid_i: in std_logic; |
|
cmd_dbus_i: in std_logic; |
-- '1' requests a store, '0' a load. |
cmd_dbus_store_i: in std_logic; |
-- '1' requests a byte access, '0' a word access. |
cmd_dbus_byte_i: in std_logic; |
-- For byte loads: '1' sign-extends the result, '0' zero-extends it. |
cmd_signed_i: in std_logic; |
-- Byte address: bits 31..2 are sent to the bus, bits 1..0 choose the |
-- byte lane for byte accesses. |
addr_i: in std_logic_vector(31 downto 0); |
-- Store data; byte stores use bits 7..0 only (replicated on all lanes). |
wdata_i: in std_logic_vector(31 downto 0); |
|
-- Load result and its write strobe; we_o is asserted for one clock |
-- after a load has been acknowledged. |
rdata_o: out std_logic_vector(31 downto 0); |
we_o: out std_logic; |
-- High while a transfer is active or we_o is asserted. |
busy_o: out std_logic; |
|
-- Bus master interface. dbus_cyc_o and dbus_stb_o are always equal. |
-- NOTE(review): signal names follow Wishbone conventions -- confirm |
-- against the bus specification used by the project. |
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_dbus is |
|
-- strobe: a bus transfer is in progress (drives CYC and STB). |
-- we_out: one-clock pulse telling the caller that rdata_o holds a load result. |
signal strobe: std_logic:='0'; |
signal we_out: std_logic:='0'; |
-- Registered attributes of the transfer in progress. |
signal we: std_logic; |
signal byte_mode: std_logic; |
signal sel: std_logic_vector(3 downto 0); |
signal sig: std_logic; |
signal rmw_mode: std_logic; |
|
signal dbus_rdata: std_logic_vector(31 downto 0); |
signal selected_byte: std_logic_vector(7 downto 0); |
|
begin |
|
-- Transfer control. Only strobe and we_out get defined reset values; the |
-- remaining registers are assigned don't-care ('-') on reset. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
we_out<='0'; |
strobe<='0'; |
sig<='-'; |
byte_mode<='-'; |
sel<=(others=>'-'); |
we<='-'; |
rmw_mode<='-'; |
dbus_adr_o<=(others=>'-'); |
dbus_dat_o<=(others=>'-'); |
else |
-- Default: we_out is a single-cycle pulse unless set again below. |
we_out<='0'; |
if strobe='0' then |
-- Idle: accept a new request and latch its parameters. |
if valid_i='1' and cmd_dbus_i='1' then |
strobe<='1'; |
sig<=cmd_signed_i; |
|
dbus_adr_o<=addr_i(31 downto 2); |
|
if cmd_dbus_byte_i='0' then |
-- Word access: all four byte lanes are used. |
byte_mode<='0'; |
dbus_dat_o<=wdata_i; |
sel<="1111"; |
|
-- synthesis translate_off |
assert addr_i(1 downto 0)="00" |
report "Misaligned word-granular access on data bus" |
severity warning; |
-- synthesis translate_on |
else |
-- Byte access: replicate the byte on every lane and pick the |
-- lane from the two low address bits. |
byte_mode<='1'; |
dbus_dat_o<=wdata_i(7 downto 0)&wdata_i(7 downto 0)& |
wdata_i(7 downto 0)&wdata_i(7 downto 0); |
|
case addr_i(1 downto 0) is |
when "00" => sel<="0001"; |
when "01" => sel<="0010"; |
when "10" => sel<="0100"; |
when "11" => sel<="1000"; |
-- Non-binary address values leave sel unchanged. |
when others => |
end case; |
end if; |
|
if not RMW then |
we<=cmd_dbus_store_i; |
rmw_mode<='0'; |
else |
-- In RMW mode a byte store begins as a read (we='0'); the |
-- write phase is started when that read is acknowledged. |
we<=cmd_dbus_store_i and not cmd_dbus_byte_i; |
rmw_mode<=cmd_dbus_store_i and cmd_dbus_byte_i; |
end if; |
end if; |
else |
if dbus_ack_i='1' then |
if rmw_mode='1' and we='0' and RMW then |
-- Read phase of a read-modify-write finished: switch to the |
-- write phase, keeping strobe asserted, and fill the lanes |
-- that are not being written with the data just read. |
we<='1'; |
for i in sel'range loop |
if sel(i)='0' then |
dbus_dat_o(i*8+7 downto i*8)<= |
dbus_dat_i(i*8+7 downto i*8); |
end if; |
end loop; |
else |
-- Transfer complete; only reads produce a result write. |
strobe<='0'; |
if we='0' then |
we_out<='1'; |
end if; |
end if; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Cycle and strobe are driven from the same flag. |
dbus_cyc_o<=strobe; |
dbus_stb_o<=strobe; |
dbus_we_o<=we; |
|
-- Without RMW the registered byte-lane mask is forwarded to the bus. |
sel_no_rmw_gen: if not RMW generate |
dbus_sel_o<=sel; |
end generate; |
|
-- With RMW every transfer selects all four byte lanes. |
sel_rmw_gen: if RMW generate |
dbus_sel_o<=(others=>'1'); |
end generate; |
|
-- Capture the incoming bus data word on every rising clock edge. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
dbus_rdata<=dbus_dat_i; |
end if; |
end process; |
|
-- Byte-lane multiplexer: the byte of dbus_rdata whose sel bit is set is |
-- routed to selected_byte (AND-OR form). |
selected_byte_gen: for i in selected_byte'range generate |
selected_byte(i)<=(dbus_rdata(i) and sel(0)) or |
(dbus_rdata(i+8) and sel(1)) or |
(dbus_rdata(i+16) and sel(2)) or |
(dbus_rdata(i+24) and sel(3)); |
end generate; |
|
-- Word accesses return the captured word; byte accesses return the |
-- selected byte, sign-extended only when sig='1' and its top bit is '1'. |
rdata_o<=dbus_rdata when byte_mode='0' else |
X"000000"&selected_byte when selected_byte(selected_byte'high)='0' or sig='0' else |
X"FFFFFF"&selected_byte; |
|
we_o<=we_out; |
busy_o<=strobe or we_out; |
|
end architecture; |
/lxp32_decode.vhd
1,327 → 1,327
--------------------------------------------------------------------- |
-- Instruction decoder |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The second stage of the LXP32 pipeline. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- Instruction decoder: receives instruction words and produces registered |
-- command flags and operands for the next pipeline stage. |
entity lxp32_decode is |
port( |
clk_i: in std_logic; |
-- Synchronous reset, sampled on the rising edge of clk_i. |
rst_i: in std_logic; |
|
-- Upstream interface: instruction word with its handshake. |
-- ready_o is low while the decoder or the downstream stage is busy. |
word_i: in std_logic_vector(31 downto 0); |
next_ip_i: in std_logic_vector(29 downto 0); |
current_ip_i: in std_logic_vector(29 downto 0); |
valid_i: in std_logic; |
-- When high, the pending output is invalidated and the FSM returns to |
-- its Regular state. |
jump_valid_i: in std_logic; |
ready_o: out std_logic; |
|
-- Interrupt request interface; interrupt_ready_o pulses for one clock |
-- when a request is accepted. |
interrupt_valid_i: in std_logic; |
interrupt_vector_i: in std_logic_vector(2 downto 0); |
interrupt_ready_o: out std_logic; |
|
-- Two read ports (address out, data in) used to fetch register operands. |
sp_raddr1_o: out std_logic_vector(7 downto 0); |
sp_rdata1_i: in std_logic_vector(31 downto 0); |
sp_raddr2_o: out std_logic_vector(7 downto 0); |
sp_rdata2_i: in std_logic_vector(31 downto 0); |
|
-- Downstream handshake. |
ready_i: in std_logic; |
valid_o: out std_logic; |
|
-- Decoded command flags. |
cmd_loadop3_o: out std_logic; |
cmd_signed_o: out std_logic; |
cmd_dbus_o: out std_logic; |
cmd_dbus_store_o: out std_logic; |
cmd_dbus_byte_o: out std_logic; |
cmd_addsub_o: out std_logic; |
cmd_mul_o: out std_logic; |
cmd_div_o: out std_logic; |
cmd_div_mod_o: out std_logic; |
cmd_cmp_o: out std_logic; |
cmd_jump_o: out std_logic; |
cmd_negate_op2_o: out std_logic; |
cmd_and_o: out std_logic; |
cmd_xor_o: out std_logic; |
cmd_shift_o: out std_logic; |
cmd_shift_right_o: out std_logic; |
|
-- Low four opcode bits of the decoded instruction. |
jump_type_o: out std_logic_vector(3 downto 0); |
|
-- Operands and destination register number. |
op1_o: out std_logic_vector(31 downto 0); |
op2_o: out std_logic_vector(31 downto 0); |
op3_o: out std_logic_vector(31 downto 0); |
dst_o: out std_logic_vector(7 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_decode is |
|
-- Decoder FSM state |
-- Regular: decode one instruction word per accepted cycle |
-- ContinueLc: take the next word as a 32-bit constant for op3 |
-- ContinueCjmp: issue the jump that follows a compare |
-- ContinueInterrupt: interrupt jump issued, waiting for jump_valid_i |
-- Halt: wait for an interrupt request |
|
type DecoderState is (Regular,ContinueLc,ContinueCjmp,ContinueInterrupt,Halt); |
signal state: DecoderState:=Regular; |
|
-- Input instruction portions |
|
signal opcode: std_logic_vector(5 downto 0); |
signal t1: std_logic; |
signal t2: std_logic; |
signal destination: std_logic_vector(7 downto 0); |
signal rd1: std_logic_vector(7 downto 0); |
signal rd2: std_logic_vector(7 downto 0); |
|
-- Signals related to pipeline control |
|
signal downstream_busy: std_logic; |
signal self_busy: std_logic:='0'; |
signal busy: std_logic; |
signal valid_out: std_logic:='0'; |
|
signal dst_out: std_logic_vector(7 downto 0); |
|
-- Signals related to RD operand decoding |
|
signal rd1_reg: std_logic_vector(7 downto 0); |
signal rd2_reg: std_logic_vector(7 downto 0); |
|
signal rd1_select: std_logic; |
signal rd1_direct: std_logic_vector(31 downto 0); |
signal rd2_select: std_logic; |
signal rd2_direct: std_logic_vector(31 downto 0); |
|
-- Signals related to interrupt handling |
|
signal interrupt_ready: std_logic:='0'; |
|
begin |
|
-- Dissect input word |
-- Layout: opcode[31:26] t1[25] t2[24] destination[23:16] rd1[15:8] rd2[7:0] |
|
opcode<=word_i(31 downto 26); |
t1<=word_i(25); |
t2<=word_i(24); |
destination<=word_i(23 downto 16); |
rd1<=word_i(15 downto 8); |
rd2<=word_i(7 downto 0); |
|
-- Pipeline control |
-- The downstream stage is busy when it has not accepted the current output. |
|
downstream_busy<=valid_out and not ready_i; |
busy<=downstream_busy or self_busy; |
|
-- Decoder FSM and output registers. Only the handshake/state registers |
-- get defined reset values; the rest are assigned don't-care ('-'). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
valid_out<='0'; |
self_busy<='0'; |
state<=Regular; |
interrupt_ready<='0'; |
cmd_loadop3_o<='-'; |
cmd_signed_o<='-'; |
cmd_dbus_o<='-'; |
cmd_dbus_store_o<='-'; |
cmd_dbus_byte_o<='-'; |
cmd_addsub_o<='-'; |
cmd_negate_op2_o<='-'; |
cmd_mul_o<='-'; |
cmd_div_o<='-'; |
cmd_div_mod_o<='-'; |
cmd_cmp_o<='-'; |
cmd_jump_o<='-'; |
cmd_and_o<='-'; |
cmd_xor_o<='-'; |
cmd_shift_o<='-'; |
cmd_shift_right_o<='-'; |
rd1_select<='-'; |
rd1_direct<=(others=>'-'); |
rd2_select<='-'; |
rd2_direct<=(others=>'-'); |
op3_o<=(others=>'-'); |
jump_type_o<=(others=>'-'); |
dst_out<=(others=>'-'); |
else |
-- Default: interrupt_ready is a single-cycle pulse. |
interrupt_ready<='0'; |
if jump_valid_i='1' then |
-- A jump discards the pending output and restarts decoding. |
valid_out<='0'; |
self_busy<='0'; |
state<=Regular; |
elsif downstream_busy='0' then |
-- Outputs are only updated when the downstream stage can take them. |
op3_o<=(others=>'-'); |
-- Immediate (direct) operands are the sign-extended 8-bit fields. |
rd1_direct<=std_logic_vector(resize(signed(rd1),rd1_direct'length)); |
rd2_direct<=std_logic_vector(resize(signed(rd2),rd2_direct'length)); |
|
-- Modifier flags taken directly from opcode bits. |
cmd_signed_o<=opcode(0); |
cmd_div_mod_o<=opcode(1); |
cmd_shift_right_o<=opcode(1); |
cmd_dbus_byte_o<=opcode(1); |
cmd_dbus_store_o<=opcode(2); |
|
case state is |
when Regular => |
-- Clear all command flags first; the matching ones are set |
-- again by later assignments in this branch. |
cmd_loadop3_o<='0'; |
cmd_dbus_o<='0'; |
cmd_addsub_o<='0'; |
cmd_negate_op2_o<='0'; |
cmd_mul_o<='0'; |
cmd_div_o<='0'; |
cmd_cmp_o<='0'; |
cmd_jump_o<='0'; |
cmd_and_o<='0'; |
cmd_xor_o<='0'; |
cmd_shift_o<='0'; |
|
jump_type_o<=opcode(3 downto 0); |
|
if interrupt_valid_i='1' and valid_i='1' then |
-- Accept the interrupt instead of the current instruction: |
-- issue a jump whose target is read through register port 1 |
-- (see sp_raddr1_o below) and store the return address. |
cmd_jump_o<='1'; |
cmd_loadop3_o<='1'; |
op3_o<=current_ip_i&"01"; -- LSB indicates interrupt return |
dst_out<=X"FD"; -- interrupt return pointer |
rd1_select<='1'; |
rd2_select<='0'; |
valid_out<='1'; |
interrupt_ready<='1'; |
self_busy<='1'; |
state<=ContinueInterrupt; |
else |
if opcode(5 downto 3)="101" or opcode="000001" then -- lc or lcs |
cmd_loadop3_o<='1'; |
-- Setting op3_o here only affects the lcs instruction |
op3_o<=std_logic_vector(resize(signed(opcode(2 downto 0)& |
t1&t2&rd1&rd2),op3_o'length)); |
end if; |
|
if opcode(5 downto 3)="001" then |
cmd_dbus_o<='1'; |
end if; |
|
if opcode(5 downto 1)="01000" then |
cmd_addsub_o<='1'; |
end if; |
|
cmd_negate_op2_o<=opcode(0); |
|
if opcode="010010" then |
cmd_mul_o<='1'; |
end if; |
|
if opcode(5 downto 2)="0101" then |
cmd_div_o<='1'; |
end if; |
|
if opcode(5 downto 3)="100" then -- jump or call |
cmd_jump_o<='1'; |
cmd_loadop3_o<=opcode(0); |
-- Setting op3_o here only affects the call instruction |
op3_o<=next_ip_i&"00"; |
end if; |
|
-- Note: (a or b) = (a and b) or (a xor b) |
|
if opcode(5 downto 1)="01100" then |
cmd_and_o<='1'; |
end if; |
|
if opcode="011010" or opcode="011001" then |
cmd_xor_o<='1'; |
end if; |
|
if opcode(5 downto 2)="0111" then |
cmd_shift_o<='1'; |
end if; |
|
-- Opcodes 11xxxx request a compare; this overrides the |
-- cmd_negate_op2_o value assigned from opcode(0) above. |
if opcode(5 downto 4)="11" then |
cmd_cmp_o<='1'; |
cmd_negate_op2_o<='1'; |
end if; |
|
-- t1/t2 choose between register and direct operand. |
rd1_select<=t1; |
rd2_select<=t2; |
|
dst_out<=destination; |
|
if valid_i='1' then |
if opcode="000001" then |
-- lc: nothing is issued until the constant word arrives. |
valid_out<='0'; |
self_busy<='0'; |
state<=ContinueLc; |
elsif opcode="000010" then |
-- Stop accepting instructions until an interrupt arrives. |
valid_out<='0'; |
self_busy<='1'; |
state<=Halt; |
elsif opcode(5 downto 4)="11" then |
-- Issue the compare now and the jump in the next step. |
valid_out<='1'; |
self_busy<='1'; |
state<=ContinueCjmp; |
else |
valid_out<='1'; |
end if; |
else |
valid_out<='0'; |
end if; |
end if; |
when ContinueLc => |
-- The next valid word is passed through as op3. |
if valid_i='1' then |
valid_out<='1'; |
op3_o<=word_i; |
self_busy<='0'; |
state<=Regular; |
end if; |
when ContinueCjmp => |
-- Second half of a compare-and-jump: the jump target is read |
-- through register port 1 (addressed by dst_out, see below). |
valid_out<='1'; |
cmd_jump_o<='1'; |
rd1_select<='1'; |
self_busy<='0'; |
state<=Regular; |
when ContinueInterrupt => |
-- Stay here with no valid output; this state is left through |
-- the jump_valid_i branch above (or reset). |
valid_out<='0'; |
when Halt => |
if interrupt_valid_i='1' then |
self_busy<='0'; |
state<=Regular; |
end if; |
end case; |
end if; |
end if; |
end if; |
end process; |
|
valid_o<=valid_out; |
dst_o<=dst_out; |
|
ready_o<=not busy; |
|
interrupt_ready_o<=interrupt_ready; |
|
-- Decode RD (register/direct) operands |
|
-- Remember the register numbers of the last accepted word so the read |
-- addresses can be held while the decoder is busy. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if busy='0' then |
rd1_reg<=rd1; |
rd2_reg<=rd2; |
end if; |
end if; |
end process; |
|
-- Read port 1 address, in priority order: interrupt vector register |
-- ("11110" & vector), destination field for the jump part of a |
-- compare-and-jump, held register number while busy, else the live field. |
sp_raddr1_o<="11110"&interrupt_vector_i when (state=Regular and interrupt_valid_i='1' and downstream_busy='0') or state=ContinueInterrupt else |
dst_out when (state=ContinueCjmp and downstream_busy='0') else |
rd1_reg when busy='1' else |
rd1; |
|
sp_raddr2_o<=rd2_reg when busy='1' else rd2; |
|
-- Operand multiplexers: register read data or sign-extended direct value. |
op1_o<=sp_rdata1_i when rd1_select='1' else rd1_direct; |
op2_o<=sp_rdata2_i when rd2_select='1' else rd2_direct; |
|
end architecture; |
--------------------------------------------------------------------- |
-- Instruction decoder |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The second stage of the LXP32 pipeline. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- Instruction decoder: receives instruction words and produces registered |
-- command flags and operands for the next pipeline stage. |
entity lxp32_decode is |
port( |
clk_i: in std_logic; |
-- Synchronous reset, sampled on the rising edge of clk_i. |
rst_i: in std_logic; |
|
-- Upstream interface: instruction word with its handshake. |
-- ready_o is low while the decoder or the downstream stage is busy. |
word_i: in std_logic_vector(31 downto 0); |
next_ip_i: in std_logic_vector(29 downto 0); |
current_ip_i: in std_logic_vector(29 downto 0); |
valid_i: in std_logic; |
-- When high, the pending output is invalidated and the FSM returns to |
-- its Regular state. |
jump_valid_i: in std_logic; |
ready_o: out std_logic; |
|
-- Interrupt request interface; interrupt_ready_o pulses for one clock |
-- when a request is accepted. |
interrupt_valid_i: in std_logic; |
interrupt_vector_i: in std_logic_vector(2 downto 0); |
interrupt_ready_o: out std_logic; |
|
-- Two read ports (address out, data in) used to fetch register operands. |
sp_raddr1_o: out std_logic_vector(7 downto 0); |
sp_rdata1_i: in std_logic_vector(31 downto 0); |
sp_raddr2_o: out std_logic_vector(7 downto 0); |
sp_rdata2_i: in std_logic_vector(31 downto 0); |
|
-- Downstream handshake. |
ready_i: in std_logic; |
valid_o: out std_logic; |
|
-- Decoded command flags. |
cmd_loadop3_o: out std_logic; |
cmd_signed_o: out std_logic; |
cmd_dbus_o: out std_logic; |
cmd_dbus_store_o: out std_logic; |
cmd_dbus_byte_o: out std_logic; |
cmd_addsub_o: out std_logic; |
cmd_mul_o: out std_logic; |
cmd_div_o: out std_logic; |
cmd_div_mod_o: out std_logic; |
cmd_cmp_o: out std_logic; |
cmd_jump_o: out std_logic; |
cmd_negate_op2_o: out std_logic; |
cmd_and_o: out std_logic; |
cmd_xor_o: out std_logic; |
cmd_shift_o: out std_logic; |
cmd_shift_right_o: out std_logic; |
|
-- Low four opcode bits of the decoded instruction. |
jump_type_o: out std_logic_vector(3 downto 0); |
|
-- Operands and destination register number. |
op1_o: out std_logic_vector(31 downto 0); |
op2_o: out std_logic_vector(31 downto 0); |
op3_o: out std_logic_vector(31 downto 0); |
dst_o: out std_logic_vector(7 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_decode is |
|
-- Decoder FSM state |
-- Regular: decode one instruction word per accepted cycle |
-- ContinueLc: take the next word as a 32-bit constant for op3 |
-- ContinueCjmp: issue the jump that follows a compare |
-- ContinueInterrupt: interrupt jump issued, waiting for jump_valid_i |
-- Halt: wait for an interrupt request |
|
type DecoderState is (Regular,ContinueLc,ContinueCjmp,ContinueInterrupt,Halt); |
signal state: DecoderState:=Regular; |
|
-- Input instruction portions |
|
signal opcode: std_logic_vector(5 downto 0); |
signal t1: std_logic; |
signal t2: std_logic; |
signal destination: std_logic_vector(7 downto 0); |
signal rd1: std_logic_vector(7 downto 0); |
signal rd2: std_logic_vector(7 downto 0); |
|
-- Signals related to pipeline control |
|
signal downstream_busy: std_logic; |
signal self_busy: std_logic:='0'; |
signal busy: std_logic; |
signal valid_out: std_logic:='0'; |
|
signal dst_out: std_logic_vector(7 downto 0); |
|
-- Signals related to RD operand decoding |
|
signal rd1_reg: std_logic_vector(7 downto 0); |
signal rd2_reg: std_logic_vector(7 downto 0); |
|
signal rd1_select: std_logic; |
signal rd1_direct: std_logic_vector(31 downto 0); |
signal rd2_select: std_logic; |
signal rd2_direct: std_logic_vector(31 downto 0); |
|
-- Signals related to interrupt handling |
|
signal interrupt_ready: std_logic:='0'; |
|
begin |
|
-- Dissect input word |
-- Layout: opcode[31:26] t1[25] t2[24] destination[23:16] rd1[15:8] rd2[7:0] |
|
opcode<=word_i(31 downto 26); |
t1<=word_i(25); |
t2<=word_i(24); |
destination<=word_i(23 downto 16); |
rd1<=word_i(15 downto 8); |
rd2<=word_i(7 downto 0); |
|
-- Pipeline control |
-- The downstream stage is busy when it has not accepted the current output. |
|
downstream_busy<=valid_out and not ready_i; |
busy<=downstream_busy or self_busy; |
|
-- Decoder FSM and output registers. Only the handshake/state registers |
-- get defined reset values; the rest are assigned don't-care ('-'). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
valid_out<='0'; |
self_busy<='0'; |
state<=Regular; |
interrupt_ready<='0'; |
cmd_loadop3_o<='-'; |
cmd_signed_o<='-'; |
cmd_dbus_o<='-'; |
cmd_dbus_store_o<='-'; |
cmd_dbus_byte_o<='-'; |
cmd_addsub_o<='-'; |
cmd_negate_op2_o<='-'; |
cmd_mul_o<='-'; |
cmd_div_o<='-'; |
cmd_div_mod_o<='-'; |
cmd_cmp_o<='-'; |
cmd_jump_o<='-'; |
cmd_and_o<='-'; |
cmd_xor_o<='-'; |
cmd_shift_o<='-'; |
cmd_shift_right_o<='-'; |
rd1_select<='-'; |
rd1_direct<=(others=>'-'); |
rd2_select<='-'; |
rd2_direct<=(others=>'-'); |
op3_o<=(others=>'-'); |
jump_type_o<=(others=>'-'); |
dst_out<=(others=>'-'); |
else |
-- Default: interrupt_ready is a single-cycle pulse. |
interrupt_ready<='0'; |
if jump_valid_i='1' then |
-- A jump discards the pending output and restarts decoding. |
valid_out<='0'; |
self_busy<='0'; |
state<=Regular; |
elsif downstream_busy='0' then |
-- Outputs are only updated when the downstream stage can take them. |
op3_o<=(others=>'-'); |
-- Immediate (direct) operands are the sign-extended 8-bit fields. |
rd1_direct<=std_logic_vector(resize(signed(rd1),rd1_direct'length)); |
rd2_direct<=std_logic_vector(resize(signed(rd2),rd2_direct'length)); |
|
-- Modifier flags taken directly from opcode bits. |
cmd_signed_o<=opcode(0); |
cmd_div_mod_o<=opcode(1); |
cmd_shift_right_o<=opcode(1); |
cmd_dbus_byte_o<=opcode(1); |
cmd_dbus_store_o<=opcode(2); |
|
case state is |
when Regular => |
-- Clear all command flags first; the matching ones are set |
-- again by later assignments in this branch. |
cmd_loadop3_o<='0'; |
cmd_dbus_o<='0'; |
cmd_addsub_o<='0'; |
cmd_negate_op2_o<='0'; |
cmd_mul_o<='0'; |
cmd_div_o<='0'; |
cmd_cmp_o<='0'; |
cmd_jump_o<='0'; |
cmd_and_o<='0'; |
cmd_xor_o<='0'; |
cmd_shift_o<='0'; |
|
jump_type_o<=opcode(3 downto 0); |
|
if interrupt_valid_i='1' and valid_i='1' then |
-- Accept the interrupt instead of the current instruction: |
-- issue a jump whose target is read through register port 1 |
-- (see sp_raddr1_o below) and store the return address. |
cmd_jump_o<='1'; |
cmd_loadop3_o<='1'; |
op3_o<=current_ip_i&"01"; -- LSB indicates interrupt return |
dst_out<=X"FD"; -- interrupt return pointer |
rd1_select<='1'; |
rd2_select<='0'; |
valid_out<='1'; |
interrupt_ready<='1'; |
self_busy<='1'; |
state<=ContinueInterrupt; |
else |
if opcode(5 downto 3)="101" or opcode="000001" then -- lc or lcs |
cmd_loadop3_o<='1'; |
-- Setting op3_o here only affects the lcs instruction |
op3_o<=std_logic_vector(resize(signed(opcode(2 downto 0)& |
t1&t2&rd1&rd2),op3_o'length)); |
end if; |
|
if opcode(5 downto 3)="001" then |
cmd_dbus_o<='1'; |
end if; |
|
if opcode(5 downto 1)="01000" then |
cmd_addsub_o<='1'; |
end if; |
|
cmd_negate_op2_o<=opcode(0); |
|
if opcode="010010" then |
cmd_mul_o<='1'; |
end if; |
|
if opcode(5 downto 2)="0101" then |
cmd_div_o<='1'; |
end if; |
|
if opcode(5 downto 3)="100" then -- jump or call |
cmd_jump_o<='1'; |
cmd_loadop3_o<=opcode(0); |
-- Setting op3_o here only affects the call instruction |
op3_o<=next_ip_i&"00"; |
end if; |
|
-- Note: (a or b) = (a and b) or (a xor b) |
|
if opcode(5 downto 1)="01100" then |
cmd_and_o<='1'; |
end if; |
|
if opcode="011010" or opcode="011001" then |
cmd_xor_o<='1'; |
end if; |
|
if opcode(5 downto 2)="0111" then |
cmd_shift_o<='1'; |
end if; |
|
-- Opcodes 11xxxx request a compare; this overrides the |
-- cmd_negate_op2_o value assigned from opcode(0) above. |
if opcode(5 downto 4)="11" then |
cmd_cmp_o<='1'; |
cmd_negate_op2_o<='1'; |
end if; |
|
-- t1/t2 choose between register and direct operand. |
rd1_select<=t1; |
rd2_select<=t2; |
|
dst_out<=destination; |
|
if valid_i='1' then |
if opcode="000001" then |
-- lc: nothing is issued until the constant word arrives. |
valid_out<='0'; |
self_busy<='0'; |
state<=ContinueLc; |
elsif opcode="000010" then |
-- Stop accepting instructions until an interrupt arrives. |
valid_out<='0'; |
self_busy<='1'; |
state<=Halt; |
elsif opcode(5 downto 4)="11" then |
-- Issue the compare now and the jump in the next step. |
valid_out<='1'; |
self_busy<='1'; |
state<=ContinueCjmp; |
else |
valid_out<='1'; |
end if; |
else |
valid_out<='0'; |
end if; |
end if; |
when ContinueLc => |
-- The next valid word is passed through as op3. |
if valid_i='1' then |
valid_out<='1'; |
op3_o<=word_i; |
self_busy<='0'; |
state<=Regular; |
end if; |
when ContinueCjmp => |
-- Second half of a compare-and-jump: the jump target is read |
-- through register port 1 (addressed by dst_out, see below). |
valid_out<='1'; |
cmd_jump_o<='1'; |
rd1_select<='1'; |
self_busy<='0'; |
state<=Regular; |
when ContinueInterrupt => |
-- Stay here with no valid output; this state is left through |
-- the jump_valid_i branch above (or reset). |
valid_out<='0'; |
when Halt => |
if interrupt_valid_i='1' then |
self_busy<='0'; |
state<=Regular; |
end if; |
end case; |
end if; |
end if; |
end if; |
end process; |
|
valid_o<=valid_out; |
dst_o<=dst_out; |
|
ready_o<=not busy; |
|
interrupt_ready_o<=interrupt_ready; |
|
-- Decode RD (register/direct) operands |
|
-- Remember the register numbers of the last accepted word so the read |
-- addresses can be held while the decoder is busy. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if busy='0' then |
rd1_reg<=rd1; |
rd2_reg<=rd2; |
end if; |
end if; |
end process; |
|
-- Read port 1 address, in priority order: interrupt vector register |
-- ("11110" & vector), destination field for the jump part of a |
-- compare-and-jump, held register number while busy, else the live field. |
sp_raddr1_o<="11110"&interrupt_vector_i when (state=Regular and interrupt_valid_i='1' and downstream_busy='0') or state=ContinueInterrupt else |
dst_out when (state=ContinueCjmp and downstream_busy='0') else |
rd1_reg when busy='1' else |
rd1; |
|
sp_raddr2_o<=rd2_reg when busy='1' else rd2; |
|
-- Operand multiplexers: register read data or sign-extended direct value. |
op1_o<=sp_rdata1_i when rd1_select='1' else rd1_direct; |
op2_o<=sp_rdata2_i when rd2_select='1' else rd2_direct; |
|
end architecture; |
/lxp32_divider.vhd
1,172 → 1,172
--------------------------------------------------------------------- |
-- Divider |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Based on the NRD (Non Restoring Division) algorithm. Takes |
-- 36 cycles to calculate quotient (37 for remainder). |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- Sequential 32-bit divider producing either quotient or remainder. |
entity lxp32_divider is |
port( |
clk_i: in std_logic; |
-- Synchronous reset, sampled on the rising edge of clk_i. |
rst_i: in std_logic; |
-- Start strobe: operands and mode inputs are sampled when ce_i='1'. |
ce_i: in std_logic; |
-- op1_i is the dividend, op2_i the divisor. |
op1_i: in std_logic_vector(31 downto 0); |
op2_i: in std_logic_vector(31 downto 0); |
-- '1' treats the operands as signed (their sign bits are then used). |
signed_i: in std_logic; |
-- '1' requests the remainder, '0' the quotient. |
rem_i: in std_logic; |
-- Completion strobe and result. |
ce_o: out std_logic; |
result_o: out std_logic_vector(31 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_divider is |
|
-- Complementor signals |
|
signal compl_inv: std_logic; |
signal compl_mux: std_logic_vector(31 downto 0); |
signal compl_out: std_logic_vector(31 downto 0); |
|
signal inv_res: std_logic; |
|
-- Divider FSM signals |
|
signal fsm_ce: std_logic:='0'; |
|
signal dividend: unsigned(31 downto 0); |
signal divisor: unsigned(32 downto 0); |
signal want_remainder: std_logic; |
|
signal partial_remainder: unsigned(32 downto 0); |
signal addend: unsigned(32 downto 0); |
signal sum: unsigned(32 downto 0); |
signal sum_positive: std_logic; |
signal sum_subtract: std_logic; |
|
signal cnt: integer range 0 to 34:=0; |
|
signal ceo: std_logic:='0'; |
|
-- Output restoration signals |
|
signal remainder_corrector: unsigned(31 downto 0); |
signal remainder_corrector_1: std_logic; |
signal remainder_pos: unsigned(31 downto 0); |
signal result_pos: unsigned(31 downto 0); |
|
begin |
|
-- The single complementor instance is shared: while ce_i='1' it receives |
-- the dividend (op1_i) together with its sign, otherwise it receives the |
-- positive result together with the stored result sign. |
-- NOTE(review): lxp32_compl is defined outside this file section; it is |
-- presumably a registered conditional negator -- confirm. |
compl_inv<=op1_i(31) and signed_i when ce_i='1' else inv_res; |
compl_mux<=op1_i when ce_i='1' else std_logic_vector(result_pos); |
|
compl_op1_inst: entity work.lxp32_compl(rtl) |
port map( |
clk_i=>clk_i, |
compl_i=>compl_inv, |
d_i=>compl_mux, |
d_o=>compl_out |
); |
|
-- Latch the request: fsm_ce is ce_i delayed by one clock, and the result |
-- sign is the dividend sign for a remainder or the XOR of both operand |
-- signs for a quotient (signed mode only). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
fsm_ce<='0'; |
want_remainder<='-'; |
inv_res<='-'; |
else |
fsm_ce<=ce_i; |
if ce_i='1' then |
want_remainder<=rem_i; |
if rem_i='1' then |
inv_res<=op1_i(31) and signed_i; |
else |
inv_res<=(op1_i(31) xor op2_i(31)) and signed_i; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Main adder/subtractor |
-- With sum_subtract='1' the divisor is bitwise inverted and 1 is added, |
-- i.e. the divisor is subtracted; otherwise it is added. |
|
addend_gen: for i in addend'range generate |
addend(i)<=divisor(i) xor sum_subtract; |
end generate; |
|
sum<=partial_remainder+addend+(to_unsigned(0,32)&sum_subtract); |
sum_positive<=not sum(32); |
|
-- Divider state machine |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
cnt<=0; |
ceo<='0'; |
divisor<=(others=>'-'); |
dividend<=(others=>'-'); |
partial_remainder<=(others=>'-'); |
sum_subtract<='-'; |
else |
-- ceo is the registered "cnt=1" condition, a single-cycle pulse as |
-- the counter passes through 1. |
if cnt=1 then |
ceo<='1'; |
else |
ceo<='0'; |
end if; |
|
-- Divisor is extended to 33 bits with its sign (signed mode only). |
if ce_i='1' then |
divisor(31 downto 0)<=unsigned(op2_i); |
divisor(32)<=op2_i(31) and signed_i; |
end if; |
|
if fsm_ce='1' then |
-- Load phase: the complementor output is split into the first |
-- partial remainder bit (MSB) and the remaining dividend bits. |
dividend<=unsigned(compl_out(30 downto 0)&"0"); |
partial_remainder<=to_unsigned(0,32)&compl_out(31); |
sum_subtract<=not divisor(32); |
-- The remainder needs one more step than the quotient. |
if want_remainder='1' then |
cnt<=34; |
else |
cnt<=33; |
end if; |
else |
-- Iteration step: shift the next dividend bit into the partial |
-- remainder and shift the new quotient bit into 'dividend'. |
-- (The '&dividend' below had been corrupted to a division sign by |
-- HTML entity decoding of "&divide".) |
partial_remainder<=sum(31 downto 0)&dividend(31); |
sum_subtract<=sum_positive xor divisor(32); |
dividend<=dividend(30 downto 0)&sum_positive; |
if cnt>0 then |
cnt<=cnt-1; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Output restoration circuit |
-- When the last sum was negative, the magnitude of the divisor (inverted |
-- and incremented if its sign bit is set) is added back to the remainder. |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
for i in remainder_corrector'range loop |
remainder_corrector(i)<=(divisor(i) xor divisor(32)) and not sum_positive; |
end loop; |
remainder_corrector_1<=divisor(32) and not sum_positive; |
remainder_pos<=partial_remainder(32 downto 1)+remainder_corrector+ |
(to_unsigned(0,31)&remainder_corrector_1); |
end if; |
end process; |
|
-- The quotient bits accumulate in 'dividend' during the iterations. |
result_pos<=remainder_pos when want_remainder='1' else dividend; |
|
result_o<=compl_out; |
ce_o<=ceo; |
|
end architecture; |
--------------------------------------------------------------------- |
-- Divider |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Based on the NRD (Non Restoring Division) algorithm. Takes |
-- 36 cycles to calculate quotient (37 for remainder). |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
-- Sequential 32-bit divider producing either quotient or remainder. |
entity lxp32_divider is |
port( |
clk_i: in std_logic; |
-- Synchronous reset, sampled on the rising edge of clk_i. |
rst_i: in std_logic; |
-- Start strobe: operands and mode inputs are sampled when ce_i='1'. |
ce_i: in std_logic; |
-- op1_i is the dividend, op2_i the divisor. |
op1_i: in std_logic_vector(31 downto 0); |
op2_i: in std_logic_vector(31 downto 0); |
-- '1' treats the operands as signed (their sign bits are then used). |
signed_i: in std_logic; |
-- '1' requests the remainder, '0' the quotient. |
rem_i: in std_logic; |
-- Completion strobe and result. |
ce_o: out std_logic; |
result_o: out std_logic_vector(31 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_divider is |
|
-- Complementor signals |
|
signal compl_inv: std_logic; |
signal compl_mux: std_logic_vector(31 downto 0); |
signal compl_out: std_logic_vector(31 downto 0); |
|
signal inv_res: std_logic; |
|
-- Divider FSM signals |
|
signal fsm_ce: std_logic:='0'; |
|
signal dividend: unsigned(31 downto 0); |
signal divisor: unsigned(32 downto 0); |
signal want_remainder: std_logic; |
|
signal partial_remainder: unsigned(32 downto 0); |
signal addend: unsigned(32 downto 0); |
signal sum: unsigned(32 downto 0); |
signal sum_positive: std_logic; |
signal sum_subtract: std_logic; |
|
signal cnt: integer range 0 to 34:=0; |
|
signal ceo: std_logic:='0'; |
|
-- Output restoration signals |
|
signal remainder_corrector: unsigned(31 downto 0); |
signal remainder_corrector_1: std_logic; |
signal remainder_pos: unsigned(31 downto 0); |
signal result_pos: unsigned(31 downto 0); |
|
begin |
|
-- The single complementor instance is shared: while ce_i='1' it receives |
-- the dividend (op1_i) together with its sign, otherwise it receives the |
-- positive result together with the stored result sign. |
-- NOTE(review): lxp32_compl is defined outside this file section; it is |
-- presumably a registered conditional negator -- confirm. |
compl_inv<=op1_i(31) and signed_i when ce_i='1' else inv_res; |
compl_mux<=op1_i when ce_i='1' else std_logic_vector(result_pos); |
|
compl_op1_inst: entity work.lxp32_compl(rtl) |
port map( |
clk_i=>clk_i, |
compl_i=>compl_inv, |
d_i=>compl_mux, |
d_o=>compl_out |
); |
|
-- Latch the request: fsm_ce is ce_i delayed by one clock, and the result |
-- sign is the dividend sign for a remainder or the XOR of both operand |
-- signs for a quotient (signed mode only). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
fsm_ce<='0'; |
want_remainder<='-'; |
inv_res<='-'; |
else |
fsm_ce<=ce_i; |
if ce_i='1' then |
want_remainder<=rem_i; |
if rem_i='1' then |
inv_res<=op1_i(31) and signed_i; |
else |
inv_res<=(op1_i(31) xor op2_i(31)) and signed_i; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Main adder/subtractor |
-- With sum_subtract='1' the divisor is bitwise inverted and 1 is added, |
-- i.e. the divisor is subtracted; otherwise it is added. |
|
addend_gen: for i in addend'range generate |
addend(i)<=divisor(i) xor sum_subtract; |
end generate; |
|
sum<=partial_remainder+addend+(to_unsigned(0,32)&sum_subtract); |
sum_positive<=not sum(32); |
|
-- Divider state machine |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
cnt<=0; |
ceo<='0'; |
divisor<=(others=>'-'); |
dividend<=(others=>'-'); |
partial_remainder<=(others=>'-'); |
sum_subtract<='-'; |
else |
-- ceo is the registered "cnt=1" condition, a single-cycle pulse as |
-- the counter passes through 1. |
if cnt=1 then |
ceo<='1'; |
else |
ceo<='0'; |
end if; |
|
-- Divisor is extended to 33 bits with its sign (signed mode only). |
if ce_i='1' then |
divisor(31 downto 0)<=unsigned(op2_i); |
divisor(32)<=op2_i(31) and signed_i; |
end if; |
|
if fsm_ce='1' then |
-- Load phase: the complementor output is split into the first |
-- partial remainder bit (MSB) and the remaining dividend bits. |
dividend<=unsigned(compl_out(30 downto 0)&"0"); |
partial_remainder<=to_unsigned(0,32)&compl_out(31); |
sum_subtract<=not divisor(32); |
-- The remainder needs one more step than the quotient. |
if want_remainder='1' then |
cnt<=34; |
else |
cnt<=33; |
end if; |
else |
-- Iteration step: shift the next dividend bit into the partial |
-- remainder and shift the new quotient bit into 'dividend'. |
-- (The '&dividend' below had been corrupted to a division sign by |
-- HTML entity decoding of "&divide".) |
partial_remainder<=sum(31 downto 0)&dividend(31); |
sum_subtract<=sum_positive xor divisor(32); |
dividend<=dividend(30 downto 0)&sum_positive; |
if cnt>0 then |
cnt<=cnt-1; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Output restoration circuit |
-- When the last sum was negative, the magnitude of the divisor (inverted |
-- and incremented if its sign bit is set) is added back to the remainder. |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
for i in remainder_corrector'range loop |
remainder_corrector(i)<=(divisor(i) xor divisor(32)) and not sum_positive; |
end loop; |
remainder_corrector_1<=divisor(32) and not sum_positive; |
remainder_pos<=partial_remainder(32 downto 1)+remainder_corrector+ |
(to_unsigned(0,31)&remainder_corrector_1); |
end if; |
end process; |
|
-- The quotient bits accumulate in 'dividend' during the iterations. |
result_pos<=remainder_pos when want_remainder='1' else dividend; |
|
result_o<=compl_out; |
ce_o<=ceo; |
|
end architecture; |
/lxp32_execute.vhd
1,260 → 1,260
--------------------------------------------------------------------- |
-- Execution unit |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The third stage of the LXP32 pipeline. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
-- Execution unit: runs decoded commands through the ALU, the data bus |
-- master and the jump logic, and writes results to the register file. |
entity lxp32_execute is |
generic( |
-- Passed to the data bus master as its RMW generic. |
DBUS_RMW: boolean; |
-- Passed unchanged to the ALU. |
DIVIDER_EN: boolean; |
MUL_ARCH: string |
); |
port( |
clk_i: in std_logic; |
-- Synchronous reset, sampled on the rising edge of clk_i. |
rst_i: in std_logic; |
|
-- Decoded command flags for the instruction being presented. |
cmd_loadop3_i: in std_logic; |
cmd_signed_i: in std_logic; |
cmd_dbus_i: in std_logic; |
cmd_dbus_store_i: in std_logic; |
cmd_dbus_byte_i: in std_logic; |
cmd_addsub_i: in std_logic; |
cmd_mul_i: in std_logic; |
cmd_div_i: in std_logic; |
cmd_div_mod_i: in std_logic; |
cmd_cmp_i: in std_logic; |
cmd_jump_i: in std_logic; |
cmd_negate_op2_i: in std_logic; |
cmd_and_i: in std_logic; |
cmd_xor_i: in std_logic; |
cmd_shift_i: in std_logic; |
cmd_shift_right_i: in std_logic; |
|
-- Selects which comparison results enable a conditional jump. |
jump_type_i: in std_logic_vector(3 downto 0); |
|
-- Operands and destination register number. op1_i also supplies the |
-- jump target and the data bus address; op2_i supplies store data. |
op1_i: in std_logic_vector(31 downto 0); |
op2_i: in std_logic_vector(31 downto 0); |
op3_i: in std_logic_vector(31 downto 0); |
dst_i: in std_logic_vector(7 downto 0); |
|
-- Register file write port. |
sp_waddr_o: out std_logic_vector(7 downto 0); |
sp_we_o: out std_logic; |
sp_wdata_o: out std_logic_vector(31 downto 0); |
|
-- Handshake with the previous stage; ready_o is low while the ALU or |
-- the data bus master is busy. |
valid_i: in std_logic; |
ready_o: out std_logic; |
|
-- Data bus master signals, connected straight to the lxp32_dbus instance. |
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0); |
|
-- Jump request: held until jump_ready_i is seen. |
jump_valid_o: out std_logic; |
jump_dst_o: out std_logic_vector(29 downto 0); |
jump_ready_i: in std_logic; |
|
-- Set from bit 0 of the jump target while a jump request is pending. |
interrupt_return_o: out std_logic |
); |
end entity; |
|
architecture rtl of lxp32_execute is |
|
-- Pipeline control signals |
|
signal busy: std_logic; |
signal can_execute: std_logic; |
|
-- ALU signals |
|
signal alu_result: std_logic_vector(31 downto 0); |
signal alu_we: std_logic; |
signal alu_busy: std_logic; |
|
signal alu_cmp_eq: std_logic; |
signal alu_cmp_ug: std_logic; |
signal alu_cmp_sg: std_logic; |
|
-- OP3 loader signals |
|
signal loadop3_we: std_logic; |
|
-- Jump machine signals |
|
signal jump_condition: std_logic; |
signal jump_valid: std_logic:='0'; |
signal jump_dst: std_logic_vector(jump_dst_o'range); |
|
-- DBUS signals |
|
signal dbus_result: std_logic_vector(31 downto 0); |
signal dbus_busy: std_logic; |
signal dbus_we: std_logic; |
|
-- Result mux signals |
|
signal result_mux: std_logic_vector(31 downto 0); |
signal result_valid: std_logic; |
signal result_regaddr: std_logic_vector(7 downto 0); |
|
signal dst_reg: std_logic_vector(7 downto 0); |
|
-- Signals related to interrupt handling |
|
signal interrupt_return: std_logic:='0'; |
|
begin |
|
-- Pipeline control |
-- An instruction is executed only when it is valid and neither the ALU |
-- nor the data bus master is busy. |
|
busy<=alu_busy or dbus_busy; |
ready_o<=not busy; |
can_execute<=valid_i and not busy; |
|
-- ALU |
|
alu_inst: entity work.lxp32_alu(rtl) |
generic map( |
DIVIDER_EN=>DIVIDER_EN, |
MUL_ARCH=>MUL_ARCH |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
valid_i=>can_execute, |
|
cmd_signed_i=>cmd_signed_i, |
cmd_addsub_i=>cmd_addsub_i, |
cmd_mul_i=>cmd_mul_i, |
cmd_div_i=>cmd_div_i, |
cmd_div_mod_i=>cmd_div_mod_i, |
cmd_cmp_i=>cmd_cmp_i, |
cmd_negate_op2_i=>cmd_negate_op2_i, |
cmd_and_i=>cmd_and_i, |
cmd_xor_i=>cmd_xor_i, |
cmd_shift_i=>cmd_shift_i, |
cmd_shift_right_i=>cmd_shift_right_i, |
|
op1_i=>op1_i, |
op2_i=>op2_i, |
|
result_o=>alu_result, |
|
cmp_eq_o=>alu_cmp_eq, |
cmp_ug_o=>alu_cmp_ug, |
cmp_sg_o=>alu_cmp_sg, |
|
we_o=>alu_we, |
busy_o=>alu_busy |
); |
|
-- OP3 loader |
-- op3_i is written to the destination register in the same cycle. |
|
loadop3_we<=can_execute and cmd_loadop3_i; |
|
-- Jump logic |
-- Without cmd_cmp_i the jump is unconditional; otherwise each jump_type_i |
-- bit enables one comparison outcome: (3) equal, (2) not equal, |
-- (1) alu_cmp_ug, (0) alu_cmp_sg. |
|
jump_condition<=(not cmd_cmp_i) or (jump_type_i(3) and alu_cmp_eq) or |
(jump_type_i(2) and not alu_cmp_eq) or (jump_type_i(1) and alu_cmp_ug) or |
(jump_type_i(0) and alu_cmp_sg); |
|
-- Holds a jump request until it is accepted. While no request is pending |
-- the target register follows op1_i, so it contains the right value once |
-- jump_valid is set. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
jump_valid<='0'; |
interrupt_return<='0'; |
jump_dst<=(others=>'-'); |
else |
if jump_valid='0' then |
jump_dst<=op1_i(31 downto 2); |
if can_execute='1' and cmd_jump_i='1' and jump_condition='1' then |
jump_valid<='1'; |
-- Bit 0 of the target marks a return from interrupt. |
interrupt_return<=op1_i(0); |
end if; |
elsif jump_ready_i='1' then |
jump_valid<='0'; |
interrupt_return<='0'; |
end if; |
end if; |
end if; |
end process; |
|
-- The request is visible combinationally in the cycle the jump executes |
-- and stays visible through the registered copy afterwards. |
jump_valid_o<=jump_valid or (can_execute and cmd_jump_i and jump_condition); |
jump_dst_o<=jump_dst when jump_valid='1' else op1_i(31 downto 2); |
|
interrupt_return_o<=interrupt_return; |
|
-- DBUS access |
-- op1_i is the address and op2_i the store data. |
|
dbus_inst: entity work.lxp32_dbus(rtl) |
generic map( |
RMW=>DBUS_RMW |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
valid_i=>can_execute, |
|
cmd_dbus_i=>cmd_dbus_i, |
cmd_dbus_store_i=>cmd_dbus_store_i, |
cmd_dbus_byte_i=>cmd_dbus_byte_i, |
cmd_signed_i=>cmd_signed_i, |
addr_i=>op1_i, |
wdata_i=>op2_i, |
|
rdata_o=>dbus_result, |
busy_o=>dbus_busy, |
we_o=>dbus_we, |
|
dbus_cyc_o=>dbus_cyc_o, |
dbus_stb_o=>dbus_stb_o, |
dbus_we_o=>dbus_we_o, |
dbus_sel_o=>dbus_sel_o, |
dbus_ack_i=>dbus_ack_i, |
dbus_adr_o=>dbus_adr_o, |
dbus_dat_o=>dbus_dat_o, |
dbus_dat_i=>dbus_dat_i |
); |
|
-- Result multiplexer |
-- AND-OR mux gated by the three write strobes. |
-- NOTE(review): this assumes at most one strobe is active per cycle -- |
-- confirm against the ALU and decoder behaviour. |
|
result_mux_gen: for i in result_mux'range generate |
result_mux(i)<=(alu_result(i) and alu_we) or |
(op3_i(i) and loadop3_we) or |
(dbus_result(i) and dbus_we); |
end generate; |
|
result_valid<=alu_we or loadop3_we or dbus_we; |
|
-- Write destination register |
-- The destination is remembered when an instruction starts so that a |
-- result arriving in a later cycle is still written to the right register. |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if can_execute='1' then |
dst_reg<=dst_i; |
end if; |
end if; |
end process; |
|
result_regaddr<=dst_i when can_execute='1' else dst_reg; |
|
sp_we_o<=result_valid; |
sp_waddr_o<=result_regaddr; |
sp_wdata_o<=result_mux; |
|
end architecture; |
--------------------------------------------------------------------- |
-- Execution unit |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The third stage of the LXP32 pipeline. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_execute is |
generic( |
DBUS_RMW: boolean; |
DIVIDER_EN: boolean; |
MUL_ARCH: string |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
cmd_loadop3_i: in std_logic; |
cmd_signed_i: in std_logic; |
cmd_dbus_i: in std_logic; |
cmd_dbus_store_i: in std_logic; |
cmd_dbus_byte_i: in std_logic; |
cmd_addsub_i: in std_logic; |
cmd_mul_i: in std_logic; |
cmd_div_i: in std_logic; |
cmd_div_mod_i: in std_logic; |
cmd_cmp_i: in std_logic; |
cmd_jump_i: in std_logic; |
cmd_negate_op2_i: in std_logic; |
cmd_and_i: in std_logic; |
cmd_xor_i: in std_logic; |
cmd_shift_i: in std_logic; |
cmd_shift_right_i: in std_logic; |
|
jump_type_i: in std_logic_vector(3 downto 0); |
|
op1_i: in std_logic_vector(31 downto 0); |
op2_i: in std_logic_vector(31 downto 0); |
op3_i: in std_logic_vector(31 downto 0); |
dst_i: in std_logic_vector(7 downto 0); |
|
sp_waddr_o: out std_logic_vector(7 downto 0); |
sp_we_o: out std_logic; |
sp_wdata_o: out std_logic_vector(31 downto 0); |
|
valid_i: in std_logic; |
ready_o: out std_logic; |
|
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0); |
|
jump_valid_o: out std_logic; |
jump_dst_o: out std_logic_vector(29 downto 0); |
jump_ready_i: in std_logic; |
|
interrupt_return_o: out std_logic |
); |
end entity; |
|
architecture rtl of lxp32_execute is |
|
-- Pipeline control signals |
|
signal busy: std_logic; |
signal can_execute: std_logic; |
|
-- ALU signals |
|
signal alu_result: std_logic_vector(31 downto 0); |
signal alu_we: std_logic; |
signal alu_busy: std_logic; |
|
signal alu_cmp_eq: std_logic; |
signal alu_cmp_ug: std_logic; |
signal alu_cmp_sg: std_logic; |
|
-- OP3 loader signals |
|
signal loadop3_we: std_logic; |
|
-- Jump machine signals |
|
signal jump_condition: std_logic; |
signal jump_valid: std_logic:='0'; |
signal jump_dst: std_logic_vector(jump_dst_o'range); |
|
-- DBUS signals |
|
signal dbus_result: std_logic_vector(31 downto 0); |
signal dbus_busy: std_logic; |
signal dbus_we: std_logic; |
|
-- Result mux signals |
|
signal result_mux: std_logic_vector(31 downto 0); |
signal result_valid: std_logic; |
signal result_regaddr: std_logic_vector(7 downto 0); |
|
signal dst_reg: std_logic_vector(7 downto 0); |
|
-- Signals related to interrupt handling |
|
signal interrupt_return: std_logic:='0'; |
|
begin |
|
-- Pipeline control |
|
-- The stage is busy while either the ALU or the DBUS unit reports busy. |
busy<=alu_busy or dbus_busy; |
-- Handshake towards the previous stage: ready whenever the stage is not busy. |
ready_o<=not busy; |
-- Execution strobe: the incoming instruction is valid and the stage is not |
-- busy. It is used as valid_i of the ALU and DBUS units and gates the OP3 |
-- loader, the jump logic and the destination register. |
can_execute<=valid_i and not busy; |
|
-- ALU |
|
alu_inst: entity work.lxp32_alu(rtl) |
generic map( |
DIVIDER_EN=>DIVIDER_EN, |
MUL_ARCH=>MUL_ARCH |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
valid_i=>can_execute, |
|
cmd_signed_i=>cmd_signed_i, |
cmd_addsub_i=>cmd_addsub_i, |
cmd_mul_i=>cmd_mul_i, |
cmd_div_i=>cmd_div_i, |
cmd_div_mod_i=>cmd_div_mod_i, |
cmd_cmp_i=>cmd_cmp_i, |
cmd_negate_op2_i=>cmd_negate_op2_i, |
cmd_and_i=>cmd_and_i, |
cmd_xor_i=>cmd_xor_i, |
cmd_shift_i=>cmd_shift_i, |
cmd_shift_right_i=>cmd_shift_right_i, |
|
op1_i=>op1_i, |
op2_i=>op2_i, |
|
result_o=>alu_result, |
|
cmp_eq_o=>alu_cmp_eq, |
cmp_ug_o=>alu_cmp_ug, |
cmp_sg_o=>alu_cmp_sg, |
|
we_o=>alu_we, |
busy_o=>alu_busy |
); |
|
-- OP3 loader |
|
loadop3_we<=can_execute and cmd_loadop3_i; |
|
-- Jump logic |
|
-- The condition is always true when cmd_cmp_i='0'. Otherwise each bit of |
-- jump_type_i enables one ALU comparison flag and the enabled flags are |
-- OR-ed together: bit 3 - cmp_eq, bit 2 - not cmp_eq, bit 1 - cmp_ug, |
-- bit 0 - cmp_sg. |
-- NOTE(review): "ug"/"sg" presumably mean unsigned/signed "greater"; |
-- confirm against lxp32_alu. |
jump_condition<=(not cmd_cmp_i) or (jump_type_i(3) and alu_cmp_eq) or |
(jump_type_i(2) and not alu_cmp_eq) or (jump_type_i(1) and alu_cmp_ug) or |
(jump_type_i(0) and alu_cmp_sg); |
|
-- Jump request register. |
-- jump_valid is set when a jump instruction executes with its condition |
-- satisfied and is held until jump_ready_i is asserted. jump_dst and |
-- interrupt_return hold the request parameters while it is pending. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
jump_valid<='0'; |
interrupt_return<='0'; |
-- Don't-care value: jump_dst is only used while jump_valid='1'. |
jump_dst<=(others=>'-'); |
else |
if jump_valid='0' then |
-- No request pending: keep tracking the candidate destination so that |
-- it is already latched in the cycle jump_valid becomes '1'. |
jump_dst<=op1_i(31 downto 2); |
if can_execute='1' and cmd_jump_i='1' and jump_condition='1' then |
jump_valid<='1'; |
-- Bit 0 of the jump operand is latched as the interrupt return flag. |
interrupt_return<=op1_i(0); |
end if; |
elsif jump_ready_i='1' then |
-- Request accepted: release it and clear the interrupt return flag. |
jump_valid<='0'; |
interrupt_return<='0'; |
end if; |
end if; |
end if; |
end process; |
|
-- The jump request is visible in the same cycle the jump instruction executes |
-- (combinational term) and stays asserted through the registered jump_valid flag. |
jump_valid_o<=jump_valid or (can_execute and cmd_jump_i and jump_condition); |
-- Use the latched destination while a request is pending, otherwise pass |
-- op1_i(31 downto 2) straight through. |
jump_dst_o<=jump_dst when jump_valid='1' else op1_i(31 downto 2); |
|
interrupt_return_o<=interrupt_return; |
|
-- DBUS access |
|
dbus_inst: entity work.lxp32_dbus(rtl) |
generic map( |
RMW=>DBUS_RMW |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
valid_i=>can_execute, |
|
cmd_dbus_i=>cmd_dbus_i, |
cmd_dbus_store_i=>cmd_dbus_store_i, |
cmd_dbus_byte_i=>cmd_dbus_byte_i, |
cmd_signed_i=>cmd_signed_i, |
addr_i=>op1_i, |
wdata_i=>op2_i, |
|
rdata_o=>dbus_result, |
busy_o=>dbus_busy, |
we_o=>dbus_we, |
|
dbus_cyc_o=>dbus_cyc_o, |
dbus_stb_o=>dbus_stb_o, |
dbus_we_o=>dbus_we_o, |
dbus_sel_o=>dbus_sel_o, |
dbus_ack_i=>dbus_ack_i, |
dbus_adr_o=>dbus_adr_o, |
dbus_dat_o=>dbus_dat_o, |
dbus_dat_i=>dbus_dat_i |
); |
|
-- Result multiplexer |
|
-- AND-OR result selector: each source bit is gated by its own write-enable |
-- and the gated bits are OR-ed together. |
-- NOTE(review): assumes at most one write-enable is active per cycle; |
-- otherwise the sources are simply OR-ed. |
result_mux_gen: for bit_idx in result_mux'range generate |
  result_mux(bit_idx)<=(alu_we and alu_result(bit_idx)) or |
    (loadop3_we and op3_i(bit_idx)) or |
    (dbus_we and dbus_result(bit_idx)); |
end generate; |
|
result_valid<=alu_we or loadop3_we or dbus_we; |
|
-- Write destination register |
|
-- Destination address latch: dst_i is captured on every clock edge where |
-- can_execute is asserted, so the address is still available when the |
-- result is written in a later cycle. |
process (clk_i) is |
begin |
  if rising_edge(clk_i) and can_execute='1' then |
    dst_reg<=dst_i; |
  end if; |
end process; |
|
result_regaddr<=dst_i when can_execute='1' else dst_reg; |
|
sp_we_o<=result_valid; |
sp_waddr_o<=result_regaddr; |
sp_wdata_o<=result_mux; |
|
end architecture; |
/lxp32_fetch.vhd
1,226 → 1,226
--------------------------------------------------------------------- |
-- Instruction fetch |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The first stage of the LXP32 pipeline. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_fetch is |
generic( |
START_ADDR: std_logic_vector(31 downto 0) |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
lli_re_o: out std_logic; |
lli_adr_o: out std_logic_vector(29 downto 0); |
lli_dat_i: in std_logic_vector(31 downto 0); |
lli_busy_i: in std_logic; |
|
word_o: out std_logic_vector(31 downto 0); |
current_ip_o: out std_logic_vector(29 downto 0); |
next_ip_o: out std_logic_vector(29 downto 0); |
valid_o: out std_logic; |
ready_i: in std_logic; |
|
jump_valid_i: in std_logic; |
jump_dst_i: in std_logic_vector(29 downto 0); |
jump_ready_o: out std_logic |
); |
end entity; |
|
architecture rtl of lxp32_fetch is |
|
signal init: std_logic:='1'; |
signal init_cnt: unsigned(7 downto 0):=(others=>'0'); |
|
signal fetch_addr: std_logic_vector(29 downto 0):=START_ADDR(31 downto 2); |
|
signal next_word: std_logic; |
signal suppress_re: std_logic:='0'; |
signal re: std_logic; |
signal requested: std_logic:='0'; |
|
signal fifo_rst: std_logic; |
signal fifo_we: std_logic; |
signal fifo_din: std_logic_vector(31 downto 0); |
signal fifo_re: std_logic; |
signal fifo_dout: std_logic_vector(31 downto 0); |
signal fifo_empty: std_logic; |
signal fifo_full: std_logic; |
|
signal jr: std_logic:='0'; |
|
signal next_ip: std_logic_vector(fetch_addr'range); |
signal current_ip: std_logic_vector(fetch_addr'range); |
|
begin |
|
-- INIT state machine (to initialize all registers) |
|
-- All CPU registers are expected to be zero-initialized after reset. |
-- Since these registers are implemented as a RAM block, we perform |
-- the initialization sequentially by generating "mov rN, 0" instructions |
-- for each N from 0 to 255. |
-- |
-- With SRAM-based FPGAs, flip-flops and RAM blocks have deterministic |
-- state after configuration. On these technologies the CPU can operate |
-- without reset and the initialization procedure described above is not |
-- needed. However, the initialization is still performed as usual when |
-- external reset signal is asserted. |
|
-- Register initialization sequencer. |
-- init is declared with the initial value '1', so the sequence runs only |
-- after an explicit reset, which clears init and restarts the counter. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
init<='0'; |
init_cnt<=(others=>'0'); |
else |
-- Advance only in cycles where ready_i is asserted. |
if init='0' and ready_i='1' then |
init_cnt<=init_cnt+1; |
-- The comparison sees the value before the increment, so init is set |
-- in the cycle where count X"FF" is consumed (256 steps in total). |
if init_cnt=X"FF" then |
init<='1'; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- FETCH state machine |
|
-- Address generator and request tracking for the LLI interface. |
-- Registers updated here: fetch_addr (driven onto lli_adr_o), requested |
-- (qualifies fifo_we), jr (jump accepted, driven onto jump_ready_o), |
-- suppress_re, next_ip and current_ip. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
fetch_addr<=START_ADDR(31 downto 2); |
requested<='0'; |
jr<='0'; |
suppress_re<='0'; |
-- Don't-care value; note that current_ip is not assigned on reset at all. |
next_ip<=(others=>'-'); |
else |
-- Default assignment, overridden below when a jump is accepted: jr is |
-- therefore a single-cycle pulse per accepted jump. |
jr<='0'; |
-- Suppress LLI request if jump signal is active but will not be processed |
-- in this cycle. Helps to reduce jump latency with high-latency LLI slaves. |
-- Note: gating "re" with "jump_valid_i and not jr" asynchronously would |
-- reduce jump latency even more, but we really want to avoid too large |
-- clock-to-out on LLI outputs. |
suppress_re<=jump_valid_i and not jr and not next_word; |
-- The request flag is only updated while the LLI slave is not busy; a |
-- request issued in the same cycle as an unacknowledged jump is not recorded. |
if lli_busy_i='0' then |
requested<=re and not (jump_valid_i and not jr); |
end if; |
if next_word='1' then |
-- It's not immediately obvious why, but current_ip and next_ip will contain |
-- the addresses of the current instruction and the next instruction to be |
-- fetched, respectively, by the time the instruction is passed to the decode |
-- stage. Basically, this is because when either the decoder or the IBUS |
-- stalls, the fetch_addr counter will also stop incrementing. |
next_ip<=fetch_addr; |
current_ip<=next_ip; |
-- A pending jump that has not been acknowledged yet redirects the fetch |
-- address; otherwise advance to the next word. |
if jump_valid_i='1' and jr='0' then |
fetch_addr<=jump_dst_i; |
jr<='1'; |
else |
fetch_addr<=std_logic_vector(unsigned(fetch_addr)+1); |
end if; |
end if; |
end if; |
end if; |
end process; |
|
next_word<=(fifo_empty or ready_i) and not lli_busy_i and init; |
re<=(fifo_empty or ready_i) and init and not suppress_re; |
lli_re_o<=re; |
lli_adr_o<=fetch_addr; |
|
jump_ready_o<=jr; |
|
-- Small instruction buffer |
|
fifo_rst<=rst_i or (jump_valid_i and not jr); |
fifo_we<=requested and not lli_busy_i; |
fifo_din<=lli_dat_i; |
fifo_re<=ready_i and not fifo_empty; |
|
ubuf_inst: entity work.lxp32_ubuf(rtl) |
generic map( |
DATA_WIDTH=>32 |
) |
port map( |
clk_i=>clk_i, |
rst_i=>fifo_rst, |
|
we_i=>fifo_we, |
d_i=>fifo_din, |
re_i=>fifo_re, |
d_o=>fifo_dout, |
|
empty_o=>fifo_empty, |
full_o=>fifo_full |
); |
|
next_ip_o<=next_ip; |
current_ip_o<=current_ip; |
word_o<=fifo_dout when init='1' else X"40"&std_logic_vector(init_cnt)&X"0000"; |
valid_o<=not fifo_empty or not init; |
|
-- Note: the following code contains a few simulation-only assertions |
-- to check that current_ip and next_ip signals, used in procedure calls |
-- and interrupts, are correct. |
-- This code should be ignored by a synthesizer since it doesn't drive |
-- any signals, but we also surround it by metacomments, just in case. |
|
-- synthesis translate_off |
|
-- Simulation-only reference model. It keeps its own queue of |
-- (address, data) pairs mirroring the instruction buffer and checks the |
-- buffer output and the current_ip/next_ip registers against it. |
process (clk_i) is |
type Pair is record |
addr: std_logic_vector(fetch_addr'range); |
data: std_logic_vector(31 downto 0); |
end record; |
type Pairs is array (7 downto 0) of Pair; |
-- buf(0) is the oldest entry; count is the number of valid entries. |
variable buf: Pairs; |
variable count: integer range buf'range:=0; |
-- Holds the address of the most recent request until its data arrives. |
variable current_pair: Pair; |
begin |
if rising_edge(clk_i) then |
if fifo_rst='1' then -- jump |
count:=0; |
elsif fifo_we='1' then -- LLI returned data |
current_pair.data:=fifo_din; |
buf(count):=current_pair; |
count:=count+1; |
end if; |
-- The address is recorded after the push above, so data returned in this |
-- cycle is paired with the address of an earlier request. |
if re='1' and lli_busy_i='0' then -- data requested |
current_pair.addr:=fetch_addr; |
end if; |
if fifo_empty='0' and fifo_rst='0' then -- fetch output is valid |
-- Fails when the buffer reports data while the model holds none. |
assert count>0 |
report "Fetch: buffer should be empty" |
severity failure; |
assert buf(0).data=fifo_dout |
report "Fetch: incorrect data" |
severity failure; |
assert buf(0).addr=current_ip |
report "Fetch: incorrect current_ip" |
severity failure; |
assert std_logic_vector(unsigned(buf(0).addr)+1)=next_ip |
report "Fetch: incorrect next_ip" |
severity failure; |
-- The word is consumed in this cycle: drop the oldest entry. |
if ready_i='1' then |
buf(buf'high-1 downto 0):=buf(buf'high downto 1); -- we don't care about the highest item |
count:=count-1; |
end if; |
end if; |
end if; |
end process; |
|
-- synthesis translate_on |
|
end architecture; |
--------------------------------------------------------------------- |
-- Instruction fetch |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The first stage of the LXP32 pipeline. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_fetch is |
generic( |
START_ADDR: std_logic_vector(31 downto 0) |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
lli_re_o: out std_logic; |
lli_adr_o: out std_logic_vector(29 downto 0); |
lli_dat_i: in std_logic_vector(31 downto 0); |
lli_busy_i: in std_logic; |
|
word_o: out std_logic_vector(31 downto 0); |
current_ip_o: out std_logic_vector(29 downto 0); |
next_ip_o: out std_logic_vector(29 downto 0); |
valid_o: out std_logic; |
ready_i: in std_logic; |
|
jump_valid_i: in std_logic; |
jump_dst_i: in std_logic_vector(29 downto 0); |
jump_ready_o: out std_logic |
); |
end entity; |
|
architecture rtl of lxp32_fetch is |
|
signal init: std_logic:='1'; |
signal init_cnt: unsigned(7 downto 0):=(others=>'0'); |
|
signal fetch_addr: std_logic_vector(29 downto 0):=START_ADDR(31 downto 2); |
|
signal next_word: std_logic; |
signal suppress_re: std_logic:='0'; |
signal re: std_logic; |
signal requested: std_logic:='0'; |
|
signal fifo_rst: std_logic; |
signal fifo_we: std_logic; |
signal fifo_din: std_logic_vector(31 downto 0); |
signal fifo_re: std_logic; |
signal fifo_dout: std_logic_vector(31 downto 0); |
signal fifo_empty: std_logic; |
signal fifo_full: std_logic; |
|
signal jr: std_logic:='0'; |
|
signal next_ip: std_logic_vector(fetch_addr'range); |
signal current_ip: std_logic_vector(fetch_addr'range); |
|
begin |
|
-- INIT state machine (to initialize all registers) |
|
-- All CPU registers are expected to be zero-initialized after reset. |
-- Since these registers are implemented as a RAM block, we perform |
-- the initialization sequentially by generating "mov rN, 0" instructions |
-- for each N from 0 to 255. |
-- |
-- With SRAM-based FPGAs, flip-flops and RAM blocks have deterministic |
-- state after configuration. On these technologies the CPU can operate |
-- without reset and the initialization procedure described above is not |
-- needed. However, the initialization is still performed as usual when |
-- external reset signal is asserted. |
|
-- Register initialization sequencer. |
-- init is declared with the initial value '1', so the sequence runs only |
-- after an explicit reset, which clears init and restarts the counter. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
init<='0'; |
init_cnt<=(others=>'0'); |
else |
-- Advance only in cycles where ready_i is asserted. |
if init='0' and ready_i='1' then |
init_cnt<=init_cnt+1; |
-- The comparison sees the value before the increment, so init is set |
-- in the cycle where count X"FF" is consumed (256 steps in total). |
if init_cnt=X"FF" then |
init<='1'; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- FETCH state machine |
|
-- Address generator and request tracking for the LLI interface. |
-- Registers updated here: fetch_addr (driven onto lli_adr_o), requested |
-- (qualifies fifo_we), jr (jump accepted, driven onto jump_ready_o), |
-- suppress_re, next_ip and current_ip. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
fetch_addr<=START_ADDR(31 downto 2); |
requested<='0'; |
jr<='0'; |
suppress_re<='0'; |
-- Don't-care value; note that current_ip is not assigned on reset at all. |
next_ip<=(others=>'-'); |
else |
-- Default assignment, overridden below when a jump is accepted: jr is |
-- therefore a single-cycle pulse per accepted jump. |
jr<='0'; |
-- Suppress LLI request if jump signal is active but will not be processed |
-- in this cycle. Helps to reduce jump latency with high-latency LLI slaves. |
-- Note: gating "re" with "jump_valid_i and not jr" asynchronously would |
-- reduce jump latency even more, but we really want to avoid too large |
-- clock-to-out on LLI outputs. |
suppress_re<=jump_valid_i and not jr and not next_word; |
-- The request flag is only updated while the LLI slave is not busy; a |
-- request issued in the same cycle as an unacknowledged jump is not recorded. |
if lli_busy_i='0' then |
requested<=re and not (jump_valid_i and not jr); |
end if; |
if next_word='1' then |
-- It's not immediately obvious why, but current_ip and next_ip will contain |
-- the addresses of the current instruction and the next instruction to be |
-- fetched, respectively, by the time the instruction is passed to the decode |
-- stage. Basically, this is because when either the decoder or the IBUS |
-- stalls, the fetch_addr counter will also stop incrementing. |
next_ip<=fetch_addr; |
current_ip<=next_ip; |
-- A pending jump that has not been acknowledged yet redirects the fetch |
-- address; otherwise advance to the next word. |
if jump_valid_i='1' and jr='0' then |
fetch_addr<=jump_dst_i; |
jr<='1'; |
else |
fetch_addr<=std_logic_vector(unsigned(fetch_addr)+1); |
end if; |
end if; |
end if; |
end if; |
end process; |
|
next_word<=(fifo_empty or ready_i) and not lli_busy_i and init; |
re<=(fifo_empty or ready_i) and init and not suppress_re; |
lli_re_o<=re; |
lli_adr_o<=fetch_addr; |
|
jump_ready_o<=jr; |
|
-- Small instruction buffer |
|
fifo_rst<=rst_i or (jump_valid_i and not jr); |
fifo_we<=requested and not lli_busy_i; |
fifo_din<=lli_dat_i; |
fifo_re<=ready_i and not fifo_empty; |
|
ubuf_inst: entity work.lxp32_ubuf(rtl) |
generic map( |
DATA_WIDTH=>32 |
) |
port map( |
clk_i=>clk_i, |
rst_i=>fifo_rst, |
|
we_i=>fifo_we, |
d_i=>fifo_din, |
re_i=>fifo_re, |
d_o=>fifo_dout, |
|
empty_o=>fifo_empty, |
full_o=>fifo_full |
); |
|
next_ip_o<=next_ip; |
current_ip_o<=current_ip; |
word_o<=fifo_dout when init='1' else X"40"&std_logic_vector(init_cnt)&X"0000"; |
valid_o<=not fifo_empty or not init; |
|
-- Note: the following code contains a few simulation-only assertions |
-- to check that current_ip and next_ip signals, used in procedure calls |
-- and interrupts, are correct. |
-- This code should be ignored by a synthesizer since it doesn't drive |
-- any signals, but we also surround it by metacomments, just in case. |
|
-- synthesis translate_off |
|
-- Simulation-only reference model. It keeps its own queue of |
-- (address, data) pairs mirroring the instruction buffer and checks the |
-- buffer output and the current_ip/next_ip registers against it. |
process (clk_i) is |
type Pair is record |
addr: std_logic_vector(fetch_addr'range); |
data: std_logic_vector(31 downto 0); |
end record; |
type Pairs is array (7 downto 0) of Pair; |
-- buf(0) is the oldest entry; count is the number of valid entries. |
variable buf: Pairs; |
variable count: integer range buf'range:=0; |
-- Holds the address of the most recent request until its data arrives. |
variable current_pair: Pair; |
begin |
if rising_edge(clk_i) then |
if fifo_rst='1' then -- jump |
count:=0; |
elsif fifo_we='1' then -- LLI returned data |
current_pair.data:=fifo_din; |
buf(count):=current_pair; |
count:=count+1; |
end if; |
-- The address is recorded after the push above, so data returned in this |
-- cycle is paired with the address of an earlier request. |
if re='1' and lli_busy_i='0' then -- data requested |
current_pair.addr:=fetch_addr; |
end if; |
if fifo_empty='0' and fifo_rst='0' then -- fetch output is valid |
-- Fails when the buffer reports data while the model holds none. |
assert count>0 |
report "Fetch: buffer should be empty" |
severity failure; |
assert buf(0).data=fifo_dout |
report "Fetch: incorrect data" |
severity failure; |
assert buf(0).addr=current_ip |
report "Fetch: incorrect current_ip" |
severity failure; |
assert std_logic_vector(unsigned(buf(0).addr)+1)=next_ip |
report "Fetch: incorrect next_ip" |
severity failure; |
-- The word is consumed in this cycle: drop the oldest entry. |
if ready_i='1' then |
buf(buf'high-1 downto 0):=buf(buf'high downto 1); -- we don't care about the highest item |
count:=count-1; |
end if; |
end if; |
end if; |
end process; |
|
-- synthesis translate_on |
|
end architecture; |
/lxp32_icache.vhd
1,289 → 1,289
--------------------------------------------------------------------- |
-- Instruction cache |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- A simple single-page buffer providing both caching and |
-- prefetching capabilities. Useful for high-latency memory, |
-- such as external SDRAM. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_icache is |
generic( |
BURST_SIZE: integer; |
PREFETCH_SIZE: integer |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
lli_re_i: in std_logic; |
lli_adr_i: in std_logic_vector(29 downto 0); |
lli_dat_o: out std_logic_vector(31 downto 0); |
lli_busy_o: out std_logic; |
|
wbm_cyc_o: out std_logic; |
wbm_stb_o: out std_logic; |
wbm_cti_o: out std_logic_vector(2 downto 0); |
wbm_bte_o: out std_logic_vector(1 downto 0); |
wbm_ack_i: in std_logic; |
wbm_adr_o: out std_logic_vector(29 downto 0); |
wbm_dat_i: in std_logic_vector(31 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_icache is |
|
signal lli_adr_reg: std_logic_vector(lli_adr_i'range); |
signal lli_adr_mux: std_logic_vector(lli_adr_i'range); |
|
signal ram_waddr: std_logic_vector(7 downto 0); |
signal ram_raddr: std_logic_vector(7 downto 0); |
signal ram_re: std_logic; |
signal ram_we: std_logic; |
|
signal read_base: unsigned(21 downto 0); |
signal read_offset: unsigned(7 downto 0); |
|
signal init: std_logic:='0'; |
signal burst1: std_logic; |
signal terminate_burst: std_logic; |
signal near_miss: std_logic:='0'; |
signal prefetch_distance: unsigned(7 downto 0); |
signal wrap_cnt: integer range 0 to 3:=0; |
signal burst_cnt: integer range 0 to BURST_SIZE:=0; |
signal wb_stb: std_logic:='0'; |
signal wb_cti: std_logic_vector(2 downto 0); |
|
-- Note: the following five signals are zero-initialized for |
-- simulation only, to suppress warnings from numeric_std. |
-- This initialization is not required for synthesis. |
|
signal current_base: unsigned(21 downto 0):=(others=>'0'); |
signal current_offset: unsigned(7 downto 0):=(others=>'0'); |
signal prev_base: unsigned(21 downto 0):=(others=>'0'); |
signal next_base: unsigned(21 downto 0):=(others=>'0'); |
signal start_offset: unsigned(7 downto 0):=(others=>'0'); |
|
signal hitc: std_logic; |
signal hitp: std_logic; |
signal miss: std_logic:='0'; |
|
begin |
|
assert PREFETCH_SIZE>=4 |
report "PREFETCH_SIZE cannot be less than 4" |
severity failure; |
assert BURST_SIZE>=4 |
report "BURST_SIZE cannot be less than 4" |
severity failure; |
assert PREFETCH_SIZE+BURST_SIZE<=128 |
report "PREFETCH_SIZE and BURST_SIZE combined cannot be greater than 128" |
severity failure; |
|
|
-- Request address latch: follows lli_adr_i while there is no miss and |
-- holds the last address for as long as miss is asserted. |
process (clk_i) is |
begin |
  if rising_edge(clk_i) and miss='0' then |
    lli_adr_reg<=lli_adr_i; |
  end if; |
end process; |
|
lli_adr_mux<=lli_adr_i when miss='0' else lli_adr_reg; |
|
read_base<=unsigned(lli_adr_mux(29 downto 8)); |
read_offset<=unsigned(lli_adr_mux(7 downto 0)); |
|
-- Cache RAM |
|
ram_waddr<=std_logic_vector(current_offset); |
ram_raddr<=std_logic_vector(read_offset); |
ram_we<=wb_stb and wbm_ack_i; |
ram_re<=lli_re_i or miss; |
|
ram_inst: entity work.lxp32_ram256x32(rtl) |
port map( |
clk_i=>clk_i, |
|
we_i=>ram_we, |
waddr_i=>ram_waddr, |
wdata_i=>wbm_dat_i, |
|
re_i=>ram_re, |
raddr_i=>ram_raddr, |
rdata_o=>lli_dat_o |
); |
|
-- Determine hit/miss |
|
-- This cache uses a single ring buffer. Address in buffer corresponds |
-- to the lower 8 bits of the full address. The part of the buffer that |
-- is higher than current_offset represents a previous block ("p"), the |
-- other part represents a current block ("c"). |
|
-- Hit in the current block: the address belongs to current_base and lies |
-- below the write pointer (current_offset). During the first pass |
-- (wrap_cnt=1) only offsets from start_offset upwards qualify; once the |
-- pointer has wrapped (wrap_cnt 2 or 3) everything below it qualifies. |
hitc<='1' when read_base=current_base and read_offset<current_offset and |
((wrap_cnt=1 and read_offset>=start_offset) or |
wrap_cnt=2 or wrap_cnt=3) else '0'; |
|
-- Hit in the previous block: the address belongs to prev_base and lies |
-- above the write pointer. After the first wrap (wrap_cnt=2) only offsets |
-- from start_offset upwards qualify; with wrap_cnt=3 the whole region |
-- above the pointer qualifies. |
hitp<='1' when read_base=prev_base and read_offset>current_offset and |
((wrap_cnt=2 and read_offset>=start_offset) or |
wrap_cnt=3) else '0'; |
|
-- Miss flag, registered one cycle after the lookup. |
-- ram_re is "lli_re_i or miss", so once set the flag stays asserted for as |
-- long as the held address hits neither the current nor the previous block. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
miss<='0'; |
else |
-- A read that hits neither block is a miss; any other case clears the flag. |
if hitc='0' and hitp='0' and ram_re='1' then |
miss<='1'; |
else |
miss<='0'; |
end if; |
end if; |
end if; |
end process; |
|
lli_busy_o<=miss; |
|
-- Set INIT flag when the first lli_re_i signal is detected |
|
-- Sticky "first request seen" flag: cleared by reset, set by the first |
-- lli_re_i strobe and then held until the next reset. |
process (clk_i) is |
begin |
  if rising_edge(clk_i) then |
    if rst_i/='1' then |
      if lli_re_i='1' then |
        init<='1'; |
      end if; |
    else |
      init<='0'; |
    end if; |
  end if; |
end process; |
|
-- Fill cache |
|
prefetch_distance<=current_offset-read_offset; |
|
-- Note: "near_miss" signal prevents cache invalidation when difference |
-- between the requested address and the currently fetched address |
-- is too small (and, therefore, the requested data will be fetched soon |
-- without invalidation). |
|
-- near_miss is registered, so it reflects the addresses of the previous cycle. |
-- It is raised when a fill has been started (wrap_cnt>0) and the requested |
-- word is at most BURST_SIZE/2 words ahead of the write pointer (8-bit |
-- wrap-around subtraction), either in the block being filled or, across |
-- the 256-word boundary, in the block that follows it. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
near_miss<='0'; |
elsif wrap_cnt>0 and read_offset-current_offset<=to_unsigned(BURST_SIZE/2,8) and |
((read_base=current_base and read_offset>=current_offset) or |
(read_base=next_base and read_offset<current_offset)) |
then |
near_miss<='1'; |
else |
near_miss<='0'; |
end if; |
end if; |
end process; |
|
terminate_burst<='1' when burst_cnt<BURST_SIZE-1 and miss='1' and |
(burst_cnt>2 or burst1='0') and near_miss='0' else '0'; |
|
-- Cache fill state machine (Wishbone burst master). |
-- burst_cnt encodes the state: 0 - idle, 1..BURST_SIZE-1 - burst in |
-- progress, BURST_SIZE - last word of the burst is on the bus. |
-- current_base/current_offset form the write pointer (and the bus address); |
-- wrap_cnt counts how many times the pointer has wrapped (saturating at 3). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
burst_cnt<=0; |
wb_stb<='0'; |
wrap_cnt<=0; |
wb_cti<=(others=>'-'); |
burst1<='-'; |
current_offset<=(others=>'-'); |
start_offset<=(others=>'-'); |
current_base<=(others=>'-'); |
next_base<=(others=>'-'); |
prev_base<=(others=>'-'); |
|
-- To suppress numeric_std warnings |
-- synthesis translate_off |
current_offset<=(others=>'0'); |
start_offset<=(others=>'0'); |
current_base<=(others=>'0'); |
next_base<=(others=>'0'); |
prev_base<=(others=>'0'); |
-- synthesis translate_on |
else |
-- Idle (only after the first read request has been seen). |
if burst_cnt=0 and init='1' then |
if miss='1' and near_miss='0' then |
-- Real miss: restart the fill at the requested address. Setting |
-- wrap_cnt to 1 marks everything outside the new fill as not hittable |
-- (see the hitc/hitp conditions). burst1 marks the first burst after a restart. |
wb_stb<='1'; |
wb_cti<="010"; |
current_offset<=read_offset; |
start_offset<=read_offset; |
current_base<=read_base; |
next_base<=read_base+1; |
burst_cnt<=1; |
burst1<='1'; |
wrap_cnt<=1; |
elsif prefetch_distance<to_unsigned(PREFETCH_SIZE,8) or near_miss='1' then |
-- Prefetch: the write pointer is less than PREFETCH_SIZE words ahead of |
-- the read address (or a near miss was detected), so continue filling |
-- from the current write pointer. |
wb_stb<='1'; |
wb_cti<="010"; |
burst_cnt<=1; |
burst1<='0'; |
end if; |
else |
-- Burst in progress: everything below advances on acknowledged words only. |
if wbm_ack_i='1' then |
current_offset<=current_offset+1; |
-- Write pointer wraps: move on to the next 256-word block. |
if current_offset=X"FF" then |
current_base<=next_base; |
next_base<=next_base+1; |
prev_base<=current_base; |
if wrap_cnt<3 then |
wrap_cnt<=wrap_cnt+1; |
end if; |
end if; |
if burst_cnt=BURST_SIZE-1 or terminate_burst='1' then |
-- The next word is the last one of this burst (normal end or early |
-- termination); "111" is the Wishbone end-of-burst cycle type. |
burst_cnt<=BURST_SIZE; |
wb_cti<="111"; |
elsif burst_cnt<BURST_SIZE-1 then |
burst_cnt<=burst_cnt+1; |
wb_cti<="010"; |
else |
-- burst_cnt=BURST_SIZE: the last word has just been acknowledged. |
-- Either chain a new burst immediately or go idle. |
if miss='1' and near_miss='0' then |
-- Restart at the requested address. These assignments override the |
-- pointer increment/wrap updates made earlier in this process |
-- (the last assignment to a signal wins). |
wb_stb<='1'; |
wb_cti<="010"; |
current_offset<=read_offset; |
start_offset<=read_offset; |
current_base<=read_base; |
next_base<=read_base+1; |
burst_cnt<=1; |
burst1<='1'; |
wrap_cnt<=1; |
elsif prefetch_distance<to_unsigned(PREFETCH_SIZE,8) or near_miss='1' then |
wb_stb<='1'; |
wb_cti<="010"; |
burst_cnt<=1; |
burst1<='0'; |
else |
burst_cnt<=0; |
wb_stb<='0'; |
end if; |
end if; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Wishbone master outputs. |
-- CYC and STB are driven from the same register, so the strobe is held |
-- for the whole duration of a bus cycle. |
wbm_cyc_o<=wb_stb; |
wbm_stb_o<=wb_stb; |
wbm_cti_o<=wb_cti; |
-- Burst type extension "00": linear burst. |
wbm_bte_o<="00"; |
-- Word address = 22-bit block base concatenated with the 8-bit write offset. |
wbm_adr_o<=std_logic_vector(current_base&current_offset); |
|
end architecture; |
--------------------------------------------------------------------- |
-- Instruction cache |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- A simple single-page buffer providing both caching and |
-- prefetching capabilities. Useful for high-latency memory, |
-- such as external SDRAM. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_icache is |
generic( |
BURST_SIZE: integer; |
PREFETCH_SIZE: integer |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
lli_re_i: in std_logic; |
lli_adr_i: in std_logic_vector(29 downto 0); |
lli_dat_o: out std_logic_vector(31 downto 0); |
lli_busy_o: out std_logic; |
|
wbm_cyc_o: out std_logic; |
wbm_stb_o: out std_logic; |
wbm_cti_o: out std_logic_vector(2 downto 0); |
wbm_bte_o: out std_logic_vector(1 downto 0); |
wbm_ack_i: in std_logic; |
wbm_adr_o: out std_logic_vector(29 downto 0); |
wbm_dat_i: in std_logic_vector(31 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_icache is |
|
signal lli_adr_reg: std_logic_vector(lli_adr_i'range); |
signal lli_adr_mux: std_logic_vector(lli_adr_i'range); |
|
signal ram_waddr: std_logic_vector(7 downto 0); |
signal ram_raddr: std_logic_vector(7 downto 0); |
signal ram_re: std_logic; |
signal ram_we: std_logic; |
|
signal read_base: unsigned(21 downto 0); |
signal read_offset: unsigned(7 downto 0); |
|
signal init: std_logic:='0'; |
signal burst1: std_logic; |
signal terminate_burst: std_logic; |
signal near_miss: std_logic:='0'; |
signal prefetch_distance: unsigned(7 downto 0); |
signal wrap_cnt: integer range 0 to 3:=0; |
signal burst_cnt: integer range 0 to BURST_SIZE:=0; |
signal wb_stb: std_logic:='0'; |
signal wb_cti: std_logic_vector(2 downto 0); |
|
-- Note: the following five signals are zero-initialized for |
-- simulation only, to suppress warnings from numeric_std. |
-- This initialization is not required for synthesis. |
|
signal current_base: unsigned(21 downto 0):=(others=>'0'); |
signal current_offset: unsigned(7 downto 0):=(others=>'0'); |
signal prev_base: unsigned(21 downto 0):=(others=>'0'); |
signal next_base: unsigned(21 downto 0):=(others=>'0'); |
signal start_offset: unsigned(7 downto 0):=(others=>'0'); |
|
signal hitc: std_logic; |
signal hitp: std_logic; |
signal miss: std_logic:='0'; |
|
begin |
|
assert PREFETCH_SIZE>=4 |
report "PREFETCH_SIZE cannot be less than 4" |
severity failure; |
assert BURST_SIZE>=4 |
report "BURST_SIZE cannot be less than 4" |
severity failure; |
assert PREFETCH_SIZE+BURST_SIZE<=128 |
report "PREFETCH_SIZE and BURST_SIZE combined cannot be greater than 128" |
severity failure; |
|
|
-- Request address latch: follows lli_adr_i while there is no miss and |
-- holds the last address for as long as miss is asserted. |
process (clk_i) is |
begin |
  if rising_edge(clk_i) and miss='0' then |
    lli_adr_reg<=lli_adr_i; |
  end if; |
end process; |
|
lli_adr_mux<=lli_adr_i when miss='0' else lli_adr_reg; |
|
read_base<=unsigned(lli_adr_mux(29 downto 8)); |
read_offset<=unsigned(lli_adr_mux(7 downto 0)); |
|
-- Cache RAM |
|
ram_waddr<=std_logic_vector(current_offset); |
ram_raddr<=std_logic_vector(read_offset); |
ram_we<=wb_stb and wbm_ack_i; |
ram_re<=lli_re_i or miss; |
|
ram_inst: entity work.lxp32_ram256x32(rtl) |
port map( |
clk_i=>clk_i, |
|
we_i=>ram_we, |
waddr_i=>ram_waddr, |
wdata_i=>wbm_dat_i, |
|
re_i=>ram_re, |
raddr_i=>ram_raddr, |
rdata_o=>lli_dat_o |
); |
|
-- Determine hit/miss |
|
-- This cache uses a single ring buffer. Address in buffer corresponds |
-- to the lower 8 bits of the full address. The part of the buffer that |
-- is higher than current_offset represents a previous block ("p"), the |
-- other part represents a current block ("c"). |
|
-- Hit in the current block: the address belongs to current_base and lies |
-- below the write pointer (current_offset). During the first pass |
-- (wrap_cnt=1) only offsets from start_offset upwards qualify; once the |
-- pointer has wrapped (wrap_cnt 2 or 3) everything below it qualifies. |
hitc<='1' when read_base=current_base and read_offset<current_offset and |
((wrap_cnt=1 and read_offset>=start_offset) or |
wrap_cnt=2 or wrap_cnt=3) else '0'; |
|
-- Hit in the previous block: the address belongs to prev_base and lies |
-- above the write pointer. After the first wrap (wrap_cnt=2) only offsets |
-- from start_offset upwards qualify; with wrap_cnt=3 the whole region |
-- above the pointer qualifies. |
hitp<='1' when read_base=prev_base and read_offset>current_offset and |
((wrap_cnt=2 and read_offset>=start_offset) or |
wrap_cnt=3) else '0'; |
|
-- Miss flag, registered one cycle after the lookup. |
-- ram_re is "lli_re_i or miss", so once set the flag stays asserted for as |
-- long as the held address hits neither the current nor the previous block. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
miss<='0'; |
else |
-- A read that hits neither block is a miss; any other case clears the flag. |
if hitc='0' and hitp='0' and ram_re='1' then |
miss<='1'; |
else |
miss<='0'; |
end if; |
end if; |
end if; |
end process; |
|
lli_busy_o<=miss; |
|
-- Set INIT flag when the first lli_re_i signal is detected |
|
-- Sticky "first request seen" flag: cleared by reset, set by the first |
-- lli_re_i strobe and then held until the next reset. |
process (clk_i) is |
begin |
  if rising_edge(clk_i) then |
    if rst_i/='1' then |
      if lli_re_i='1' then |
        init<='1'; |
      end if; |
    else |
      init<='0'; |
    end if; |
  end if; |
end process; |
|
-- Fill cache |
|
prefetch_distance<=current_offset-read_offset; |
|
-- Note: "near_miss" signal prevents cache invalidation when difference |
-- between the requested address and the currently fetched address |
-- is too small (and, therefore, the requested data will be fetched soon |
-- without invalidation). |
|
-- near_miss is registered, so it reflects the addresses of the previous cycle. |
-- It is raised when a fill has been started (wrap_cnt>0) and the requested |
-- word is at most BURST_SIZE/2 words ahead of the write pointer (8-bit |
-- wrap-around subtraction), either in the block being filled or, across |
-- the 256-word boundary, in the block that follows it. |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
near_miss<='0'; |
elsif wrap_cnt>0 and read_offset-current_offset<=to_unsigned(BURST_SIZE/2,8) and |
((read_base=current_base and read_offset>=current_offset) or |
(read_base=next_base and read_offset<current_offset)) |
then |
near_miss<='1'; |
else |
near_miss<='0'; |
end if; |
end if; |
end process; |
|
terminate_burst<='1' when burst_cnt<BURST_SIZE-1 and miss='1' and |
(burst_cnt>2 or burst1='0') and near_miss='0' else '0'; |
|
-- Cache fill state machine (Wishbone burst master). |
-- burst_cnt encodes the state: 0 - idle, 1..BURST_SIZE-1 - burst in |
-- progress, BURST_SIZE - last word of the burst is on the bus. |
-- current_base/current_offset form the write pointer (and the bus address); |
-- wrap_cnt counts how many times the pointer has wrapped (saturating at 3). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
burst_cnt<=0; |
wb_stb<='0'; |
wrap_cnt<=0; |
wb_cti<=(others=>'-'); |
burst1<='-'; |
current_offset<=(others=>'-'); |
start_offset<=(others=>'-'); |
current_base<=(others=>'-'); |
next_base<=(others=>'-'); |
prev_base<=(others=>'-'); |
|
-- To suppress numeric_std warnings |
-- synthesis translate_off |
current_offset<=(others=>'0'); |
start_offset<=(others=>'0'); |
current_base<=(others=>'0'); |
next_base<=(others=>'0'); |
prev_base<=(others=>'0'); |
-- synthesis translate_on |
else |
-- Idle (only after the first read request has been seen). |
if burst_cnt=0 and init='1' then |
if miss='1' and near_miss='0' then |
-- Real miss: restart the fill at the requested address. Setting |
-- wrap_cnt to 1 marks everything outside the new fill as not hittable |
-- (see the hitc/hitp conditions). burst1 marks the first burst after a restart. |
wb_stb<='1'; |
wb_cti<="010"; |
current_offset<=read_offset; |
start_offset<=read_offset; |
current_base<=read_base; |
next_base<=read_base+1; |
burst_cnt<=1; |
burst1<='1'; |
wrap_cnt<=1; |
elsif prefetch_distance<to_unsigned(PREFETCH_SIZE,8) or near_miss='1' then |
-- Prefetch: the write pointer is less than PREFETCH_SIZE words ahead of |
-- the read address (or a near miss was detected), so continue filling |
-- from the current write pointer. |
wb_stb<='1'; |
wb_cti<="010"; |
burst_cnt<=1; |
burst1<='0'; |
end if; |
else |
-- Burst in progress: everything below advances on acknowledged words only. |
if wbm_ack_i='1' then |
current_offset<=current_offset+1; |
-- Write pointer wraps: move on to the next 256-word block. |
if current_offset=X"FF" then |
current_base<=next_base; |
next_base<=next_base+1; |
prev_base<=current_base; |
if wrap_cnt<3 then |
wrap_cnt<=wrap_cnt+1; |
end if; |
end if; |
if burst_cnt=BURST_SIZE-1 or terminate_burst='1' then |
-- The next word is the last one of this burst (normal end or early |
-- termination); "111" is the Wishbone end-of-burst cycle type. |
burst_cnt<=BURST_SIZE; |
wb_cti<="111"; |
elsif burst_cnt<BURST_SIZE-1 then |
burst_cnt<=burst_cnt+1; |
wb_cti<="010"; |
else |
-- burst_cnt=BURST_SIZE: the last word has just been acknowledged. |
-- Either chain a new burst immediately or go idle. |
if miss='1' and near_miss='0' then |
-- Restart at the requested address. These assignments override the |
-- pointer increment/wrap updates made earlier in this process |
-- (the last assignment to a signal wins). |
wb_stb<='1'; |
wb_cti<="010"; |
current_offset<=read_offset; |
start_offset<=read_offset; |
current_base<=read_base; |
next_base<=read_base+1; |
burst_cnt<=1; |
burst1<='1'; |
wrap_cnt<=1; |
elsif prefetch_distance<to_unsigned(PREFETCH_SIZE,8) or near_miss='1' then |
wb_stb<='1'; |
wb_cti<="010"; |
burst_cnt<=1; |
burst1<='0'; |
else |
burst_cnt<=0; |
wb_stb<='0'; |
end if; |
end if; |
end if; |
end if; |
end if; |
end if; |
end process; |
|
-- Wishbone master outputs. |
-- CYC and STB are driven from the same register, so the strobe is held |
-- for the whole duration of a bus cycle. |
wbm_cyc_o<=wb_stb; |
wbm_stb_o<=wb_stb; |
wbm_cti_o<=wb_cti; |
-- Burst type extension "00": linear burst. |
wbm_bte_o<="00"; |
-- Word address = 22-bit block base concatenated with the 8-bit write offset. |
wbm_adr_o<=std_logic_vector(current_base&current_offset); |
|
end architecture; |
/lxp32_interrupt_mux.vhd
1,112 → 1,112
--------------------------------------------------------------------- |
-- Interrupt multiplexer |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Manages LXP32 interrupts. Interrupts with lower numbers have |
-- higher priority. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_interrupt_mux is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;

		-- Interrupt request lines; a request is registered on a rising edge
		irq_i: in std_logic_vector(7 downto 0);

		-- Request handshake: valid/vector are presented until ready is seen,
		-- then the multiplexer waits for return before issuing the next one
		interrupt_valid_o: out std_logic;
		interrupt_vector_o: out std_logic_vector(2 downto 0);
		interrupt_ready_i: in std_logic;
		interrupt_return_i: in std_logic;

		-- Write port snooped for writes to the control register (address X"FC")
		sp_waddr_i: in std_logic_vector(7 downto 0);
		sp_we_i: in std_logic;
		sp_wdata_i: in std_logic_vector(31 downto 0)
	);
end entity lxp32_interrupt_mux;

architecture rtl of lxp32_interrupt_mux is

-- Ready: no request outstanding; Requested: a request is being presented;
-- WaitForExit: the request was accepted, waiting for interrupt_return_i
type state_type is (Ready,Requested,WaitForExit);
signal state: state_type:=Ready;

-- Previous sample of irq_i, used for rising edge detection
signal irq_reg: std_logic_vector(irq_i'range):=(others=>'0');

-- One bit per interrupt that has been registered but not yet requested
signal pending_interrupts: std_logic_vector(irq_i'range):=(others=>'0');

signal interrupt_valid: std_logic:='0';

-- Control register contents (bits 7..0 and 15..8 of the written word)
signal interrupts_enabled: std_logic_vector(7 downto 0):=(others=>'0');
signal interrupts_blocked: std_logic_vector(7 downto 0):=(others=>'0');

begin

-- Note: "disabled" interrupts (those with interrupts_enabled(i)='0') are
-- dropped entirely: their pending bit is cleared every cycle, so the
-- handler is not called even if the interrupt is enabled later.
-- "Blocked" interrupts stay pending, but no request is issued for them
-- until they are unblocked.

request_proc: process (clk_i) is
	variable new_requests: std_logic_vector(irq_i'range);
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			irq_reg<=(others=>'0');
			pending_interrupts<=(others=>'0');
			state<=Ready;
			interrupt_valid<='0';
			interrupt_vector_o<=(others=>'-');
		else
			-- Lines that were low on the previous clock and are high now
			new_requests:=irq_i and not irq_reg;
			irq_reg<=irq_i;

			pending_interrupts<=(pending_interrupts or new_requests) and
				interrupts_enabled;

			case state is
			when Ready =>
				-- Scan upwards from index 0: lower interrupts have priority
				for n in pending_interrupts'low to pending_interrupts'high loop
					if pending_interrupts(n)='1' and interrupts_blocked(n)='0' then
						-- Overrides the vector-wide assignment above for bit n
						pending_interrupts(n)<='0';
						interrupt_valid<='1';
						interrupt_vector_o<=std_logic_vector(to_unsigned(n,3));
						state<=Requested;
						exit;
					end if;
				end loop;
			when Requested =>
				if interrupt_ready_i='1' then
					interrupt_valid<='0';
					state<=WaitForExit;
				end if;
			when WaitForExit =>
				if interrupt_return_i='1' then
					state<=Ready;
				end if;
			end case;
		end if;
	end if;
end process request_proc;

interrupt_valid_o<=interrupt_valid;

-- Control register: updated by a write to address X"FC"
control_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			interrupts_enabled<=(others=>'0');
			interrupts_blocked<=(others=>'0');
		elsif sp_we_i='1' then
			if sp_waddr_i=X"FC" then
				interrupts_enabled<=sp_wdata_i(7 downto 0);
				interrupts_blocked<=sp_wdata_i(15 downto 8);
			end if;
		end if;
	end if;
end process control_proc;

end architecture rtl;
--------------------------------------------------------------------- |
-- Interrupt multiplexer |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Manages LXP32 interrupts. Interrupts with lower numbers have |
-- higher priority. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_interrupt_mux is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;

		-- Interrupt request lines; a request is registered on a rising edge
		irq_i: in std_logic_vector(7 downto 0);

		-- Request handshake: valid/vector are presented until ready is seen,
		-- then the multiplexer waits for return before issuing the next one
		interrupt_valid_o: out std_logic;
		interrupt_vector_o: out std_logic_vector(2 downto 0);
		interrupt_ready_i: in std_logic;
		interrupt_return_i: in std_logic;

		-- Write port snooped for writes to the control register (address X"FC")
		sp_waddr_i: in std_logic_vector(7 downto 0);
		sp_we_i: in std_logic;
		sp_wdata_i: in std_logic_vector(31 downto 0)
	);
end entity lxp32_interrupt_mux;

architecture rtl of lxp32_interrupt_mux is

-- Ready: no request outstanding; Requested: a request is being presented;
-- WaitForExit: the request was accepted, waiting for interrupt_return_i
type state_type is (Ready,Requested,WaitForExit);
signal state: state_type:=Ready;

-- Previous sample of irq_i, used for rising edge detection
signal irq_reg: std_logic_vector(irq_i'range):=(others=>'0');

-- One bit per interrupt that has been registered but not yet requested
signal pending_interrupts: std_logic_vector(irq_i'range):=(others=>'0');

signal interrupt_valid: std_logic:='0';

-- Control register contents (bits 7..0 and 15..8 of the written word)
signal interrupts_enabled: std_logic_vector(7 downto 0):=(others=>'0');
signal interrupts_blocked: std_logic_vector(7 downto 0):=(others=>'0');

begin

-- Note: "disabled" interrupts (those with interrupts_enabled(i)='0') are
-- dropped entirely: their pending bit is cleared every cycle, so the
-- handler is not called even if the interrupt is enabled later.
-- "Blocked" interrupts stay pending, but no request is issued for them
-- until they are unblocked.

request_proc: process (clk_i) is
	variable new_requests: std_logic_vector(irq_i'range);
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			irq_reg<=(others=>'0');
			pending_interrupts<=(others=>'0');
			state<=Ready;
			interrupt_valid<='0';
			interrupt_vector_o<=(others=>'-');
		else
			-- Lines that were low on the previous clock and are high now
			new_requests:=irq_i and not irq_reg;
			irq_reg<=irq_i;

			pending_interrupts<=(pending_interrupts or new_requests) and
				interrupts_enabled;

			case state is
			when Ready =>
				-- Scan upwards from index 0: lower interrupts have priority
				for n in pending_interrupts'low to pending_interrupts'high loop
					if pending_interrupts(n)='1' and interrupts_blocked(n)='0' then
						-- Overrides the vector-wide assignment above for bit n
						pending_interrupts(n)<='0';
						interrupt_valid<='1';
						interrupt_vector_o<=std_logic_vector(to_unsigned(n,3));
						state<=Requested;
						exit;
					end if;
				end loop;
			when Requested =>
				if interrupt_ready_i='1' then
					interrupt_valid<='0';
					state<=WaitForExit;
				end if;
			when WaitForExit =>
				if interrupt_return_i='1' then
					state<=Ready;
				end if;
			end case;
		end if;
	end if;
end process request_proc;

interrupt_valid_o<=interrupt_valid;

-- Control register: updated by a write to address X"FC"
control_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			interrupts_enabled<=(others=>'0');
			interrupts_blocked<=(others=>'0');
		elsif sp_we_i='1' then
			if sp_waddr_i=X"FC" then
				interrupts_enabled<=sp_wdata_i(7 downto 0);
				interrupts_blocked<=sp_wdata_i(15 downto 8);
			end if;
		end if;
	end if;
end process control_proc;

end architecture rtl;
/lxp32_mul16x16.vhd
1,36 → 1,36
--------------------------------------------------------------------- |
-- A basic parallel 16x16 multiplier with an output register |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- A straightforward behavioral description. Can be replaced |
-- with a library component wrapper if needed. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul16x16 is
	port(
		clk_i: in std_logic;
		-- Unsigned 16-bit factors
		a_i: in std_logic_vector(15 downto 0);
		b_i: in std_logic_vector(15 downto 0);
		-- Registered 32-bit product, valid one clock after the factors
		p_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_mul16x16;

architecture rtl of lxp32_mul16x16 is

begin

-- Unsigned multiplication followed by a single output register
mul_reg: process (clk_i) is
	variable product: unsigned(31 downto 0);
begin
	if rising_edge(clk_i) then
		-- The variable is written before it is read on every clock edge,
		-- so it does not add a storage element of its own
		product:=unsigned(a_i)*unsigned(b_i);
		p_o<=std_logic_vector(product);
	end if;
end process mul_reg;

end architecture rtl;
--------------------------------------------------------------------- |
-- A basic parallel 16x16 multiplier with an output register |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- A straightforward behavioral description. Can be replaced |
-- with a library component wrapper if needed. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul16x16 is
	port(
		clk_i: in std_logic;
		-- Unsigned 16-bit factors
		a_i: in std_logic_vector(15 downto 0);
		b_i: in std_logic_vector(15 downto 0);
		-- Registered 32-bit product, valid one clock after the factors
		p_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_mul16x16;

architecture rtl of lxp32_mul16x16 is

begin

-- Unsigned multiplication followed by a single output register
mul_reg: process (clk_i) is
	variable product: unsigned(31 downto 0);
begin
	if rising_edge(clk_i) then
		-- The variable is written before it is read on every clock edge,
		-- so it does not add a storage element of its own
		product:=unsigned(a_i)*unsigned(b_i);
		p_o<=std_logic_vector(product);
	end if;
end process mul_reg;

end architecture rtl;
/lxp32_mul_dsp.vhd
1,82 → 1,82
--------------------------------------------------------------------- |
-- DSP multiplier |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This multiplier is designed for technologies that provide fast |
-- 16x16 multipliers, including most modern FPGA families. One |
-- multiplication takes 2 cycles. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul_dsp is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe and operands
		ce_i: in std_logic;
		op1_i: in std_logic_vector(31 downto 0);
		op2_i: in std_logic_vector(31 downto 0);
		-- Completion strobe (ce_i delayed by one clock) and the
		-- lower 32 bits of the product
		ce_o: out std_logic;
		result_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_mul_dsp;

architecture rtl of lxp32_mul_dsp is

-- Registered partial products: ppXY = op1 half X times op2 half Y
-- (0 = bits 15..0, 1 = bits 31..16)
signal pp00: std_logic_vector(31 downto 0);
signal pp01: std_logic_vector(31 downto 0);
signal pp10: std_logic_vector(31 downto 0);

-- Bits 31..16 of the result
signal upper_half: unsigned(15 downto 0);

signal ceo: std_logic:='0';

begin

-- op1(31..16)*op2(31..16) is not computed: it only affects product
-- bits 32 and above, which are not part of the 32-bit result.

-- low(op1) * low(op2)
mul00_inst: entity work.lxp32_mul16x16
	port map(
		clk_i=>clk_i,
		a_i=>op1_i(15 downto 0),
		b_i=>op2_i(15 downto 0),
		p_o=>pp00
	);

-- low(op1) * high(op2)
mul01_inst: entity work.lxp32_mul16x16
	port map(
		clk_i=>clk_i,
		a_i=>op1_i(15 downto 0),
		b_i=>op2_i(31 downto 16),
		p_o=>pp01
	);

-- high(op1) * low(op2)
mul10_inst: entity work.lxp32_mul16x16
	port map(
		clk_i=>clk_i,
		a_i=>op1_i(31 downto 16),
		b_i=>op2_i(15 downto 0),
		p_o=>pp10
	);

-- The cross products are weighted by 2**16, so only their lower halves
-- reach the upper half of the result; the sum wraps at 16 bits.
upper_half<=unsigned(pp00(31 downto 16))+
	unsigned(pp01(15 downto 0))+
	unsigned(pp10(15 downto 0));

-- The lower half comes straight from pp00
result_o<=std_logic_vector(upper_half)&pp00(15 downto 0);

-- Completion strobe follows the start strobe by one clock
strobe_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
		else
			ceo<=ce_i;
		end if;
	end if;
end process strobe_proc;

ce_o<=ceo;

end architecture rtl;
--------------------------------------------------------------------- |
-- DSP multiplier |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This multiplier is designed for technologies that provide fast |
-- 16x16 multipliers, including most modern FPGA families. One |
-- multiplication takes 2 cycles. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul_dsp is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe and operands
		ce_i: in std_logic;
		op1_i: in std_logic_vector(31 downto 0);
		op2_i: in std_logic_vector(31 downto 0);
		-- Completion strobe (ce_i delayed by one clock) and the
		-- lower 32 bits of the product
		ce_o: out std_logic;
		result_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_mul_dsp;

architecture rtl of lxp32_mul_dsp is

-- Registered partial products: ppXY = op1 half X times op2 half Y
-- (0 = bits 15..0, 1 = bits 31..16)
signal pp00: std_logic_vector(31 downto 0);
signal pp01: std_logic_vector(31 downto 0);
signal pp10: std_logic_vector(31 downto 0);

-- Bits 31..16 of the result
signal upper_half: unsigned(15 downto 0);

signal ceo: std_logic:='0';

begin

-- op1(31..16)*op2(31..16) is not computed: it only affects product
-- bits 32 and above, which are not part of the 32-bit result.

-- low(op1) * low(op2)
mul00_inst: entity work.lxp32_mul16x16
	port map(
		clk_i=>clk_i,
		a_i=>op1_i(15 downto 0),
		b_i=>op2_i(15 downto 0),
		p_o=>pp00
	);

-- low(op1) * high(op2)
mul01_inst: entity work.lxp32_mul16x16
	port map(
		clk_i=>clk_i,
		a_i=>op1_i(15 downto 0),
		b_i=>op2_i(31 downto 16),
		p_o=>pp01
	);

-- high(op1) * low(op2)
mul10_inst: entity work.lxp32_mul16x16
	port map(
		clk_i=>clk_i,
		a_i=>op1_i(31 downto 16),
		b_i=>op2_i(15 downto 0),
		p_o=>pp10
	);

-- The cross products are weighted by 2**16, so only their lower halves
-- reach the upper half of the result; the sum wraps at 16 bits.
upper_half<=unsigned(pp00(31 downto 16))+
	unsigned(pp01(15 downto 0))+
	unsigned(pp10(15 downto 0));

-- The lower half comes straight from pp00
result_o<=std_logic_vector(upper_half)&pp00(15 downto 0);

-- Completion strobe follows the start strobe by one clock
strobe_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
		else
			ceo<=ce_i;
		end if;
	end if;
end process strobe_proc;

ce_o<=ceo;

end architecture rtl;
/lxp32_mul_opt.vhd
1,168 → 1,168
--------------------------------------------------------------------- |
-- Optimized multiplier |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This multiplier is designed for technologies that don't provide |
-- fast 16x16 multipliers. One multiplication takes 6 cycles. |
-- |
-- The multiplication algorithm is based on carry-save accumulation |
-- of partial products. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul_opt is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe and operands (sampled when ce_i='1')
		ce_i: in std_logic;
		op1_i: in std_logic_vector(31 downto 0);
		op2_i: in std_logic_vector(31 downto 0);
		-- Completion strobe and the lower 32 bits of the product
		ce_o: out std_logic;
		result_o: out std_logic_vector(31 downto 0)
	);
end entity;

architecture rtl of lxp32_mul_opt is

-- Sum output of an n-bit carry-save adder: bitwise XOR of bits n-1..0 of
-- the three arguments. Arguments are indexed directly, so each actual
-- must have an index range that includes n-1 downto 0.
function csa_sum(a: unsigned; b: unsigned; c: unsigned; n: integer) return unsigned is
	variable r: unsigned(n-1 downto 0);
begin
	for i in r'range loop
		r(i):=a(i) xor b(i) xor c(i);
	end loop;
	return r;
end function;

-- Carry output of an n-bit carry-save adder: bitwise majority of bits
-- n-1..0 of the three arguments, shifted left by one position. The
-- result is therefore n+1 bits wide.
function csa_carry(a: unsigned; b: unsigned; c: unsigned; n: integer) return unsigned is
	variable r: unsigned(n-1 downto 0);
begin
	for i in r'range loop
		r(i):=(a(i) and b(i)) or (a(i) and c(i)) or (b(i) and c(i));
	end loop;
	return r&"0";
end function;

-- Operand shift registers: reg1 moves left and reg2 moves right by
-- 8 bits per step
signal reg1: unsigned(op1_i'range);
signal reg2: unsigned(op2_i'range);

-- Partial products for the 8 least significant bits of reg2
type pp_type is array (7 downto 0) of unsigned(31 downto 0);
signal pp: pp_type;

-- Sum and carry outputs of the carry-save adder tree nodes
type pp_sum_type is array (7 downto 0) of unsigned(31 downto 0);
signal pp_sum: pp_sum_type;

type pp_carry_type is array (7 downto 0) of unsigned(32 downto 0);
signal pp_carry: pp_carry_type;

-- Accumulator kept in carry-save form (value = acc_sum+acc_carry)
signal acc_sum: unsigned(31 downto 0);
signal acc_carry: unsigned(31 downto 0);

-- Step counter, loaded with 4 when a multiplication starts
signal cnt: integer range 0 to 4:=0;

signal result: std_logic_vector(result_o'range);
signal ceo: std_logic:='0';

begin

-- Calculate 8 partial products in parallel

pp_gen: for i in pp'range generate
	pp(i)<=shift_left(reg1,i) when reg2(i)='1' else (others=>'0');
end generate;

-- Add partial products to the accumulator using carry-save adder tree.
-- Bit 32 of each carry vector is never read by the csa functions, so
-- the whole tree works modulo 2**32.

pp_sum(0)<=csa_sum(pp(0),pp(1),pp(2),32);
pp_carry(0)<=csa_carry(pp(0),pp(1),pp(2),32);

pp_sum(1)<=csa_sum(pp(3),pp(4),pp(5),32);
pp_carry(1)<=csa_carry(pp(3),pp(4),pp(5),32);

pp_sum(2)<=csa_sum(pp(6),pp(7),acc_sum,32);
pp_carry(2)<=csa_carry(pp(6),pp(7),acc_sum,32);

pp_sum(3)<=csa_sum(pp_sum(0),pp_carry(0),pp_sum(1),32);
pp_carry(3)<=csa_carry(pp_sum(0),pp_carry(0),pp_sum(1),32);

pp_sum(4)<=csa_sum(pp_carry(1),pp_sum(2),pp_carry(2),32);
pp_carry(4)<=csa_carry(pp_carry(1),pp_sum(2),pp_carry(2),32);

pp_sum(5)<=csa_sum(pp_sum(3),pp_carry(3),pp_sum(4),32);
pp_carry(5)<=csa_carry(pp_sum(3),pp_carry(3),pp_sum(4),32);

pp_sum(6)<=csa_sum(pp_sum(5),pp_carry(5),pp_carry(4),32);
pp_carry(6)<=csa_carry(pp_sum(5),pp_carry(5),pp_carry(4),32);

pp_sum(7)<=csa_sum(pp_sum(6),pp_carry(6),acc_carry,32);
pp_carry(7)<=csa_carry(pp_sum(6),pp_carry(6),acc_carry,32);

-- Multiplier state machine

process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
			cnt<=0;
			reg1<=(others=>'-');
			reg2<=(others=>'-');
			acc_sum<=(others=>'-');
			acc_carry<=(others=>'-');
		else
			-- Completion strobe: a single pulse as the counter expires
			if cnt=1 then
				ceo<='1';
			else
				ceo<='0';
			end if;

			if ce_i='1' then
				-- Start: load the operands and clear the accumulator
				cnt<=4;
				reg1<=unsigned(op1_i);
				reg2<=unsigned(op2_i);
				acc_sum<=(others=>'0');
				acc_carry<=(others=>'0');
			else
				-- Step: accumulate 8 partial products, then move on to
				-- the next byte of the second operand
				acc_sum<=pp_sum(7);
				acc_carry<=pp_carry(7)(acc_carry'range);
				reg1<=reg1(reg1'high-8 downto 0)&X"00";
				reg2<=X"00"&reg2(reg2'high downto 8);
				if cnt>0 then
					cnt<=cnt-1;
				end if;
			end if;
		end if;
	end if;
end process;

-- Resolve the carry-save accumulator into the final result
result<=std_logic_vector(acc_sum+acc_carry);

result_o<=result;
ce_o<=ceo;

-- A simulation-time multiplication check

-- synthesis translate_off

process (clk_i) is
	variable p: unsigned(op1_i'length+op2_i'length-1 downto 0);
begin
	if rising_edge(clk_i) then
		if ce_i='1' then
			-- Reference product of the operands being loaded
			p:=unsigned(op1_i)*unsigned(op2_i);
		elsif ceo='1' then
			assert result=std_logic_vector(p(result'range))
				report "Incorrect multiplication result"
				severity failure;
		end if;
	end if;
end process;

-- synthesis translate_on

end architecture;
--------------------------------------------------------------------- |
-- Optimized multiplier |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This multiplier is designed for technologies that don't provide |
-- fast 16x16 multipliers. One multiplication takes 6 cycles. |
-- |
-- The multiplication algorithm is based on carry-save accumulation |
-- of partial products. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul_opt is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe and operands (sampled when ce_i='1')
		ce_i: in std_logic;
		op1_i: in std_logic_vector(31 downto 0);
		op2_i: in std_logic_vector(31 downto 0);
		-- Completion strobe and the lower 32 bits of the product
		ce_o: out std_logic;
		result_o: out std_logic_vector(31 downto 0)
	);
end entity;

architecture rtl of lxp32_mul_opt is

-- Sum output of an n-bit carry-save adder: bitwise XOR of bits n-1..0 of
-- the three arguments. Arguments are indexed directly, so each actual
-- must have an index range that includes n-1 downto 0.
function csa_sum(a: unsigned; b: unsigned; c: unsigned; n: integer) return unsigned is
	variable r: unsigned(n-1 downto 0);
begin
	for i in r'range loop
		r(i):=a(i) xor b(i) xor c(i);
	end loop;
	return r;
end function;

-- Carry output of an n-bit carry-save adder: bitwise majority of bits
-- n-1..0 of the three arguments, shifted left by one position. The
-- result is therefore n+1 bits wide.
function csa_carry(a: unsigned; b: unsigned; c: unsigned; n: integer) return unsigned is
	variable r: unsigned(n-1 downto 0);
begin
	for i in r'range loop
		r(i):=(a(i) and b(i)) or (a(i) and c(i)) or (b(i) and c(i));
	end loop;
	return r&"0";
end function;

-- Operand shift registers: reg1 moves left and reg2 moves right by
-- 8 bits per step
signal reg1: unsigned(op1_i'range);
signal reg2: unsigned(op2_i'range);

-- Partial products for the 8 least significant bits of reg2
type pp_type is array (7 downto 0) of unsigned(31 downto 0);
signal pp: pp_type;

-- Sum and carry outputs of the carry-save adder tree nodes
type pp_sum_type is array (7 downto 0) of unsigned(31 downto 0);
signal pp_sum: pp_sum_type;

type pp_carry_type is array (7 downto 0) of unsigned(32 downto 0);
signal pp_carry: pp_carry_type;

-- Accumulator kept in carry-save form (value = acc_sum+acc_carry)
signal acc_sum: unsigned(31 downto 0);
signal acc_carry: unsigned(31 downto 0);

-- Step counter, loaded with 4 when a multiplication starts
signal cnt: integer range 0 to 4:=0;

signal result: std_logic_vector(result_o'range);
signal ceo: std_logic:='0';

begin

-- Calculate 8 partial products in parallel

pp_gen: for i in pp'range generate
	pp(i)<=shift_left(reg1,i) when reg2(i)='1' else (others=>'0');
end generate;

-- Add partial products to the accumulator using carry-save adder tree.
-- Bit 32 of each carry vector is never read by the csa functions, so
-- the whole tree works modulo 2**32.

pp_sum(0)<=csa_sum(pp(0),pp(1),pp(2),32);
pp_carry(0)<=csa_carry(pp(0),pp(1),pp(2),32);

pp_sum(1)<=csa_sum(pp(3),pp(4),pp(5),32);
pp_carry(1)<=csa_carry(pp(3),pp(4),pp(5),32);

pp_sum(2)<=csa_sum(pp(6),pp(7),acc_sum,32);
pp_carry(2)<=csa_carry(pp(6),pp(7),acc_sum,32);

pp_sum(3)<=csa_sum(pp_sum(0),pp_carry(0),pp_sum(1),32);
pp_carry(3)<=csa_carry(pp_sum(0),pp_carry(0),pp_sum(1),32);

pp_sum(4)<=csa_sum(pp_carry(1),pp_sum(2),pp_carry(2),32);
pp_carry(4)<=csa_carry(pp_carry(1),pp_sum(2),pp_carry(2),32);

pp_sum(5)<=csa_sum(pp_sum(3),pp_carry(3),pp_sum(4),32);
pp_carry(5)<=csa_carry(pp_sum(3),pp_carry(3),pp_sum(4),32);

pp_sum(6)<=csa_sum(pp_sum(5),pp_carry(5),pp_carry(4),32);
pp_carry(6)<=csa_carry(pp_sum(5),pp_carry(5),pp_carry(4),32);

pp_sum(7)<=csa_sum(pp_sum(6),pp_carry(6),acc_carry,32);
pp_carry(7)<=csa_carry(pp_sum(6),pp_carry(6),acc_carry,32);

-- Multiplier state machine

process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
			cnt<=0;
			reg1<=(others=>'-');
			reg2<=(others=>'-');
			acc_sum<=(others=>'-');
			acc_carry<=(others=>'-');
		else
			-- Completion strobe: a single pulse as the counter expires
			if cnt=1 then
				ceo<='1';
			else
				ceo<='0';
			end if;

			if ce_i='1' then
				-- Start: load the operands and clear the accumulator
				cnt<=4;
				reg1<=unsigned(op1_i);
				reg2<=unsigned(op2_i);
				acc_sum<=(others=>'0');
				acc_carry<=(others=>'0');
			else
				-- Step: accumulate 8 partial products, then move on to
				-- the next byte of the second operand
				acc_sum<=pp_sum(7);
				acc_carry<=pp_carry(7)(acc_carry'range);
				reg1<=reg1(reg1'high-8 downto 0)&X"00";
				reg2<=X"00"&reg2(reg2'high downto 8);
				if cnt>0 then
					cnt<=cnt-1;
				end if;
			end if;
		end if;
	end if;
end process;

-- Resolve the carry-save accumulator into the final result
result<=std_logic_vector(acc_sum+acc_carry);

result_o<=result;
ce_o<=ceo;

-- A simulation-time multiplication check

-- synthesis translate_off

process (clk_i) is
	variable p: unsigned(op1_i'length+op2_i'length-1 downto 0);
begin
	if rising_edge(clk_i) then
		if ce_i='1' then
			-- Reference product of the operands being loaded
			p:=unsigned(op1_i)*unsigned(op2_i);
		elsif ceo='1' then
			assert result=std_logic_vector(p(result'range))
				report "Incorrect multiplication result"
				severity failure;
		end if;
	end if;
end process;

-- synthesis translate_on

end architecture;
/lxp32_mul_seq.vhd
1,77 → 1,77
--------------------------------------------------------------------- |
-- Sequential multiplier |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The smallest possible multiplier. Implemented using |
-- an accumulator. One multiplication takes 34 cycles. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul_seq is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe and operands (sampled when ce_i='1')
		ce_i: in std_logic;
		op1_i: in std_logic_vector(31 downto 0);
		op2_i: in std_logic_vector(31 downto 0);
		-- Completion strobe and the lower 32 bits of the product
		ce_o: out std_logic;
		result_o: out std_logic_vector(31 downto 0)
	);
end entity;

architecture rtl of lxp32_mul_seq is

-- Operand shift registers: reg1 moves left and reg2 moves right by
-- one bit per step
signal reg1: unsigned(op1_i'range);
signal reg2: unsigned(op2_i'range);
-- Partial product for the current least significant bit of reg2
signal pp: unsigned(31 downto 0);
-- Running sum of partial products (wraps at 32 bits)
signal acc_sum: unsigned(31 downto 0);
-- Step counter, loaded with 32 when a multiplication starts
signal cnt: integer range 0 to 32:=0;
signal ceo: std_logic:='0';

begin

pp<=reg1 when reg2(0)='1' else (others=>'0');

process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
			cnt<=0;
			reg1<=(others=>'-');
			reg2<=(others=>'-');
			acc_sum<=(others=>'-');
		else
			-- Completion strobe: a single pulse as the counter expires
			if cnt=1 then
				ceo<='1';
			else
				ceo<='0';
			end if;

			if ce_i='1' then
				-- Start: load the operands and clear the accumulator
				cnt<=32;
				reg1<=unsigned(op1_i);
				reg2<=unsigned(op2_i);
				acc_sum<=(others=>'0');
			else
				-- Step: add the current partial product, then move on to
				-- the next bit of the second operand
				acc_sum<=acc_sum+pp;
				reg1<=reg1(reg1'high-1 downto 0)&"0";
				reg2<="0"&reg2(reg2'high downto 1);
				if cnt>0 then
					cnt<=cnt-1;
				end if;
			end if;
		end if;
	end if;
end process;

result_o<=std_logic_vector(acc_sum);
ce_o<=ceo;

end architecture;
--------------------------------------------------------------------- |
-- Sequential multiplier |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- The smallest possible multiplier. Implemented using |
-- an accumulator. One multiplication takes 34 cycles. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_mul_seq is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe and operands (sampled when ce_i='1')
		ce_i: in std_logic;
		op1_i: in std_logic_vector(31 downto 0);
		op2_i: in std_logic_vector(31 downto 0);
		-- Completion strobe and the lower 32 bits of the product
		ce_o: out std_logic;
		result_o: out std_logic_vector(31 downto 0)
	);
end entity;

architecture rtl of lxp32_mul_seq is

-- Operand shift registers: reg1 moves left and reg2 moves right by
-- one bit per step
signal reg1: unsigned(op1_i'range);
signal reg2: unsigned(op2_i'range);
-- Partial product for the current least significant bit of reg2
signal pp: unsigned(31 downto 0);
-- Running sum of partial products (wraps at 32 bits)
signal acc_sum: unsigned(31 downto 0);
-- Step counter, loaded with 32 when a multiplication starts
signal cnt: integer range 0 to 32:=0;
signal ceo: std_logic:='0';

begin

pp<=reg1 when reg2(0)='1' else (others=>'0');

process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
			cnt<=0;
			reg1<=(others=>'-');
			reg2<=(others=>'-');
			acc_sum<=(others=>'-');
		else
			-- Completion strobe: a single pulse as the counter expires
			if cnt=1 then
				ceo<='1';
			else
				ceo<='0';
			end if;

			if ce_i='1' then
				-- Start: load the operands and clear the accumulator
				cnt<=32;
				reg1<=unsigned(op1_i);
				reg2<=unsigned(op2_i);
				acc_sum<=(others=>'0');
			else
				-- Step: add the current partial product, then move on to
				-- the next bit of the second operand
				acc_sum<=acc_sum+pp;
				reg1<=reg1(reg1'high-1 downto 0)&"0";
				reg2<="0"&reg2(reg2'high downto 1);
				if cnt>0 then
					cnt<=cnt-1;
				end if;
			end if;
		end if;
	end if;
end process;

result_o<=std_logic_vector(acc_sum);
ce_o<=ceo;

end architecture;
/lxp32_ram256x32.vhd
1,70 → 1,70
--------------------------------------------------------------------- |
-- Generic dual-port memory |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Portable description of a dual-port memory block with one write |
-- port. Major FPGA synthesis tools can infer on-chip block RAM |
-- from this description. Can be replaced with a library component |
-- wrapper if needed. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_ram256x32 is
	port(
		clk_i: in std_logic;

		-- Write port: wdata_i is stored at waddr_i when we_i='1'
		we_i: in std_logic;
		waddr_i: in std_logic_vector(7 downto 0);
		wdata_i: in std_logic_vector(31 downto 0);

		-- Read port: rdata_o is a register, updated only when re_i='1'
		re_i: in std_logic;
		raddr_i: in std_logic_vector(7 downto 0);
		rdata_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_ram256x32;

architecture rtl of lxp32_ram256x32 is

-- 256 words of 32 bits
type ram_type is array(255 downto 0) of std_logic_vector(31 downto 0);
-- Zero-initialized for SRAM-based FPGAs
signal ram: ram_type:=(others=>(others=>'0'));

-- Vendor-specific synthesis hints for the memory array
attribute syn_ramstyle: string;
attribute syn_ramstyle of ram: signal is "no_rw_check";
attribute ram_style: string; -- for Xilinx
attribute ram_style of ram: signal is "block";

begin

-- Synchronous write port

write_port: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if we_i='1' then
			ram(to_integer(unsigned(waddr_i)))<=wdata_i;
		end if;
	end if;
end process write_port;

-- Synchronous read port

read_port: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if re_i='1' then
			if not is_x(raddr_i) then
				rdata_o<=ram(to_integer(unsigned(raddr_i)));
			else
				-- Propagate the unknown address as unknown data instead
				-- of converting it, which would make numeric_std emit
				-- warnings during simulation
				rdata_o<=(others=>'X');
			end if;
		end if;
	end if;
end process read_port;

end architecture rtl;
--------------------------------------------------------------------- |
-- Generic dual-port memory |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Portable description of a dual-port memory block with one write |
-- port. Major FPGA synthesis tools can infer on-chip block RAM |
-- from this description. Can be replaced with a library component |
-- wrapper if needed. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
use ieee.numeric_std.all; |
|
entity lxp32_ram256x32 is
	port(
		clk_i: in std_logic;

		-- Write port: wdata_i is stored at waddr_i when we_i='1'
		we_i: in std_logic;
		waddr_i: in std_logic_vector(7 downto 0);
		wdata_i: in std_logic_vector(31 downto 0);

		-- Read port: rdata_o is a register, updated only when re_i='1'
		re_i: in std_logic;
		raddr_i: in std_logic_vector(7 downto 0);
		rdata_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_ram256x32;

architecture rtl of lxp32_ram256x32 is

-- 256 words of 32 bits
type ram_type is array(255 downto 0) of std_logic_vector(31 downto 0);
-- Zero-initialized for SRAM-based FPGAs
signal ram: ram_type:=(others=>(others=>'0'));

-- Vendor-specific synthesis hints for the memory array
attribute syn_ramstyle: string;
attribute syn_ramstyle of ram: signal is "no_rw_check";
attribute ram_style: string; -- for Xilinx
attribute ram_style of ram: signal is "block";

begin

-- Synchronous write port

write_port: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if we_i='1' then
			ram(to_integer(unsigned(waddr_i)))<=wdata_i;
		end if;
	end if;
end process write_port;

-- Synchronous read port

read_port: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if re_i='1' then
			if not is_x(raddr_i) then
				rdata_o<=ram(to_integer(unsigned(raddr_i)));
			else
				-- Propagate the unknown address as unknown data instead
				-- of converting it, which would make numeric_std emit
				-- warnings during simulation
				rdata_o<=(others=>'X');
			end if;
		end if;
	end if;
end process read_port;

end architecture rtl;
/lxp32_scratchpad.vhd
1,93 → 1,93
--------------------------------------------------------------------- |
-- Scratchpad |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- LXP32 register file implemented as a RAM block. Since we need |
-- to read two registers simultaneously, the memory is duplicated. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_scratchpad is
	port(
		clk_i: in std_logic;

		-- Two independent read ports; data appears one clock after
		-- the address
		raddr1_i: in std_logic_vector(7 downto 0);
		rdata1_o: out std_logic_vector(31 downto 0);
		raddr2_i: in std_logic_vector(7 downto 0);
		rdata2_o: out std_logic_vector(31 downto 0);

		-- Single write port, shared by both memory copies
		waddr_i: in std_logic_vector(7 downto 0);
		we_i: in std_logic;
		wdata_i: in std_logic_vector(31 downto 0)
	);
end entity lxp32_scratchpad;

architecture rtl of lxp32_scratchpad is

-- Raw outputs of the two memory copies
signal ram1_rdata: std_logic_vector(31 downto 0);
signal ram2_rdata: std_logic_vector(31 downto 0);

-- Set when the previous clock wrote the address that was being read
signal ram1_collision: std_logic;
signal ram2_collision: std_logic;

-- Data written on the previous clock, forwarded on a collision
signal wdata_reg: std_logic_vector(wdata_i'range);

begin

-- Both memories receive every write, so they always hold the same data;
-- each one serves one of the read ports.

-- RAM 1 (read port 1)

ram_inst1: entity work.lxp32_ram256x32(rtl)
	port map(
		clk_i=>clk_i,

		we_i=>we_i,
		waddr_i=>waddr_i,
		wdata_i=>wdata_i,

		re_i=>'1',
		raddr_i=>raddr1_i,
		rdata_o=>ram1_rdata
	);

-- RAM 2 (read port 2)

ram_inst2: entity work.lxp32_ram256x32(rtl)
	port map(
		clk_i=>clk_i,

		we_i=>we_i,
		waddr_i=>waddr_i,
		wdata_i=>wdata_i,

		re_i=>'1',
		raddr_i=>raddr2_i,
		rdata_o=>ram2_rdata
	);

-- Read/write collision detection: remember whether a write hit the
-- address presented on either read port, together with the written data

collision_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		wdata_reg<=wdata_i;

		-- Default: no collision
		ram1_collision<='0';
		ram2_collision<='0';

		if we_i='1' then
			if waddr_i=raddr1_i then
				ram1_collision<='1';
			end if;
			if waddr_i=raddr2_i then
				ram2_collision<='1';
			end if;
		end if;
	end if;
end process collision_proc;

-- On a collision, bypass the memory output with the just-written data

rdata1_o<=wdata_reg when ram1_collision/='0' else ram1_rdata;
rdata2_o<=wdata_reg when ram2_collision/='0' else ram2_rdata;

end architecture rtl;
--------------------------------------------------------------------- |
-- Scratchpad |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- LXP32 register file implemented as a RAM block. Since we need |
-- to read two registers simultaneously, the memory is duplicated. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_scratchpad is
	port(
		clk_i: in std_logic;

		-- Two independent read ports; data appears one clock after
		-- the address
		raddr1_i: in std_logic_vector(7 downto 0);
		rdata1_o: out std_logic_vector(31 downto 0);
		raddr2_i: in std_logic_vector(7 downto 0);
		rdata2_o: out std_logic_vector(31 downto 0);

		-- Single write port, shared by both memory copies
		waddr_i: in std_logic_vector(7 downto 0);
		we_i: in std_logic;
		wdata_i: in std_logic_vector(31 downto 0)
	);
end entity lxp32_scratchpad;

architecture rtl of lxp32_scratchpad is

-- Raw outputs of the two memory copies
signal ram1_rdata: std_logic_vector(31 downto 0);
signal ram2_rdata: std_logic_vector(31 downto 0);

-- Set when the previous clock wrote the address that was being read
signal ram1_collision: std_logic;
signal ram2_collision: std_logic;

-- Data written on the previous clock, forwarded on a collision
signal wdata_reg: std_logic_vector(wdata_i'range);

begin

-- Both memories receive every write, so they always hold the same data;
-- each one serves one of the read ports.

-- RAM 1 (read port 1)

ram_inst1: entity work.lxp32_ram256x32(rtl)
	port map(
		clk_i=>clk_i,

		we_i=>we_i,
		waddr_i=>waddr_i,
		wdata_i=>wdata_i,

		re_i=>'1',
		raddr_i=>raddr1_i,
		rdata_o=>ram1_rdata
	);

-- RAM 2 (read port 2)

ram_inst2: entity work.lxp32_ram256x32(rtl)
	port map(
		clk_i=>clk_i,

		we_i=>we_i,
		waddr_i=>waddr_i,
		wdata_i=>wdata_i,

		re_i=>'1',
		raddr_i=>raddr2_i,
		rdata_o=>ram2_rdata
	);

-- Read/write collision detection: remember whether a write hit the
-- address presented on either read port, together with the written data

collision_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		wdata_reg<=wdata_i;

		-- Default: no collision
		ram1_collision<='0';
		ram2_collision<='0';

		if we_i='1' then
			if waddr_i=raddr1_i then
				ram1_collision<='1';
			end if;
			if waddr_i=raddr2_i then
				ram2_collision<='1';
			end if;
		end if;
	end if;
end process collision_proc;

-- On a collision, bypass the memory output with the just-written data

rdata1_o<=wdata_reg when ram1_collision/='0' else ram1_rdata;
rdata2_o<=wdata_reg when ram2_collision/='0' else ram2_rdata;

end architecture rtl;
/lxp32_shifter.vhd
1,99 → 1,99
--------------------------------------------------------------------- |
-- Barrel shifter |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Performs logical (unsigned) and arithmetic (signed) shifts |
-- in both directions. Pipeline latency: 1 cycle. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_shifter is
	port(
		-- Clock and synchronous reset
		clk_i: in std_logic;
		rst_i: in std_logic;
		-- Start strobe
		ce_i: in std_logic;
		-- Data to shift and shift distance (0..31)
		d_i: in std_logic_vector(31 downto 0);
		s_i: in std_logic_vector(4 downto 0);
		-- '1' selects a right shift, '0' a left shift
		right_i: in std_logic;
		-- '1' requests a signed (arithmetic) shift
		sig_i: in std_logic;
		-- Completion strobe (ce_i delayed by one clock) and the result
		ce_o: out std_logic;
		d_o: out std_logic_vector(31 downto 0)
	);
end entity lxp32_shifter;

architecture rtl of lxp32_shifter is

-- Input data, bit-reversed for right shifts
signal data: std_logic_vector(d_i'range);
-- Final result with the original bit order restored
signal data_shifted: std_logic_vector(d_i'range);

-- Value shifted into vacated positions
signal fill: std_logic;
signal fill_v: std_logic_vector(3 downto 0);

-- cascades(k) is the output of the stage that shifts by 2**k
type cascades_type is array (4 downto 0) of std_logic_vector(d_i'range);
signal cascades: cascades_type;

-- Pipeline registers between the shift-by-4 and shift-by-8 stages
signal stage2_data: std_logic_vector(d_i'range);
signal stage2_s: std_logic_vector(s_i'range);
signal stage2_fill: std_logic;
signal stage2_fill_v: std_logic_vector(15 downto 0);
signal stage2_right: std_logic;

signal ceo: std_logic:='0';

begin

-- All shifting is done towards the left. A right shift is performed by
-- reversing the bit order of the argument, shifting left, and reversing
-- the bit order of the result.

data_gen: for i in data'range generate
	data(i)<=d_i(i) when right_i='0' else d_i(d_i'high-i);
end generate;

-- First pipeline stage: shifts by 1, 2 and 4

-- For a right shift data(0) is d_i(31), i.e. the sign bit.
-- NOTE(review): for a left shift data(0) is d_i(0), so sig_i is
-- presumably expected to be '0' for left shifts - confirm with the caller.
fill<=sig_i and data(0);
fill_v<=(others=>fill);

cascades(0)<=data(30 downto 0)&fill_v(0) when s_i(0)='1' else data;
cascades(1)<=cascades(0)(29 downto 0)&fill_v(1 downto 0) when s_i(1)='1' else cascades(0);
cascades(2)<=cascades(1)(27 downto 0)&fill_v(3 downto 0) when s_i(2)='1' else cascades(1);

-- Pipeline register: carries the intermediate result together with
-- everything the second stage still needs

pipeline_proc: process (clk_i) is
begin
	if rising_edge(clk_i) then
		if rst_i='1' then
			ceo<='0';
			stage2_data<=(others=>'-');
			stage2_s<=(others=>'-');
			stage2_fill<='-';
			stage2_right<='-';
		else
			ceo<=ce_i;
			stage2_data<=cascades(2);
			stage2_s<=s_i;
			stage2_fill<=fill;
			stage2_right<=right_i;
		end if;
	end if;
end process pipeline_proc;

-- Second pipeline stage: shifts by 8 and 16

stage2_fill_v<=(others=>stage2_fill);

cascades(3)<=stage2_data(23 downto 0)&stage2_fill_v(7 downto 0) when stage2_s(3)='1' else stage2_data;
cascades(4)<=cascades(3)(15 downto 0)&stage2_fill_v(15 downto 0) when stage2_s(4)='1' else cascades(3);

-- Reverse bit order back, if needed

data_shifted_gen: for i in data_shifted'range generate
	data_shifted(i)<=cascades(4)(i) when stage2_right='0' else cascades(4)(cascades(4)'high-i);
end generate;

d_o<=data_shifted;
ce_o<=ceo;

end architecture rtl;
--------------------------------------------------------------------- |
-- Barrel shifter |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- Performs logical (unsigned) and arithmetic (signed) shifts |
-- in both directions. Pipeline latency: 1 cycle. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_shifter is |
-- Ports: |
--   clk_i   : clock; the internal pipeline register updates on its rising edge |
--   rst_i   : synchronous reset; clears ce_o |
--   ce_i    : input strobe, reproduced on ce_o one clock later |
--   d_i     : 32-bit value to be shifted |
--   s_i     : shift distance (5 bits) |
--   right_i : '0' selects a left shift, '1' selects a right shift |
--   sig_i   : when '1', vacated bits are filled with a copy of a data bit |
--             instead of '0' (see "fill" in the architecture) |
--   ce_o    : ce_i delayed by one clock |
--   d_o     : shifted result |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
ce_i: in std_logic; |
d_i: in std_logic_vector(31 downto 0); |
s_i: in std_logic_vector(4 downto 0); |
right_i: in std_logic; |
sig_i: in std_logic; |
ce_o: out std_logic; |
d_o: out std_logic_vector(31 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32_shifter is |
-- Two-stage barrel shifter. Shifts by 1, 2 and 4 are applied |
-- combinationally before the pipeline register; shifts by 8 and 16 |
-- are applied combinationally after it. |
|
signal data: std_logic_vector(d_i'range); |
signal data_shifted: std_logic_vector(d_i'range); |
|
signal fill: std_logic; -- 0 for unsigned shifts, sign bit for signed ones |
signal fill_v: std_logic_vector(3 downto 0); |
|
-- cascades(k) is the value after the conditional shift by 2**k |
type cascades_type is array (4 downto 0) of std_logic_vector(d_i'range); |
signal cascades: cascades_type; |
|
-- Registered copies of the first-stage result and of the control |
-- inputs still needed by the second stage |
signal stage2_data: std_logic_vector(d_i'range); |
signal stage2_s: std_logic_vector(s_i'range); |
signal stage2_fill: std_logic; |
signal stage2_fill_v: std_logic_vector(15 downto 0); |
signal stage2_right: std_logic; |
|
signal ceo: std_logic:='0'; |
|
begin |
|
-- Internally, data are shifted in left direction. For right shifts |
-- we reverse the argument's bit order |
|
data_gen: for i in data'range generate |
data(i)<=d_i(i) when right_i='0' else d_i(d_i'high-i); |
end generate; |
|
-- A set of cascaded shifters shifting by powers of two |
|
-- data(0) equals d_i(31) when right_i='1' (input is bit-reversed), so a |
-- signed right shift fills with the sign bit. |
-- NOTE(review): with sig_i='1' and right_i='0' the fill bit is d_i(0); |
-- presumably callers never assert sig_i for left shifts -- confirm. |
fill<=sig_i and data(0); |
fill_v<=(others=>fill); |
|
-- First stage: conditional shifts by 1, 2 and 4 bit positions |
cascades(0)<=data(30 downto 0)&fill_v(0) when s_i(0)='1' else data; |
cascades(1)<=cascades(0)(29 downto 0)&fill_v(1 downto 0) when s_i(1)='1' else cascades(0); |
cascades(2)<=cascades(1)(27 downto 0)&fill_v(3 downto 0) when s_i(2)='1' else cascades(1); |
|
-- Pipeline register between the two stages. Reset is synchronous: |
-- only ceo gets a defined value, the data-path registers are |
-- assigned don't-care ('-'). |
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
ceo<='0'; |
stage2_data<=(others=>'-'); |
stage2_s<=(others=>'-'); |
stage2_fill<='-'; |
stage2_right<='-'; |
else |
ceo<=ce_i; |
stage2_data<=cascades(2); |
stage2_s<=s_i; |
stage2_fill<=fill; |
stage2_right<=right_i; |
end if; |
end if; |
end process; |
|
stage2_fill_v<=(others=>stage2_fill); |
|
-- Second stage: conditional shifts by 8 and 16 bit positions, |
-- controlled by the registered shift amount |
cascades(3)<=stage2_data(23 downto 0)&stage2_fill_v(7 downto 0) when stage2_s(3)='1' else stage2_data; |
cascades(4)<=cascades(3)(15 downto 0)&stage2_fill_v(15 downto 0) when stage2_s(4)='1' else cascades(3); |
|
-- Reverse bit order back, if needed |
|
data_shifted_gen: for i in data_shifted'range generate |
data_shifted(i)<=cascades(4)(i) when stage2_right='0' else cascades(4)(cascades(4)'high-i); |
end generate; |
|
d_o<=data_shifted; |
ce_o<=ceo; |
|
end architecture; |
/lxp32_ubuf.vhd
1,84 → 1,84
--------------------------------------------------------------------- |
-- Microbuffer |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- A small buffer with a FIFO-like interface, implemented |
-- using registers. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_ubuf is |
-- Generics: |
--   DATA_WIDTH : width in bits of each stored word |
-- Ports: |
--   clk_i   : clock; state updates on its rising edge |
--   rst_i   : synchronous reset; returns the buffer to the empty state |
--   we_i    : write request; ignored while full_o='1' |
--   d_i     : word to store |
--   re_i    : read request; ignored while empty_o='1' |
--   d_o     : word at the head of the buffer (contents of the first register) |
--   empty_o : '1' when the buffer holds no words |
--   full_o  : '1' when both registers are occupied |
generic( |
DATA_WIDTH: integer |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
we_i: in std_logic; |
d_i: in std_logic_vector(DATA_WIDTH-1 downto 0); |
re_i: in std_logic; |
d_o: out std_logic_vector(DATA_WIDTH-1 downto 0); |
|
empty_o: out std_logic; |
full_o: out std_logic |
); |
end entity; |
|
architecture rtl of lxp32_ubuf is |
-- Two-entry buffer: regs(0) is the head (drives d_o), regs(1) holds a |
-- second word that moves into regs(0) when the head is read. |
|
-- Qualified write/read strobes (requests that are actually accepted) |
signal we: std_logic; |
signal re: std_logic; |
|
signal empty: std_logic:='1'; |
signal full: std_logic:='0'; |
|
type regs_type is array (1 downto 0) of std_logic_vector(DATA_WIDTH-1 downto 0); |
signal regs: regs_type; |
-- Per-register "value after an accepted write": either the current |
-- contents or d_i, depending on which slot the write targets |
signal regs_mux: regs_type; |
|
begin |
|
-- Writes to a full buffer and reads from an empty buffer are dropped |
we<=we_i and not full; |
re<=re_i and not empty; |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
empty<='1'; |
full<='0'; |
regs<=(others=>(others=>'-')); |
else |
-- On a read the head is replaced by the second slot (which already |
-- reflects a simultaneous write through regs_mux(1)) |
if re='0' then |
regs(0)<=regs_mux(0); |
else |
regs(0)<=regs_mux(1); |
end if; |
|
regs(1)<=regs_mux(1); |
|
-- Flags change only when exactly one of we/re is active; a |
-- simultaneous read and write leaves the occupancy unchanged |
if we='1' and re='0' then |
empty<='0'; |
full<=not empty; |
elsif we='0' and re='1' then |
empty<=not full; |
full<='0'; |
end if; |
end if; |
end if; |
end process; |
|
-- An accepted write goes to regs(0) when the buffer is empty and to |
-- regs(1) otherwise |
regs_mux(0)<=regs(0) when we='0' or empty='0' else d_i; |
regs_mux(1)<=regs(1) when we='0' or empty='1' else d_i; |
|
d_o<=regs(0); |
empty_o<=empty; |
full_o<=full; |
|
end architecture; |
--------------------------------------------------------------------- |
-- Microbuffer |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- A small buffer with a FIFO-like interface, implemented |
-- using registers. |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32_ubuf is |
-- Generics: |
--   DATA_WIDTH : width in bits of each stored word |
-- Ports: |
--   clk_i   : clock; state updates on its rising edge |
--   rst_i   : synchronous reset; returns the buffer to the empty state |
--   we_i    : write request; ignored while full_o='1' |
--   d_i     : word to store |
--   re_i    : read request; ignored while empty_o='1' |
--   d_o     : word at the head of the buffer (contents of the first register) |
--   empty_o : '1' when the buffer holds no words |
--   full_o  : '1' when both registers are occupied |
generic( |
DATA_WIDTH: integer |
); |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
we_i: in std_logic; |
d_i: in std_logic_vector(DATA_WIDTH-1 downto 0); |
re_i: in std_logic; |
d_o: out std_logic_vector(DATA_WIDTH-1 downto 0); |
|
empty_o: out std_logic; |
full_o: out std_logic |
); |
end entity; |
|
architecture rtl of lxp32_ubuf is |
-- Two-entry buffer: regs(0) is the head (drives d_o), regs(1) holds a |
-- second word that moves into regs(0) when the head is read. |
|
-- Qualified write/read strobes (requests that are actually accepted) |
signal we: std_logic; |
signal re: std_logic; |
|
signal empty: std_logic:='1'; |
signal full: std_logic:='0'; |
|
type regs_type is array (1 downto 0) of std_logic_vector(DATA_WIDTH-1 downto 0); |
signal regs: regs_type; |
-- Per-register "value after an accepted write": either the current |
-- contents or d_i, depending on which slot the write targets |
signal regs_mux: regs_type; |
|
begin |
|
-- Writes to a full buffer and reads from an empty buffer are dropped |
we<=we_i and not full; |
re<=re_i and not empty; |
|
process (clk_i) is |
begin |
if rising_edge(clk_i) then |
if rst_i='1' then |
empty<='1'; |
full<='0'; |
regs<=(others=>(others=>'-')); |
else |
-- On a read the head is replaced by the second slot (which already |
-- reflects a simultaneous write through regs_mux(1)) |
if re='0' then |
regs(0)<=regs_mux(0); |
else |
regs(0)<=regs_mux(1); |
end if; |
|
regs(1)<=regs_mux(1); |
|
-- Flags change only when exactly one of we/re is active; a |
-- simultaneous read and write leaves the occupancy unchanged |
if we='1' and re='0' then |
empty<='0'; |
full<=not empty; |
elsif we='0' and re='1' then |
empty<=not full; |
full<='0'; |
end if; |
end if; |
end if; |
end process; |
|
-- An accepted write goes to regs(0) when the buffer is empty and to |
-- regs(1) otherwise |
regs_mux(0)<=regs(0) when we='0' or empty='0' else d_i; |
regs_mux(1)<=regs(1) when we='0' or empty='1' else d_i; |
|
d_o<=regs(0); |
empty_o<=empty; |
full_o<=full; |
|
end architecture; |
/lxp32c_top.vhd
1,122 → 1,122
--------------------------------------------------------------------- |
-- LXP32C CPU top-level module (C-series, with instruction cache) |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This version uses Wishbone B3 interface for the instruction bus |
-- (IBUS). It is designed for high-latency program memory, such as |
-- external SDRAM chips. |
-- |
-- Parameters: |
-- DBUS_RMW: Use RMW cycle instead of SEL_O() signal |
-- for byte-granular access to data bus |
-- DIVIDER_EN: enable divider |
-- IBUS_BURST_SIZE: size of the burst |
-- IBUS_PREFETCH_SIZE: initiate read burst if number of words |
-- left in the buffer is less than specified |
-- MUL_ARCH: multiplier architecture ("dsp", "opt" |
-- or "seq") |
-- START_ADDR: address in program memory where execution |
-- starts |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32c_top is |
-- Generics are described in the header comment of this file. |
-- DBUS_RMW, DIVIDER_EN, MUL_ARCH and START_ADDR are forwarded to the CPU |
-- core; IBUS_BURST_SIZE and IBUS_PREFETCH_SIZE are forwarded to the |
-- instruction cache. |
generic( |
DBUS_RMW: boolean:=false; |
DIVIDER_EN: boolean:=true; |
IBUS_BURST_SIZE: integer:=16; |
IBUS_PREFETCH_SIZE: integer:=32; |
MUL_ARCH: string:="dsp"; |
START_ADDR: std_logic_vector(31 downto 0):=(others=>'0') |
); |
-- Port groups, in declaration order: |
--   clk_i, rst_i : clock and reset, shared by the CPU core and the cache |
--   ibus_*       : instruction bus master, driven by the instruction |
--                  cache; the address is 30 bits wide |
--   dbus_*       : data bus master, driven by the CPU core; the address |
--                  port carries bits 31 downto 2 |
--   irq_i        : eight interrupt request inputs, passed to the CPU core |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
ibus_cyc_o: out std_logic; |
ibus_stb_o: out std_logic; |
ibus_cti_o: out std_logic_vector(2 downto 0); |
ibus_bte_o: out std_logic_vector(1 downto 0); |
ibus_ack_i: in std_logic; |
ibus_adr_o: out std_logic_vector(29 downto 0); |
ibus_dat_i: in std_logic_vector(31 downto 0); |
|
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0); |
|
irq_i: in std_logic_vector(7 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32c_top is |
-- Structural wrapper: a CPU core and an instruction cache joined by the |
-- internal lli_* signals declared below. |
|
-- CPU core <-> instruction cache link. lli_re and lli_adr are driven by |
-- the CPU core; lli_dat and lli_busy are driven by the cache. |
signal lli_re: std_logic; |
signal lli_adr: std_logic_vector(29 downto 0); |
signal lli_dat: std_logic_vector(31 downto 0); |
signal lli_busy: std_logic; |
|
begin |
|
-- CPU core: data bus and interrupt ports go straight to the top-level |
-- pins, the instruction interface goes to the cache |
cpu_inst: entity work.lxp32_cpu(rtl) |
generic map( |
DBUS_RMW=>DBUS_RMW, |
DIVIDER_EN=>DIVIDER_EN, |
MUL_ARCH=>MUL_ARCH, |
START_ADDR=>START_ADDR |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
lli_re_o=>lli_re, |
lli_adr_o=>lli_adr, |
lli_dat_i=>lli_dat, |
lli_busy_i=>lli_busy, |
|
dbus_cyc_o=>dbus_cyc_o, |
dbus_stb_o=>dbus_stb_o, |
dbus_we_o=>dbus_we_o, |
dbus_sel_o=>dbus_sel_o, |
dbus_ack_i=>dbus_ack_i, |
dbus_adr_o=>dbus_adr_o, |
dbus_dat_o=>dbus_dat_o, |
dbus_dat_i=>dbus_dat_i, |
|
irq_i=>irq_i |
); |
|
-- Instruction cache: sits between the CPU's lli_* interface and the |
-- external ibus_* pins (its wbm_* master port) |
icache_inst: entity work.lxp32_icache(rtl) |
generic map( |
BURST_SIZE=>IBUS_BURST_SIZE, |
PREFETCH_SIZE=>IBUS_PREFETCH_SIZE |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
lli_re_i=>lli_re, |
lli_adr_i=>lli_adr, |
lli_dat_o=>lli_dat, |
lli_busy_o=>lli_busy, |
|
wbm_cyc_o=>ibus_cyc_o, |
wbm_stb_o=>ibus_stb_o, |
wbm_cti_o=>ibus_cti_o, |
wbm_bte_o=>ibus_bte_o, |
wbm_ack_i=>ibus_ack_i, |
wbm_adr_o=>ibus_adr_o, |
wbm_dat_i=>ibus_dat_i |
); |
|
end architecture; |
--------------------------------------------------------------------- |
-- LXP32C CPU top-level module (C-series, with instruction cache) |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This version uses Wishbone B3 interface for the instruction bus |
-- (IBUS). It is designed for high-latency program memory, such as |
-- external SDRAM chips. |
-- |
-- Parameters: |
-- DBUS_RMW: Use RMW cycle instead of SEL_O() signal |
-- for byte-granular access to data bus |
-- DIVIDER_EN: enable divider |
-- IBUS_BURST_SIZE: size of the burst |
-- IBUS_PREFETCH_SIZE: initiate read burst if number of words |
-- left in the buffer is less than specified |
-- MUL_ARCH: multiplier architecture ("dsp", "opt" |
-- or "seq") |
-- START_ADDR: address in program memory where execution |
-- starts |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32c_top is |
-- Generics are described in the header comment of this file. |
-- DBUS_RMW, DIVIDER_EN, MUL_ARCH and START_ADDR are forwarded to the CPU |
-- core; IBUS_BURST_SIZE and IBUS_PREFETCH_SIZE are forwarded to the |
-- instruction cache. |
generic( |
DBUS_RMW: boolean:=false; |
DIVIDER_EN: boolean:=true; |
IBUS_BURST_SIZE: integer:=16; |
IBUS_PREFETCH_SIZE: integer:=32; |
MUL_ARCH: string:="dsp"; |
START_ADDR: std_logic_vector(31 downto 0):=(others=>'0') |
); |
-- Port groups, in declaration order: |
--   clk_i, rst_i : clock and reset, shared by the CPU core and the cache |
--   ibus_*       : instruction bus master, driven by the instruction |
--                  cache; the address is 30 bits wide |
--   dbus_*       : data bus master, driven by the CPU core; the address |
--                  port carries bits 31 downto 2 |
--   irq_i        : eight interrupt request inputs, passed to the CPU core |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
ibus_cyc_o: out std_logic; |
ibus_stb_o: out std_logic; |
ibus_cti_o: out std_logic_vector(2 downto 0); |
ibus_bte_o: out std_logic_vector(1 downto 0); |
ibus_ack_i: in std_logic; |
ibus_adr_o: out std_logic_vector(29 downto 0); |
ibus_dat_i: in std_logic_vector(31 downto 0); |
|
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0); |
|
irq_i: in std_logic_vector(7 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32c_top is |
-- Structural wrapper: a CPU core and an instruction cache joined by the |
-- internal lli_* signals declared below. |
|
-- CPU core <-> instruction cache link. lli_re and lli_adr are driven by |
-- the CPU core; lli_dat and lli_busy are driven by the cache. |
signal lli_re: std_logic; |
signal lli_adr: std_logic_vector(29 downto 0); |
signal lli_dat: std_logic_vector(31 downto 0); |
signal lli_busy: std_logic; |
|
begin |
|
-- CPU core: data bus and interrupt ports go straight to the top-level |
-- pins, the instruction interface goes to the cache |
cpu_inst: entity work.lxp32_cpu(rtl) |
generic map( |
DBUS_RMW=>DBUS_RMW, |
DIVIDER_EN=>DIVIDER_EN, |
MUL_ARCH=>MUL_ARCH, |
START_ADDR=>START_ADDR |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
lli_re_o=>lli_re, |
lli_adr_o=>lli_adr, |
lli_dat_i=>lli_dat, |
lli_busy_i=>lli_busy, |
|
dbus_cyc_o=>dbus_cyc_o, |
dbus_stb_o=>dbus_stb_o, |
dbus_we_o=>dbus_we_o, |
dbus_sel_o=>dbus_sel_o, |
dbus_ack_i=>dbus_ack_i, |
dbus_adr_o=>dbus_adr_o, |
dbus_dat_o=>dbus_dat_o, |
dbus_dat_i=>dbus_dat_i, |
|
irq_i=>irq_i |
); |
|
-- Instruction cache: sits between the CPU's lli_* interface and the |
-- external ibus_* pins (its wbm_* master port) |
icache_inst: entity work.lxp32_icache(rtl) |
generic map( |
BURST_SIZE=>IBUS_BURST_SIZE, |
PREFETCH_SIZE=>IBUS_PREFETCH_SIZE |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
lli_re_i=>lli_re, |
lli_adr_i=>lli_adr, |
lli_dat_o=>lli_dat, |
lli_busy_o=>lli_busy, |
|
wbm_cyc_o=>ibus_cyc_o, |
wbm_stb_o=>ibus_stb_o, |
wbm_cti_o=>ibus_cti_o, |
wbm_bte_o=>ibus_bte_o, |
wbm_ack_i=>ibus_ack_i, |
wbm_adr_o=>ibus_adr_o, |
wbm_dat_i=>ibus_dat_i |
); |
|
end architecture; |
/lxp32u_top.vhd
1,86 → 1,86
--------------------------------------------------------------------- |
-- LXP32U CPU top-level module (U-series, without instruction cache) |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This version uses a Low Latency Interface for the instruction bus |
-- (IBUS). It is designed for low-latency slaves such as on-chip |
-- RAM blocks. |
-- |
-- Parameters: |
-- DBUS_RMW: Use RMW cycle instead of SEL_O() signal |
-- for byte-granular access to data bus |
-- DIVIDER_EN: enable divider |
-- MUL_ARCH: multiplier architecture ("dsp", "opt" |
-- or "seq") |
-- START_ADDR: address in program memory where execution |
-- starts |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32u_top is |
-- Generics are described in the header comment of this file; all four |
-- are forwarded unchanged to the CPU core. |
generic( |
DBUS_RMW: boolean:=false; |
DIVIDER_EN: boolean:=true; |
MUL_ARCH: string:="dsp"; |
START_ADDR: std_logic_vector(31 downto 0):=(others=>'0') |
); |
-- Port groups, in declaration order: |
--   clk_i, rst_i : clock and reset |
--   lli_*        : Low Latency Interface used as the instruction bus; |
--                  30-bit address out, 32-bit data in |
--   dbus_*       : data bus master; the address port carries bits |
--                  31 downto 2 |
--   irq_i        : eight interrupt request inputs |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
lli_re_o: out std_logic; |
lli_adr_o: out std_logic_vector(29 downto 0); |
lli_dat_i: in std_logic_vector(31 downto 0); |
lli_busy_i: in std_logic; |
|
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0); |
|
irq_i: in std_logic_vector(7 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32u_top is |
-- Pure pass-through wrapper: no internal signals or logic; every |
-- top-level generic and port is connected to the same-named generic or |
-- port of the CPU core. |
|
begin |
|
cpu_inst: entity work.lxp32_cpu(rtl) |
generic map( |
DBUS_RMW=>DBUS_RMW, |
DIVIDER_EN=>DIVIDER_EN, |
MUL_ARCH=>MUL_ARCH, |
START_ADDR=>START_ADDR |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
lli_re_o=>lli_re_o, |
lli_adr_o=>lli_adr_o, |
lli_dat_i=>lli_dat_i, |
lli_busy_i=>lli_busy_i, |
|
dbus_cyc_o=>dbus_cyc_o, |
dbus_stb_o=>dbus_stb_o, |
dbus_we_o=>dbus_we_o, |
dbus_sel_o=>dbus_sel_o, |
dbus_ack_i=>dbus_ack_i, |
dbus_adr_o=>dbus_adr_o, |
dbus_dat_o=>dbus_dat_o, |
dbus_dat_i=>dbus_dat_i, |
|
irq_i=>irq_i |
); |
|
end architecture; |
--------------------------------------------------------------------- |
-- LXP32U CPU top-level module (U-series, without instruction cache) |
-- |
-- Part of the LXP32 CPU |
-- |
-- Copyright (c) 2016 by Alex I. Kuznetsov |
-- |
-- This version uses a Low Latency Interface for the instruction bus |
-- (IBUS). It is designed for low-latency slaves such as on-chip |
-- RAM blocks. |
-- |
-- Parameters: |
-- DBUS_RMW: Use RMW cycle instead of SEL_O() signal |
-- for byte-granular access to data bus |
-- DIVIDER_EN: enable divider |
-- MUL_ARCH: multiplier architecture ("dsp", "opt" |
-- or "seq") |
-- START_ADDR: address in program memory where execution |
-- starts |
--------------------------------------------------------------------- |
|
library ieee; |
use ieee.std_logic_1164.all; |
|
entity lxp32u_top is |
-- Generics are described in the header comment of this file; all four |
-- are forwarded unchanged to the CPU core. |
generic( |
DBUS_RMW: boolean:=false; |
DIVIDER_EN: boolean:=true; |
MUL_ARCH: string:="dsp"; |
START_ADDR: std_logic_vector(31 downto 0):=(others=>'0') |
); |
-- Port groups, in declaration order: |
--   clk_i, rst_i : clock and reset |
--   lli_*        : Low Latency Interface used as the instruction bus; |
--                  30-bit address out, 32-bit data in |
--   dbus_*       : data bus master; the address port carries bits |
--                  31 downto 2 |
--   irq_i        : eight interrupt request inputs |
port( |
clk_i: in std_logic; |
rst_i: in std_logic; |
|
lli_re_o: out std_logic; |
lli_adr_o: out std_logic_vector(29 downto 0); |
lli_dat_i: in std_logic_vector(31 downto 0); |
lli_busy_i: in std_logic; |
|
dbus_cyc_o: out std_logic; |
dbus_stb_o: out std_logic; |
dbus_we_o: out std_logic; |
dbus_sel_o: out std_logic_vector(3 downto 0); |
dbus_ack_i: in std_logic; |
dbus_adr_o: out std_logic_vector(31 downto 2); |
dbus_dat_o: out std_logic_vector(31 downto 0); |
dbus_dat_i: in std_logic_vector(31 downto 0); |
|
irq_i: in std_logic_vector(7 downto 0) |
); |
end entity; |
|
architecture rtl of lxp32u_top is |
-- Pure pass-through wrapper: no internal signals or logic; every |
-- top-level generic and port is connected to the same-named generic or |
-- port of the CPU core. |
|
begin |
|
cpu_inst: entity work.lxp32_cpu(rtl) |
generic map( |
DBUS_RMW=>DBUS_RMW, |
DIVIDER_EN=>DIVIDER_EN, |
MUL_ARCH=>MUL_ARCH, |
START_ADDR=>START_ADDR |
) |
port map( |
clk_i=>clk_i, |
rst_i=>rst_i, |
|
lli_re_o=>lli_re_o, |
lli_adr_o=>lli_adr_o, |
lli_dat_i=>lli_dat_i, |
lli_busy_i=>lli_busy_i, |
|
dbus_cyc_o=>dbus_cyc_o, |
dbus_stb_o=>dbus_stb_o, |
dbus_we_o=>dbus_we_o, |
dbus_sel_o=>dbus_sel_o, |
dbus_ack_i=>dbus_ack_i, |
dbus_adr_o=>dbus_adr_o, |
dbus_dat_o=>dbus_dat_o, |
dbus_dat_i=>dbus_dat_i, |
|
irq_i=>irq_i |
); |
|
end architecture; |