Username:
Password:

Remember me

Browse

Projects
Forums
About
- Mission
- Logos
- Community
- Statistics
HowTo/FAQ
- FAQ
- Project
- SVN
- WISHBONE
- EDA Tools
Media
- News
- Articles
- Newsletter
Licensing
Commerce
- Shop
- Advertise
- Jobs
Partners
Maintainers
Contact us

Tools

URL https://opencores.org/ocsvn/zipcpu/zipcpu/trunk

Subversion Repositories zipcpu

Compare Revisions

This comparison shows the changes necessary to convert path `/` from Rev 38 to Rev 39
↔ Reverse comparison
Compare Path: Rev

With Path: Rev

Rev 38 → Rev 39

/zipcpu/trunk/bench/cpp/zippy_tb.cpp

43,6 → 43,7

#include "verilated.h"

#include "Vzipsystem.h"

#include "cpudefs.h"

#include "testb.h"

// #include "twoc.h"

184,18 → 185,18

                 int     ln= 0;
                 mvprintw(ln,0, "Peripherals-SS"); ln++;
+#ifdef  OPT_ILLEGAL_INSTRUCTION
                 printw(" %s",
                         // (m_core->v__DOT__thecpu__DOT__pf_illegal)?"PI":"  ",
                         (m_core->v__DOT__thecpu__DOT__dcd_illegal)?"DI":"  "
                         );
-                /*
-                printw(" %s%s%s",
-                        (m_core->v__DOT__thecpu__DOT__ill_err)?"IL":"  ",
+#endif
+#ifdef  OPT_EARLY_BRANCHING
+                printw(" %s%s",
                         (m_core->v__DOT__thecpu__DOT__dcd_early_branch)?"EB":"  ",
-                        (m_core->v__DOT__thecpu__DOT__dcd_early_branch_stb)?"S":" ",
-                        (m_core->v__DOT__thecpu__DOT__dcd_early_branch_stb)?"S":" ",
-                        );
-                */
+                        (m_core->v__DOT__thecpu__DOT__dcd_early_branch_stb)?"S":" ");
+#endif
                 /*
                 showval(ln, 1, "TRAP", m_core->v__DOT__trap_data);

305,6 → 306,9

                 attroff(A_BOLD);
                 ln+=1;
+#ifdef  OPT_SINGLE_FETCH
+        ln+=2;
+#else
                 mvprintw(ln, 0, "PFPIPE: rda=%08x/%d, bas=%08x, off=%08x, nv=%03x, ackw=%d,%d%d,%04x",
                         m_core->v__DOT__thecpu__DOT__pf__DOT__r_addr,
                         m_core->v__DOT__thecpu__DOT__pf__DOT__r_cv,

325,6 → 329,7

(m_core->v__DOT__thecpu__DOT__pf_ack)?"ACK":" ",

(m_core->v__DOT__thecpu__DOT__pf_stall)?"STL":" ",

(m_core->v__DOT__wb_data)); ln++;

#endif

mvprintw(ln, 0, "MEMBUS: %3s %3s %s @0x%08x[0x%08x] -> %s %s %08x",

(m_core->v__DOT__thecpu__DOT__mem_cyc_gbl)?"GCY"

336,7 → 341,17

                         (m_core->v__DOT__thecpu__DOT__mem_data),
                         (m_core->v__DOT__thecpu__DOT__mem_ack)?"ACK":"   ",
                         (m_core->v__DOT__thecpu__DOT__mem_stall)?"STL":"   ",
-                        (m_core->v__DOT__thecpu__DOT__mem_result)); ln++;
+                        (m_core->v__DOT__thecpu__DOT__mem_result));
+// #define      OPT_PIPELINED_BUS_ACCESS
+#ifdef  OPT_PIPELINED_BUS_ACCESS
+                printw(" %x%x%c%c",
+                        (m_core->v__DOT__thecpu__DOT__domem__DOT__wraddr),
+                        (m_core->v__DOT__thecpu__DOT__domem__DOT__rdaddr),
+                        (m_core->v__DOT__thecpu__DOT__op_pipe)?'P':'-',
+                        (mem_pipe_stalled())?'S':'-'); ln++;
+#else
+                ln++;
+#endif
                 mvprintw(ln, 0, "SYSBS%c: %3s %3s %s @0x%08x[0x%08x] -> %s %s %08x",
                         (m_core->v__DOT__thecpu__DOT__pformem__DOT__r_a_owner)?'M':'P',

348,7 → 363,40

                         (m_core->i_wb_ack)?"ACK":"   ",
                         (m_core->i_wb_stall)?"STL":"   ",
                         (m_core->i_wb_data)); ln+=2;
+#ifdef  OPT_PIPELINED_BUS_ACCESS
+                mvprintw(ln-1, 0, "Mem CE: %d = %d%d%d%d%d, stall: %d = %d%d(%d|%d%d|..)",
+                        (m_core->v__DOT__thecpu__DOT__mem_ce),
+                        (m_core->v__DOT__thecpu__DOT__master_ce),
+                        (m_core->v__DOT__thecpu__DOT__opvalid_mem),
+                        (!m_core->v__DOT__thecpu__DOT__clear_pipeline),
+                        (m_core->v__DOT__thecpu__DOT__set_cond),
+                        (!m_core->v__DOT__thecpu__DOT__mem_stalled),
+                        (m_core->v__DOT__thecpu__DOT__mem_stalled),
+                        (m_core->v__DOT__thecpu__DOT__opvalid_mem),
+                        (m_core->v__DOT__thecpu__DOT__master_ce),
+                        (mem_pipe_stalled()),
+                        (!m_core->v__DOT__thecpu__DOT__op_pipe),
+                        (m_core->v__DOT__thecpu__DOT__mem_busy));
+                printw(" op_pipe = %d%d%d%d%d(%d|%d)",
+                        (m_core->v__DOT__thecpu__DOT__dcdvalid),
+                        (m_core->v__DOT__thecpu__DOT__opvalid_mem),
+                        (m_core->v__DOT__thecpu__DOT__dcdM),
+                        (!((m_core->v__DOT__thecpu__DOT__dcdOp
+                                ^m_core->v__DOT__thecpu__DOT__opn)&1)),
+                        (m_core->v__DOT__thecpu__DOT__dcdB
+                                == m_core->v__DOT__thecpu__DOT__op_B),
+                        (m_core->v__DOT__thecpu__DOT__r_dcdI
+                                == m_core->v__DOT__thecpu__DOT__r_opI),
+                        (m_core->v__DOT__thecpu__DOT__r_dcdI+1
+                                == m_core->v__DOT__thecpu__DOT__r_opI));
+                mvprintw(4,4,"r_dcdI = 0x%06x, r_opI = 0x%06x",
+                        (m_core->v__DOT__thecpu__DOT__r_dcdI),
+                        (m_core->v__DOT__thecpu__DOT__r_opI));
+#endif
+                mvprintw(4,42,"0x%08x", m_core->v__DOT__thecpu__DOT__instruction);
                 showins(ln, "I ",
                         !m_core->v__DOT__thecpu__DOT__dcd_stalled,
                         m_core->v__DOT__thecpu__DOT__pf_valid,

364,6 → 412,13

                         m_core->v__DOT__thecpu__DOT__dcd_gie,
                         m_core->v__DOT__thecpu__DOT__dcd_stalled,
                         m_core->v__DOT__thecpu__DOT__dcd_pc-1); ln++;
+#ifdef  OPT_ILLEGAL_INSTRUCTION
+                if (m_core->v__DOT__thecpu__DOT__dcd_illegal)
+                        mvprintw(ln-1,10,"I");
+                else
+#endif
+                if (m_core->v__DOT__thecpu__DOT__dcdM)
+                        mvprintw(ln-1,10,"M");
                 showins(ln, "Op",
                         m_core->v__DOT__thecpu__DOT__op_ce,

370,7 → 425,16

                         m_core->v__DOT__thecpu__DOT__opvalid,
                         m_core->v__DOT__thecpu__DOT__op_gie,
                         m_core->v__DOT__thecpu__DOT__op_stall,
-                        m_core->v__DOT__thecpu__DOT__op_pc-1); ln++;
+                        op_pc()); ln++;
+#ifdef  OPT_ILLEGAL_INSTRUCTION
+                if (m_core->v__DOT__thecpu__DOT__op_illegal)
+                        mvprintw(ln-1,10,"I");
+                else
+#endif
+                if (m_core->v__DOT__thecpu__DOT__opvalid_mem)
+                        mvprintw(ln-1,10,"M");
+                else if (m_core->v__DOT__thecpu__DOT__opvalid_alu)
+                        mvprintw(ln-1,10,"A");
                 showins(ln, "Al",
                         m_core->v__DOT__thecpu__DOT__alu_ce,

377,9 → 441,11

                         m_core->v__DOT__thecpu__DOT__alu_pc_valid,
                         m_core->v__DOT__thecpu__DOT__alu_gie,
                         m_core->v__DOT__thecpu__DOT__alu_stall,
-                        m_core->v__DOT__thecpu__DOT__alu_pc-1); ln++;
+                        alu_pc()); ln++;
+                if (m_core->v__DOT__thecpu__DOT__wr_reg_ce)
+                        mvprintw(ln-1,10,"W");
-                mvprintw(ln-5, 48,"%s %s",
+                mvprintw(ln-5, 65,"%s %s",
                         (m_core->v__DOT__thecpu__DOT__op_break)?"OB":"  ",
                         (m_core->v__DOT__thecpu__DOT__clear_pipeline)?"CLRP":"    ");
                 mvprintw(ln-4, 48,

415,7 → 481,7

                         (m_core->v__DOT__thecpu__DOT__mem_ce)?"CE":"  ",
                         (m_core->v__DOT__thecpu__DOT__mem_we)?"Wr ":"Rd ",
                         (m_core->v__DOT__thecpu__DOT__mem_stalled)?"PIPE":"    ",
-                        (m_core->v__DOT__thecpu__DOT__mem_valid)?"MEMV":"    ",
+                        (m_core->v__DOT__thecpu__DOT__mem_valid)?"V":" ",
                         zop_regstr[(m_core->v__DOT__thecpu__DOT__mem_wreg&0x1f)^0x10]);
         }

599,7 → 665,7

                         m_core->v__DOT__thecpu__DOT__opvalid,
                         m_core->v__DOT__thecpu__DOT__op_gie,
                         m_core->v__DOT__thecpu__DOT__op_stall,
-                        m_core->v__DOT__thecpu__DOT__op_pc-1); ln++;
+                        op_pc()); ln++;
                 showins(ln, "Al",
                         m_core->v__DOT__thecpu__DOT__alu_ce,

606,7 → 672,7

                         m_core->v__DOT__thecpu__DOT__alu_pc_valid,
                         m_core->v__DOT__thecpu__DOT__alu_gie,
                         m_core->v__DOT__thecpu__DOT__alu_stall,
-                        m_core->v__DOT__thecpu__DOT__alu_pc-1); ln++;
+                        alu_pc()); ln++;
         }
         void    tick(void) {
                 int gie = m_core->v__DOT__thecpu__DOT__gie;

660,7 → 726,7

                                 m_core->v__DOT__thecpu__DOT__dcd_ce,
                                 m_core->v__DOT__thecpu__DOT__dcd_pc,
                                 m_core->v__DOT__thecpu__DOT__op_ce,
-                                m_core->v__DOT__thecpu__DOT__op_pc,
+                                op_pc(),
                                 m_core->v__DOT__thecpu__DOT__dcdA,
                                 m_core->v__DOT__thecpu__DOT__opR,
                                 m_core->v__DOT__cmd_halt,

751,13 → 817,13

                                 m_core->v__DOT__thecpu__DOT__opvalid,
                                 m_core->v__DOT__thecpu__DOT__op_gie,
                                 m_core->v__DOT__thecpu__DOT__op_stall,
-                                m_core->v__DOT__thecpu__DOT__op_pc-1);
+                                op_pc());
                         dbgins("Al - ",
                                 m_core->v__DOT__thecpu__DOT__alu_ce,
                                 m_core->v__DOT__thecpu__DOT__alu_pc_valid,
                                 m_core->v__DOT__thecpu__DOT__alu_gie,
                                 m_core->v__DOT__thecpu__DOT__alu_stall,
-                                m_core->v__DOT__thecpu__DOT__alu_pc-1);
+                                alu_pc());
                 }
         }

767,10 → 833,43

                         &&(m_core->v__DOT__thecpu__DOT__sleep));
         }
+        unsigned        op_pc(void) {
+                /*
+                unsigned r = m_core->v__DOT__thecpu__DOT__dcd_pc-1;
+                if (m_core->v__DOT__thecpu__DOT__dcdvalid)
+                        r--;
+                return r;
+                */
+                return m_core->v__DOT__thecpu__DOT__op_pc-1;
+        }
+        unsigned        alu_pc(void) {
+                /*
+                unsigned        r = op_pc();
+                if (m_core->v__DOT__thecpu__DOT__opvalid)
+                        r--;
+                return r;
+                */
+                return m_core->v__DOT__thecpu__DOT__alu_pc-1;
+        }
+#ifdef  OPT_PIPELINED_BUS_ACCESS
+        int     mem_pipe_stalled(void) {
+                int     r = 0;
+                r = ((m_core->v__DOT__thecpu__DOT__mem_cyc_gbl)
+                 ||(m_core->v__DOT__thecpu__DOT__mem_cyc_lcl));
+                r = r && ((m_core->v__DOT__thecpu__DOT__mem_stall)
+                        ||(
+                                ((!m_core->v__DOT__thecpu__DOT__mem_stb_gbl)
+                                &&(!m_core->v__DOT__thecpu__DOT__mem_stb_lcl))));
+                return r;
+                // return m_core->v__DOT__thecpu__DOT__mem_pipe_stalled;
+        }
+#endif
         bool    test_failure(void) {
                 return ((m_core->v__DOT__thecpu__DOT__alu_pc_valid)
-                        &&(m_mem[m_core->v__DOT__thecpu__DOT__alu_pc-1]
-                                == 0x2f0f7fff)
+                        &&(m_mem[alu_pc()] == 0x2f0f7fff)
                         &&(!m_core->v__DOT__thecpu__DOT__clear_pipeline));
         }

1117,7 → 1216,7

                                 halted = true;
                                 erase();
                                 break;
-                        case 's': case 'S':
+                        case 's':
                                 if (!halted)
                                         erase();
                                 tb->wb_write(CMD_REG, CMD_STEP);

1124,10 → 1223,30

                                 manual = false;
                                 halted = true;
                                 break;
-                        case 't': case 'T':
+                        case 'S':
                                 if ((!manual)||(halted))
                                         erase();
                                 manual = true;
+                                halted = true;
+                                tb->m_core->v__DOT__cmd_halt = 0;
+                                tb->m_core->v__DOT__cmd_step = 1;
+                                tb->eval();
+                                tb->tick();
+                                break;
+                        case 'T': //
+                                if ((!manual)||(halted))
+                                        erase();
+                                manual = true;
+                                halted = true;
+                                tb->m_core->v__DOT__cmd_halt = 1;
+                                tb->m_core->v__DOT__cmd_step = 0;
+                                tb->eval();
+                                tb->tick();
+                                break;
+                        case 't':
+                                if ((!manual)||(halted))
+                                        erase();
+                                manual = true;
                                 halted = false;
                 //              tb->m_core->v__DOT__thecpu__DOT__step = 0;
                 //              tb->m_core->v__DOT__cmd_halt = 0;

/zipcpu/trunk/bench/cpp/Makefile

39,7 → 39,8

 CXX     := g++
 FLAGS   := -Wall -Og -g
 ZASM    := ../../sw/zasm
-INCS    := -I../../rtl/obj_dir/ -I/usr/share/verilator/include -I../../sw/zasm
+RTLD    := ../../rtl
+INCS    := -I$(RTLD)/obj_dir/ -I$(RTLD) -I/usr/share/verilator/include -I../../sw/zasm
 SOURCES := zippy_tb.cpp memsim.cpp twoc.cpp $(ZASM)/zopcodes.cpp $(ZASM)/zparser.cpp
 RAWLIB  := /usr/share/verilator/include/verilated.cpp ../../rtl/obj_dir/Vzipsystem__ALL.a
 LIBS    := $(RAWLIB) -lncurses

46,6 → 47,7

TESTF := ../../sw/zasm/z.out

zippy_tb: $(SOURCES) $(RAWLIB) $(ZASM)/zopcodes.h $(ZASM)/zparser.h testb.h

zippy_tb: $(RTLD)/cpudefs.h

$(CXX) $(FLAGS) $(INCS) $(SOURCES) $(LIBS) -o $@

.PHONY: stest

/zipcpu/trunk/doc/spec.pdf Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream

/zipcpu/trunk/doc/src/spec.tex

48,7 → 48,7

 \title{Specification}
 \author{Dan Gisselquist, Ph.D.}
 \email{dgisselq (at) opencores.org}
-\revision{Rev.~0.4}
+\revision{Rev.~0.5}
 \definecolor{webred}{rgb}{0.2,0,0}
 \definecolor{webgreen}{rgb}{0,0.2,0}
 \usepackage[dvips,ps2pdf,colorlinks=true,

76,6 → 76,7

copy.

\end{license}

\begin{revisionhistory}

0.5 & 9/29/2015 & Gisselquist & Added pipelined memory access discussion.\\\hline

0.4 & 9/19/2015 & Gisselquist & Added DMA controller, improved stall information, and self--assessment info.\\\hline

0.3 & 8/22/2015 & Gisselquist & First completed draft\\\hline

0.2 & 8/19/2015 & Gisselquist & Still Draft, more complete \\\hline

411,6 → 412,25

 supervisor, in supervisor mode, to determine whether it got to supervisor
 mode from a trap or from an external interrupt or both.
+These status register bits are summarized in Tbl.~\ref{tbl:ccbits}.
+\begin{table}
+\begin{center}
+\begin{tabular}{l|l}
+Bit & Meaning \\\hline
+& Soft trap, set on a trap from user mode, cleared when returning to user mode\\\hline
+& (Reserved for) Floating point enable \\\hline
+& Halt on break, to support an external debugger \\\hline
+& Step, single step the CPU in user mode\\\hline
+& GIE, or Global Interrupt Enable \\\hline
+& Sleep \\\hline
+& V, or overflow bit.\\\hline
+& N, or negative bit.\\\hline
+& C, or carry bit.\\\hline
+& Z, or zero bit. \\\hline
+\end{tabular}
+\caption{Condition Code / Status Register Bits}\label{tbl:ccbits}
+\end{center}\end{table}
 \section{Conditional Instructions}
 Most, although not quite all, instructions may be conditionally executed.  From
 the four condition code flags, eight conditions are defined.  These are shown

546,17 → 566,17

 Op Code & \multicolumn{8}{c|}{31\ldots24} & \multicolumn{8}{c|}{23\ldots 16}
         & \multicolumn{8}{c|}{15\ldots 8} & \multicolumn{8}{c|}{7\ldots 0}
         & Sets CC? \\\hline\hline
-CMP(Sub) & \multicolumn{4}{l|}{4'h0}
+{\tt CMP(Sub)} & \multicolumn{4}{l|}{4'h0}
                 & \multicolumn{4}{l|}{D. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & \multicolumn{21}{l|}{Operand B}
                 & Yes \\\hline
-TST(And) & \multicolumn{4}{l|}{4'h1}
+{\tt TST(And)} & \multicolumn{4}{l|}{4'h1}
                 & \multicolumn{4}{l|}{D. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & \multicolumn{21}{l|}{Operand B}
         & Yes \\\hline
-MOV & \multicolumn{4}{l|}{4'h2}
+{\tt MOV} & \multicolumn{4}{l|}{4'h2}
                 & \multicolumn{4}{l|}{D. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & A-Usr

564,15 → 584,15

                 & B-Usr
                 & \multicolumn{15}{l|}{15'bit signed offset}
                 & \\\hline
-LODI & \multicolumn{4}{l|}{4'h3}
+{\tt LODI} & \multicolumn{4}{l|}{4'h3}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{24}{l|}{24'bit Signed Immediate}
                 & \\\hline
-NOOP & \multicolumn{4}{l|}{4'h4}
+{\tt NOOP} & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{4'he}
                 & \multicolumn{24}{l|}{24'h00}
                 & \\\hline
-BREAK & \multicolumn{4}{l|}{4'h4}
+{\tt BREAK} & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{4'he}
                 & \multicolumn{24}{l|}{24'h01}
                 & \\\hline

580,7 → 600,7

                 & \multicolumn{4}{l|}{4'he}
                 & \multicolumn{24}{l|}{24'bits, but not 0 or 1.}
                 & \\\hline
-LODIHI & \multicolumn{4}{l|}{4'h4}
+{\tt LODIHI }& \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{4'hf}
                 & \multicolumn{3}{l|}{Cond.}
                 & 1'b1

587,7 → 607,7

                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{16}{l|}{16-bit Immediate}
                 & \\\hline
-LODILO & \multicolumn{4}{l|}{4'h4}
+{\tt LODILO} & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{4'hf}
                 & \multicolumn{3}{l|}{Cond.}
                 & 1'b0

594,81 → 614,81

                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{16}{l|}{16-bit Immediate}
                 & \\\hline
--b MPYU & \multicolumn{4}{l|}{4'h4}
+-b {\tt MPYU} & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & 1'b0 & \multicolumn{4}{l|}{Reg}
                 & \multicolumn{16}{l|}{16-bit Offset}
                 & Yes \\\hline
--b MPYU(I) & \multicolumn{4}{l|}{4'h4}
+-b {\tt MPYU}(I) & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & 1'b0 & \multicolumn{4}{l|}{4'hf}
                 & \multicolumn{16}{l|}{16-bit Offset}
                 & Yes \\\hline
--b MPYS & \multicolumn{4}{l|}{4'h4}
+-b {\tt MPYS} & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & 1'b1 & \multicolumn{4}{l|}{Reg}
                 & \multicolumn{16}{l|}{16-bit Offset}
                 & Yes \\\hline
--b MPYS(I) & \multicolumn{4}{l|}{4'h4}
+-b {\tt MPYS}(I) & \multicolumn{4}{l|}{4'h4}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & 1'b1 & \multicolumn{4}{l|}{4'hf}
                 & \multicolumn{16}{l|}{16-bit Offset}
                 & Yes \\\hline
-ROL & \multicolumn{4}{l|}{4'h5}
+{\tt ROL} & \multicolumn{4}{l|}{4'h5}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & \multicolumn{21}{l|}{Operand B, truncated to low order 5 bits}
                 & \\\hline
-LOD & \multicolumn{4}{l|}{4'h6}
+{\tt LOD} & \multicolumn{4}{l|}{4'h6}
                 & \multicolumn{4}{l|}{R. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & \multicolumn{21}{l|}{Operand B address}
                 & \\\hline
-STO & \multicolumn{4}{l|}{4'h7}
+{\tt STO} & \multicolumn{4}{l|}{4'h7}
                 & \multicolumn{4}{l|}{D. Reg}
                 & \multicolumn{3}{l|}{Cond.}
                 & \multicolumn{21}{l|}{Operand B address}
                 & \\\hline
-SUB & \multicolumn{4}{l|}{4'h8}
+{\tt SUB} & \multicolumn{4}{l|}{4'h8}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B}
         & Yes \\\hline
-AND & \multicolumn{4}{l|}{4'h9}
+{\tt AND} & \multicolumn{4}{l|}{4'h9}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B}
         & Yes \\\hline
-ADD & \multicolumn{4}{l|}{4'ha}
+{\tt ADD} & \multicolumn{4}{l|}{4'ha}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B}
         & Yes \\\hline
-OR & \multicolumn{4}{l|}{4'hb}
+{\tt OR} & \multicolumn{4}{l|}{4'hb}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B}
         & Yes \\\hline
-XOR & \multicolumn{4}{l|}{4'hc}
+{\tt XOR} & \multicolumn{4}{l|}{4'hc}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B}
         & Yes \\\hline
-LSL/ASL & \multicolumn{4}{l|}{4'hd}
+{\tt LSL/ASL} & \multicolumn{4}{l|}{4'hd}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B, imm. truncated to 6 bits}
         & Yes \\\hline
-ASR & \multicolumn{4}{l|}{4'he}
+{\tt ASR} & \multicolumn{4}{l|}{4'he}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B, imm. truncated to 6 bits}
         & Yes \\\hline
-LSR & \multicolumn{4}{l|}{4'hf}
+{\tt LSR} & \multicolumn{4}{l|}{4'hf}
         &       \multicolumn{4}{l|}{R. Reg}
         &       \multicolumn{3}{l|}{Cond.}
         &       \multicolumn{21}{l|}{Operand B, imm. truncated to 6 bits}

692,51 → 712,49

 \begin{table}\begin{center}
 \begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
 Mapped & Actual  & Notes \\\hline
-ABS Rx
-        & \parbox[t]{1.5in}{TST -1,Rx\\NEG.LT Rx}
+{\tt ABS Rx}
+        & \parbox[t]{1.5in}{\tt TST -1,Rx\\NEG.LT Rx}
         & Absolute value, depends upon derived NEG.\\\hline
-\parbox[t]{1.4in}{ADD Ra,Rx\\ADDC Rb,Ry}
-        & \parbox[t]{1.5in}{Add Ra,Rx\\ADD.C \$1,Ry\\Add Rb,Ry}
+\parbox[t]{1.4in}{\tt ADD Ra,Rx\\ADDC Rb,Ry}
+        & \parbox[t]{1.5in}{\tt Add Ra,Rx\\ADD.C \$1,Ry\\Add Rb,Ry}
         & Add with carry \\\hline
-BRA.Cond +/-\$Addr
-        & \hbox{MOV.cond \$Addr+PC,PC}
+{\tt BRA.Cond +/-\$Addr}
+        & \hbox{\tt MOV.cond \$Addr+PC,PC}
         & Branch or jump on condition.  Works for 15--bit
                 signed address offsets.\\\hline
-BRA.Cond +/-\$Addr
-        & \parbox[t]{1.5in}{LDI \$Addr,Rx \\ ADD.cond Rx,PC}
+{\tt BRA.Cond +/-\$Addr}
+        & \parbox[t]{1.5in}{\tt LDI \$Addr,Rx \\ ADD.cond Rx,PC}
         & Branch/jump on condition.  Works for
 bit address offsets, but costs a register, an extra instruction,
         and sets the flags. \\\hline
-BNC PC+\$Addr
-        & \parbox[t]{1.5in}{Test \$Carry,CC \\ MOV.Z PC+\$Addr,PC}
+{\tt BNC PC+\$Addr}
+        & \parbox[t]{1.5in}{\tt Test \$Carry,CC \\ MOV.Z PC+\$Addr,PC}
         & Example of a branch on an unsupported
                 condition, in this case a branch on not carry \\\hline
-BUSY & MOV \$-1(PC),PC & Execute an infinite loop \\\hline
-CLRF.NZ Rx
-        & XOR.NZ Rx,Rx
+{\tt BUSY } & {\tt MOV \$-1(PC),PC} & Execute an infinite loop \\\hline
+{\tt CLRF.NZ Rx }
+        & {\tt XOR.NZ Rx,Rx}
         & Clear Rx, and flags, if the Z-bit is not set \\\hline
-CLR Rx
-        & LDI \$0,Rx
+{\tt CLR Rx }
+        & {\tt LDI \$0,Rx}
         & Clears Rx, leaves flags untouched.  This instruction cannot be
                 conditional. \\\hline
-EXCH.W Rx
-        & ROL \$16,Rx
+{\tt EXCH.W Rx }
+        & {\tt ROL \$16,Rx}
         & Exchanges the top and bottom 16'bit words of Rx \\\hline
-HALT
-        & Or \$SLEEP,CC
-        & Executed while in interrupt mode.  In user mode this is simply a
-        wait until interrupt instruction. \\\hline
-INT & LDI \$0,CC
-        & Since we're using the CC register as a trap vector as well, this
-        executes TRAP \#0. \\\hline
-IRET
-        & OR \$GIE,CC
-        & Also an RTU instruction (Return to Userspace) \\\hline
-JMP R6+\$Addr
-        & MOV \$Addr(R6),PC
+{\tt HALT }
+        & {\tt Or \$SLEEP,CC}
+        & This only works when issued in interrupt/supervisor mode.  In user
+        mode this is simply a wait until interrupt instruction. \\\hline
+{\tt INT } & {\tt LDI \$0,CC} &  \\\hline
+{\tt IRET}
+        & {\tt OR \$GIE,CC}
+        & Also known as an RTU instruction (Return to Userspace) \\\hline
+{\tt JMP R6+\$Addr}
+        & {\tt MOV \$Addr(R6),PC}
         & \\\hline
-JSR PC+\$Addr
-        & \parbox[t]{1.5in}{SUB \$1,SP \\\
+{\tt JSR PC+\$Addr}
+        & \parbox[t]{1.5in}{\tt SUB \$1,SP \\\
         MOV \$3+PC,R0 \\
         STO R0,1(SP) \\
         MOV \$Addr+PC,PC \\

746,16 → 764,18

         operand, removing the preliminary stack instruction before and
         the cleanup after, by adjusting how any stack frame was built for
         this routine to include space at the top of the stack for the PC.
+        Note also that jumping to a subroutine costs a copy register, {\tt R0}
+        in this case.
         \\\hline
-JSR PC+\$Addr
-        & \parbox[t]{1.5in}{MOV \$3+PC,R12 \\ MOV \$addr+PC,PC}
+{\tt JSR PC+\$Addr  }
+        & \parbox[t]{1.5in}{\tt MOV \$3+PC,R12 \\ MOV \$addr+PC,PC}
         &This is the high speed
         version of a subroutine call, necessitating a register to hold the
         last PC address.  In its favor, this method doesn't suffer the
         mandatory memory access of the other approach. \\\hline
-LDI.l \$val,Rx
-        & \parbox[t]{1.5in}{LDIHI (\$val$>>$16)\&0x0ffff, Rx \\
-                        LDILO (\$val \& 0x0ffff)}
+{\tt LDI.l \$val,Rx }
+        & \parbox[t]{1.8in}{\tt LDIHI (\$val$>>$16)\&0x0ffff, Rx \\
+                        LDILO (\$val\&0x0ffff),Rx}
         & Sadly, there's not enough instruction
                 space to load a complete immediate value into any register.
                 Therefore, fully loading any register takes two cycles.

767,8 → 787,8

 \begin{table}\begin{center}
 \begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
 Mapped & Actual  & Notes \\\hline
-LOD.b \$addr,Rx
-        & \parbox[t]{1.5in}{%
+{\tt LOD.b \$addr,Rx}
+        & \parbox[t]{1.5in}{\tt %
         LDI     \$addr,Ra \\
         LDI     \$addr,Rb \\
         LSR     \$2,Ra \\

788,8 → 808,8

         we needed to drop the bottom two bits.  This also limits the address
         space of character accesses using this method from 16 MB down to 4MB.}
                 \\\hline
-\parbox[t]{1.5in}{LSL \$1,Rx\\ LSLC \$1,Ry}
-        & \parbox[t]{1.5in}{LSL \$1,Ry \\
+\parbox[t]{1.5in}{\tt LSL \$1,Rx\\ LSLC \$1,Ry}
+        & \parbox[t]{1.5in}{\tt LSL \$1,Ry \\
         LSL \$1,Rx \\
         OR.C \$1,Ry}
         & Logical shift left with carry.  Note that the

797,23 → 817,23

         That is, LSL sets the carry flag, so if we did this the other way
         with Rx before Ry, then the condition flag wouldn't have been right
         for an OR correction at the end. \\\hline
-\parbox[t]{1.5in}{LSR \$1,Rx \\ LSRC \$1,Ry}
-        & \parbox[t]{1.5in}{CLR Rz \\
+\parbox[t]{1.5in}{\tt LSR \$1,Rx \\ LSRC \$1,Ry}
+        & \parbox[t]{1.5in}{\tt CLR Rz \\
         LSR \$1,Ry \\
         LDIHI.C \$8000h,Rz \\
         LSR \$1,Rx \\
         OR Rz,Rx}
         & Logical shift right with carry \\\hline
-NEG Rx & \parbox[t]{1.5in}{XOR \$-1,Rx \\ ADD \$1,Rx} & \\\hline
-NEG.C Rx & \parbox[t]{1.5in}{MOV.C \$-1+Rx,Rx\\XOR.C \$-1,Rx} & \\\hline
-NOOP & NOOP & While there are many
+{\tt NEG Rx} & \parbox[t]{1.5in}{\tt XOR \$-1,Rx \\ ADD \$1,Rx} & \\\hline
+{\tt NEG.C Rx} & \parbox[t]{1.5in}{\tt MOV.C \$-1+Rx,Rx\\XOR.C \$-1,Rx} & \\\hline
+{\tt NOOP} & {\tt NOOP} & While there are many
         operations that do nothing, such as MOV Rx,Rx, or OR \$0,Rx, these
         operations have consequences in that they might stall the bus if
         Rx isn't ready yet.  For this reason, we have a dedicated NOOP
         instruction. \\\hline
-NOT Rx & XOR \$-1,Rx & \\\hline
-POP Rx
-        & \parbox[t]{1.5in}{LOD \$1(SP),Rx \\ ADD \$1,SP}
+{\tt NOT Rx } & {\tt XOR \$-1,Rx } & \\\hline
+{\tt POP Rx }
+        & \parbox[t]{1.5in}{\tt LOD \$1(SP),Rx \\ ADD \$1,SP}
         & Note
         that for interrupt purposes, one can never depend upon the value at
         (SP).  Hence you read from it, then increment it, lest having

824,12 → 844,14

 \end{center}\end{table}
 \begin{table}\begin{center}
 \begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
-PUSH Rx
+{\tt PUSH Rx}
         & \parbox[t]{1.5in}{SUB \$1,SP \\
         STO Rx,\$1(SP)}
-        & \\\hline
-PUSH Rx-Ry
-        & \parbox[t]{1.5in}{SUB \$n,SP \\
+        & Note that for pipelined operation, it helps to coalesce all the
+        {\tt SUB}'s into one command, and place the {\tt STO}'s right
+        after each other.\\\hline
+{\tt PUSH Rx-Ry}
+        & \parbox[t]{1.5in}{\tt SUB \$n,SP \\
         STO Rx,\$n(SP)
         \ldots \\
         STO Ry,\$1(SP)}

836,27 → 858,28

         & Multiple pushes at once only need the single subtract from the
         stack pointer.  This derived instruction is analogous to a similar one
         on the Motorola 68k architecture, although the Zip Assembler
-        does not support this instruction (yet).\\\hline
-RESET
-        & \parbox[t]{1in}{STO \$1,\$watchdog(R12)\\NOOP\\NOOP}
-        & \parbox[t]{3in}{This depends upon the peripheral base address being
+        does not support this instruction (yet).  This instruction
+        also supports pipelined memory access.\\\hline
+{\tt RESET}
+        & \parbox[t]{1in}{\tt STO \$1,\$watchdog(R12)\\NOOP\\NOOP}
+        & This depends upon the peripheral base address being
         in R12.
         Another opportunity might be to jump to the reset address from within
-        supervisor mode.}\\\hline
-RET & \parbox[t]{1.5in}{LOD \$1(SP),PC}
+        supervisor mode.\\\hline
+{\tt RET} & \parbox[t]{1.5in}{\tt LOD \$1(SP),PC}
         & Note that this depends upon the calling context to clean up the
         stack, as outlined for the JSR instruction.  \\\hline
-RET & MOV R12,PC
+{\tt RET} & {\tt MOV R12,PC}
         & This is the high(er) speed version, that doesn't touch the stack.
         As such, it doesn't suffer a stall on memory read/write to the stack.
         \\\hline
-STEP Rr,Rt
-        & \parbox[t]{1.5in}{LSR \$1,Rr \\ XOR.C Rt,Rr}
+{\tt STEP Rr,Rt}
+        & \parbox[t]{1.5in}{\tt LSR \$1,Rr \\ XOR.C Rt,Rr}
         & Step a Galois implementation of a Linear Feedback Shift Register, Rr,
                 using taps Rt \\\hline
-STO.b Rx,\$addr
-        & \parbox[t]{1.5in}{%
+{\tt STO.b Rx,\$addr}
+        & \parbox[t]{1.5in}{\tt %
         LDI \$addr,Ra \\
         LDI \$addr,Rb \\
         LSR \$2,Ra \\

864,7 → 887,7

         SUB \$32,Rb \\
         LOD (Ra),Ry \\
         AND \$0ffh,Rx \\
-        AND \$-0ffh,Ry \\
+        AND \~\$0ffh,Ry \\
         ROL Rb,Rx \\
         OR Rx,Ry \\
         STO Ry,(Ra) }

877,15 → 900,15

         of character accesses from 16 MB down to 4 MB.
         Further, this instruction implies a byte ordering,
         such as big or little endian.} \\\hline
-SWAP Rx,Ry
-        & \parbox[t]{1.5in}{
+{\tt SWAP Rx,Ry }
+        & \parbox[t]{1.5in}{\tt
         XOR Ry,Rx \\
         XOR Rx,Ry \\
         XOR Ry,Rx}
         & While no extra registers are needed, this example
         does take 3-clocks. \\\hline
-TRAP \#X
-        & \parbox[t]{1.5in}{LDI \$x,R0 \\ AND ~\$GIE,CC }
+{\tt TRAP \#X}
+        & \parbox[t]{1.5in}{\tt LDI \$x,R0 \\ AND \~\$GIE,CC }
         & This works because whenever a user lowers the \$GIE flag, it sets
         a TRAP bit within the CC register.  Therefore, upon entering the
         supervisor state, the CPU only need check this bit to know that it

898,16 → 921,16

 \end{center}\end{table}
 \begin{table}\begin{center}
 \begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
-TST Rx
-        & TST \$-1,Rx
+{\tt TST Rx}
+        & {\tt TST \$-1,Rx}
         & Set the condition codes based upon Rx.  Could also do a CMP \$0,Rx,
         ADD \$0,Rx, SUB \$0,Rx, etc, AND \$-1,Rx, etc.  The TST and CMP
         approaches won't stall future pipeline stages looking for the value
         of Rx. \\\hline
-WAIT
-        & Or \$SLEEP,CC
-        & Wait 'til interrupt.  In an interrupts disabled context, this
-        becomes a HALT instruction.
+{\tt WAIT}
+        & {\tt OR \$GIE | \$SLEEP,CC}
+        & Wait until the next interrupt, then jump to supervisor/interrupt
+        mode.
 \end{tabular}
 \caption{Derived Instructions, continued}\label{tbl:derived-4}
 \end{center}\end{table}

1073,9 → 1096,13

 memory unit is busy with the STO instruction, but otherwise this pipeline will
 stall waiting for it to complete.
-Note that even though the Wishbone bus can support pipelined accesses at
-one access per clock, only the prefetch stage can take advantage of this.
-Load and Store instructions are stuck at one wishbone cycle per instruction.
+The Zip CPU does have the capability of supporting pipelined memory access,
+but only under the following conditions: all accesses within the pipeline
+must all be reads or all be writes, all must use the same register for their
+address, and there can be no stalls or other instructions between pipelined
+memory access instructions.  Further, the offset to memory must be increasing
+by one address each instruction.  These conditions work well for saving
+registers to, or restoring them from, the stack.
 \item When waiting for a conditional memory read operation to complete
 \begin{enumerate}

1235,7 → 1262,7

 When coupled with a peripheral, the DMA controller can be configured to start
 a memory copy on an interrupt line going high.  Further, the controller can be
-configured to issue reads from (or two) the same address instead of incrementing
+configured to issue reads from (or to) the same address instead of incrementing
 the address at each clock.  The DMA completes once the total number of items
 specified (not the transfer length) have been transferred.

1402,19 → 1429,43

 \begin{table}\begin{center}
 \begin{tabular}{ll}
 {\tt swap\_out:} \\
-&        {\tt MOV -15(uSP),R1} \\
-&        {\tt STO R1,stack(R12)} \\
-&        {\tt MOV uPC,R0} \\
-&        {\tt STO R0,15(R1)} \\
-&        {\tt MOV uCC,R0} \\
-&        {\tt STO R0,14(R1)} \\
+&        {\tt MOV -15(uSP),R5} \\
+&        {\tt STO R5,stack(R12)} \\
+&        {\tt MOV uR0,R0} \\
+&        {\tt MOV uR1,R1} \\
+&        {\tt MOV uR2,R2} \\
+&        {\tt MOV uR3,R3} \\
+&        {\tt MOV uR4,R4} \\
+&        {\tt STO R0,1(R5)} {\em ; Exploit memory pipelining: }\\
+&        {\tt STO R1,2(R5)} {\em ; All instructions write to stack }\\
+&        {\tt STO R2,3(R5)} {\em ; All offsets increment by one }\\
+&        {\tt STO R3,4(R5)} {\em ; Longest pipeline is 5 cycles.}\\
+&        {\tt STO R4,5(R5)} \\
+        & \ldots {\em ; Need to repeat for all user registers} \\
+\iffalse
+&        {\tt MOV uR5,R0} \\
+&        {\tt MOV uR6,R1} \\
+&        {\tt MOV uR7,R2} \\
+&        {\tt MOV uR8,R3} \\
+&        {\tt MOV uR9,R4} \\
+&        {\tt STO R0,6(R5) }\\
+&        {\tt STO R1,7(R5) }\\
+&        {\tt STO R2,8(R5) }\\
+&        {\tt STO R3,9(R5) }\\
+&        {\tt STO R4,10(R5)} \\
+\fi
+&        {\tt MOV uR10,R0} \\
+&        {\tt MOV uR11,R1} \\
+&        {\tt MOV uR12,R2} \\
+&        {\tt MOV uCC,R3} \\
+&        {\tt MOV uPC,R4} \\
+&        {\tt STO R0,11(R5)}\\
+&        {\tt STO R1,12(R5)}\\
+&        {\tt STO R2,13(R5)}\\
+&        {\tt STO R3,14(R5)}\\
+&        {\tt STO R4,15(R5)} \\
 &       {\em ; We can skip storing the stack, uSP, since it'll be stored}\\
 &       {\em ; elsewhere (in the task structure) }\\
-&        {\tt MOV uR13,R0} \\
-&        {\tt STO R0,13(R1)} \\
-        & \ldots {\em ; Need to repeat for all user registers} \\
-&        {\tt MOV uR0,R0} \\
-&        {\tt STO R0,1(R1)} \\
 \end{tabular}
 \caption{Example Storing User Task Context}\label{tbl:context-out}
 \end{center}\end{table}

1509,17 → 1560,31

 \begin{table}\begin{center}
 \begin{tabular}{ll}
 {\tt swap\_in:} \\
-&       {\tt LOD stack(R12),R1} \\
+&       {\tt LOD stack(R12),R5} \\
 &       {\tt MOV 15(R5),uSP} \\
-&       {\tt LOD 15(R1),R0} \\
-&       {\tt MOV R0,uPC} \\
-&       {\tt LOD 14(R1),R0} \\
-&       {\tt MOV R0,uCC} \\
-&       {\tt LOD 13(R1),R0} \\
-&       {\tt MOV R0,uR12} \\
+        & {\em ; Be sure to exploit the memory pipelining capability} \\
+&       {\tt LOD 1(R5),R0} \\
+&       {\tt LOD 2(R5),R1} \\
+&       {\tt LOD 3(R5),R2} \\
+&       {\tt LOD 4(R5),R3} \\
+&       {\tt LOD 5(R5),R4} \\
+&       {\tt MOV R0,uR0} \\
+&       {\tt MOV R1,uR1} \\
+&       {\tt MOV R2,uR2} \\
+&       {\tt MOV R3,uR3} \\
+&       {\tt MOV R4,uR4} \\
         & \ldots {\em ; Need to repeat for all user registers} \\
-&       {\tt LOD 1(R1),R0} \\
-&       {\tt MOV R0,uR0} \\
+&       {\tt LOD 11(R5),R0} \\
+&       {\tt LOD 12(R5),R1} \\
+&       {\tt LOD 13(R5),R2} \\
+&       {\tt LOD 14(R5),R3} \\
+&       {\tt LOD 15(R5),R4} \\
+&       {\tt MOV R0,uR10} \\
+&       {\tt MOV R1,uR11} \\
+&       {\tt MOV R2,uR12} \\
+&       {\tt MOV R3,uCC} \\
+&       {\tt MOV R4,uPC} \\
 &       {\tt BRA return\_to\_user} \\
 \end{tabular}
 \caption{Example Restoring User Task Context}\label{tbl:context-in}

1716,9 → 1781,10

 \begin{table}\begin{center}
 \begin{bitlist}
 & R & DMA Active\\\hline
-& R & Wishbone error, transaction aborted (cleared on any write)\\\hline
+& R & Wishbone error, transaction aborted.  This bit is cleared the next time
+        this register is written to.\\\hline
 & R/W & Set to '1' to prevent the controller from incrementing the source address, '0' for normal memory copy. \\\hline
-& R/W & Set to '0' to prevent the controller from incrementing the
+& R/W & Set to '1' to prevent the controller from incrementing the
         destination address, '0' for normal memory copy. \\\hline
 \ldots 16 & W & The DMA Key.  Write a 12'hfed to these bits to activate
         any DMA transfer.  \\\hline

1795,7 → 1861,6

uPC & 31 & 32 & R/W & User Program Counter\\\hline

PIC & 32 & 32 & R/W & Primary Interrupt Controller \\\hline

WDT & 33 & 32 & R/W & Watchdog Timer\\\hline

CCHE & 34 & 32 & R/W & Manual Cache Controller\\\hline

CTRIC & 35 & 32 & R/W & Secondary Interrupt Controller\\\hline

TMRA & 36 & 32 & R/W & Timer A\\\hline

TMRB & 37 & 32 & R/W & Timer B\\\hline

1809,6 → 1874,10

 UMSTL & 45 & 32 & R/W & User memory stall counter\\\hline
 UPSTL & 46 & 32 & R/W & User Pre-Fetch Stall counter\\\hline
 UICNT & 47 & 32 & R/W & User instruction counter\\\hline
+DMACMD & 48 & 32 & R/W & DMA command and status register\\\hline
+DMALEN & 49 & 32 & R/W & DMA transfer length\\\hline
+DMARD & 50 & 32 & R/W & DMA read address\\\hline
+DMAWR & 51 & 32 & R/W & DMA write address\\\hline
 \end{reglist}
 \caption{Debug Register Addresses}\label{tbl:dbgaddrs}
 \end{center}\end{table}

2115,13 → 2184,14

         realized and at this rate may not be realized. (I've been intimidated
         by the challenge every time I've looked through those codes.)
+\iffalse
 \item While the Wishbone Bus (B4) supports a pipelined mode with single cycle
         execution, the Zip CPU is unable to exploit this parallelism. Instead,
         apart from the DMA and the pipelined prefetch, all loads and stores
         are single wishbone bus operations requiring a minimum of 3 clocks.
         (In practice, this has turned into 7-clocks.)
+        % Addressed, 20150929
-\iffalse
 \item There is no control over whether or not an instruction sets the
         condition codes--certain instructions always set the condition codes,
         other instructions never set them. This effectively limits conditional

2173,6 → 2243,17

         off, keeping the CPU lightweight?  The same holds for the prefetch
         cache.
+\item The `{\tt .V}' condition was never used in any code other than my test
+        code.  Suggest changing it to a `{\tt .LE}' condition, which seems
+        to be more useful.
+\item {\bf Consider a more traditional Instruction Cache.}  The current
+        pipelined instruction cache just reads a window of memory into
+        its cache.  If the CPU leaves that window, the entire cache is
+        invalidated.  A more traditional cache, however, might allow
+        common subroutines to stay within the cache without invalidating the
+        entire cache structure.
 \iffalse
 \item {\bf Adjust the Zip CPU so that conditional instructions do not set
         flags}, although they may explicitly set condition codes if writing

2180,13 → 2261,7

         This is a simple change to the core, and may show up in new releases.
         % Fixed, 20150918
-\fi
-\item The `{\tt .V}' condition was never used in any code other than my test
-        code.  Suggest changing it to a `{\tt .LE}' condition, which seems
-        to be more useful.
-\iffalse
 \item Add in an {\bf unpredictable branch delay slot}, so that on any branch
         the delay slot may or may not be executed before the branch.
         Instructions that do not depend upon the branch, and that should be

2226,18 → 2301,8

         as soon as the decoder knows the instruction will need the bus.
         Indeed, if done in the decode stage, this might drop the seven cycle
         access down by two cycles.
         % FIXED: 20150918
-\fi
-\item {\bf Consider a more traditional Instruction Cache.}  The current
-        pipelined instruction cache just reads a window of memory into
-        its cache.  If the CPU leaves that window, the entire cache is
-        invalidated.  A more traditional cache, however, might allow
-        common subroutines to stay within the cache without invalidating the
-        entire cache structure.
-\iffalse
 \item {\bf Very Long Instruction Word (VLIW).}  Now, to speed up operation, I
         propose that the Zip CPU instruction set be modified towards a Very
         Long Instruction Word (VLIW) implementation. In this implementation,

/zipcpu/trunk/sw/zasm/test.S

519,6 → 519,33

         cmp     r0,r7
         trap.ne r11
 #endif
+#define PIPELINE_STACK_TEST
+#ifdef  PIPELINE_STACK_TEST
+        ldi     $0x0f000,r11    // Mark our test: R11 is the value handed to trap.ne below
+        LDI     1,R0            // Seed R0 = 1
+        MOV     1(R0),R1        // Each MOV is expected to give previous+1, so that
+        MOV     1(R1),R2        //   R0..R6 hold 1..7 going into the call
+        MOV     1(R2),R3        //   (these are the values compared after the call)
+        MOV     1(R3),R4
+        MOV     1(R4),R5
+        MOV     1(R5),R6
+        JSR(pipeline_stack_test,R7)
+        CMP     1,R0            // R0..R6 must come back exactly as they went in
+        trap.ne R11             // Any mismatch traps with the test marker in R11
+        CMP     2,R1
+        trap.ne R11
+        CMP     3,R2
+        trap.ne R11
+        CMP     4,R3
+        trap.ne R11
+        CMP     5,R4
+        trap.ne R11
+        CMP     6,R5
+        trap.ne R11
+        CMP     7,R6            // Last seeded register; R7 is not checked here
+        trap.ne R11
+#endif
 // Return success / Test the trap interrupt
         clr     r11
         trap    r11

548,6 → 575,53

         POP(R1,SP)
         RET
 #endif
+#ifdef  PIPELINE_STACK_TEST
+pipeline_stack_test:            // Save R0-R12 on the stack, invert them, then reload them
+        SUB     13,SP           // Make room for 13 words
+        STO     R0,1(SP)        // 13 back-to-back stores: same base register (SP),
+        STO     R1,2(SP)        //   offsets rising by one each instruction
+        STO     R2,3(SP)
+        STO     R3,4(SP)
+        STO     R4,5(SP)
+        STO     R5,6(SP)
+        STO     R6,7(SP)
+        STO     R7,8(SP)
+        STO     R8,9(SP)
+        STO     R9,10(SP)
+        STO     R10,11(SP)
+        STO     R11,12(SP)
+        STO     R12,13(SP)
+        XOR     -1,R0           // Invert every saved register, so the CMP checks at the
+        XOR     -1,R1           //   call site fail unless the loads below restore them
+        XOR     -1,R2
+        XOR     -1,R3
+        XOR     -1,R4
+        XOR     -1,R5
+        XOR     -1,R6
+        XOR     -1,R7
+        XOR     -1,R8
+        XOR     -1,R9
+        XOR     -1,R10
+        XOR     -1,R11
+        XOR     -1,R12
+        LOD     1(SP),R0        // 13 back-to-back loads from the same slots, same base,
+        LOD     2(SP),R1        //   offsets rising by one each instruction
+        LOD     3(SP),R2
+        LOD     4(SP),R3
+        LOD     5(SP),R4
+        LOD     6(SP),R5
+        LOD     7(SP),R6
+        LOD     8(SP),R7
+        LOD     9(SP),R8
+        LOD     10(SP),R9
+        LOD     11(SP),R10
+        LOD     12(SP),R11
+        LOD     13(SP),R12
+        ADD     13,SP           // Release the 13 words reserved on entry
+        LOD     1(SP),PC        // Return by loading PC from 1(SP); presumably the address
+                                //   stored there by the JSR macro -- TODO confirm
+#endif // PIPELINE_STACK_TEST
         fill    512,0
 stack:  // Must point to a valid word initially
         word    0