1 |
145 |
lanttu |
\documentclass[a4paper,10pt,oneside,final]{article}
|
2 |
|
|
%\documentclass[12pt,a4paper,english]{tutthesis}
|
3 |
|
|
%\documentclass[11pt,final]{tutdrthesis}
|
4 |
|
|
%\documentclass[11pt,final]{IEEETran}
|
5 |
|
|
|
6 |
|
|
% Load the required packages
|
7 |
|
|
\usepackage[dvips]{graphicx}
|
8 |
|
|
\usepackage{enumerate}
|
9 |
|
|
\usepackage[UKenglish]{babel}
|
10 |
|
|
\usepackage{cite}
|
11 |
|
|
\usepackage{subfigure}
|
12 |
|
|
|
13 |
|
|
% 'pslatex' is otherwise equal to 'times'
|
14 |
|
|
% but courier font is narrower
|
15 |
|
|
\usepackage{pslatex}
|
16 |
|
|
%\usepackage{times}
|
17 |
|
|
|
18 |
|
|
% 2 pkgs for Scandinavian alphabets
|
19 |
|
|
\usepackage[T1]{fontenc}
|
20 |
|
|
\usepackage[latin1]{inputenc}
|
21 |
|
|
|
22 |
|
|
\usepackage{listings}
|
23 |
|
|
\usepackage{color}
|
24 |
|
|
\definecolor{gray95}{gray}{.95}
|
25 |
|
|
|
26 |
|
|
|
27 |
|
|
|
28 |
|
|
\lstdefinestyle{ccc}
|
29 |
|
|
{
|
30 |
|
|
numbers=none,
|
31 |
|
|
basicstyle=\small\ttfamily,
|
32 |
|
|
keywordstyle=\bf\color[rgb]{0,0,0},
|
33 |
|
|
%commentstyle=\color[rgb]{0.133,0.545,0.133},
|
34 |
|
|
stringstyle=\color[rgb]{0.627,0.126,0.941},
|
35 |
|
|
backgroundcolor=\color{white},
|
36 |
|
|
frame=tb, %frame= lrtb,
|
37 |
|
|
framerule=0.5pt,
|
38 |
|
|
linewidth=\textwidth,
|
39 |
|
|
%aboveskip=-4.0pt,
|
40 |
|
|
%belowskip=-4.0pt,
|
41 |
|
|
lineskip=-5.0pt,
|
42 |
|
|
}
|
43 |
|
|
|
44 |
|
|
|
45 |
|
|
% style for transgen xml listings
|
46 |
|
|
\lstdefinestyle{a1listing}
|
47 |
|
|
{
|
48 |
|
|
numbers=none,
|
49 |
|
|
language=bash,
|
50 |
|
|
basicstyle=\small\bf\ttfamily,
|
51 |
|
|
emphstyle=\color[rgb]{0.0, 0.7, 0.3},
|
52 |
|
|
keywordstyle=\color[rgb]{0.0, 0.0, 1.0},
|
53 |
|
|
commentstyle=\color[rgb]{0.8, 0.0, 0.0},
|
54 |
|
|
stringstyle=\color[rgb]{0.737, 0.560, 0.560},
|
55 |
|
|
backgroundcolor=\color{gray95},
|
56 |
|
|
frame= lrtb,
|
57 |
|
|
framerule=0.5pt,
|
58 |
|
|
linewidth=\textwidth,
|
59 |
|
|
}
|
60 |
|
|
|
61 |
|
|
% console listings style
|
62 |
|
|
\lstdefinestyle{console}
|
63 |
|
|
{
|
64 |
|
|
numbers=none,
|
65 |
|
|
basicstyle=\small\bf\ttfamily,
|
66 |
|
|
backgroundcolor=\color{gray95},
|
67 |
|
|
frame=lrtb,
|
68 |
|
|
framerule=0.5pt,
|
69 |
|
|
linewidth=\textwidth,
|
70 |
|
|
}
|
71 |
|
|
|
72 |
|
|
|
73 |
|
|
|
74 |
|
|
|
75 |
|
|
% 2 Completely strange definitions
|
76 |
|
|
\newcommand{\longpage}{\enlargethispage*{100cm} \pagebreak}
|
77 |
|
|
\newcommand{\nohyphens}{\hyphenpenalty=10000\exhyphenpenalty=10000\relax}
|
78 |
|
|
|
79 |
|
|
|
80 |
|
|
% Exceptional hyphenation patterns, separated by spaces
|
81 |
|
|
\hyphenation{Sal-mi-nen Kan-gas Rii-hi-mä-ki Kuu-si-lin-na
|
82 |
|
|
Hä-mä-läi-nen Kuk-ka-la HIBI TUTMAC Koski}
|
83 |
|
|
% \hyphenation{de-vel-oped pro-vides multi-stage Rctrl}
|
84 |
|
|
|
85 |
|
|
% Try to prevent figures from being placed alone on a page without any text
|
86 |
|
|
% http://dcwww.camd.dtu.dk/~schiotz/comp/LatexTips/LatexTips.html
|
87 |
|
|
% Be careful not to make \floatpagefraction larger than \topfraction
|
88 |
|
|
\renewcommand{\topfraction}{0.85}
|
89 |
|
|
\renewcommand{\textfraction}{0.1}
|
90 |
|
|
\renewcommand{\floatpagefraction}{0.75}
|
91 |
|
|
|
92 |
|
|
|
93 |
|
|
%
|
94 |
|
|
% Define author(s) and component's name
|
95 |
|
|
%
|
96 |
|
|
\def\defauthor{Salminen, Hämäläinen}
|
97 |
|
|
\def\deftitle{HIBI v.3 \\Reference Manual}
|
98 |
|
|
|
99 |
|
|
\author{\defauthor}
|
100 |
|
|
\title{\deftitle}
|
101 |
|
|
|
102 |
|
|
\usepackage{fancyhdr}
|
103 |
|
|
\pagestyle{fancy}
|
104 |
|
|
\lhead{\bfseries Department of Computer Systems\\
|
105 |
|
|
Faculty of Computing and Electrical Engineering}
|
106 |
|
|
\chead{}
|
107 |
|
|
\rhead{\bfseries \deftitle}
|
108 |
|
|
\lfoot{\thepage}
|
109 |
|
|
\cfoot{}
|
110 |
|
|
\rfoot{TUT}
|
111 |
|
|
%\rfoot{\includegraphics[height=1.0cm]{../Fig/Eps/tut_logo.eps}}
|
112 |
|
|
\renewcommand{\headrulewidth}{0.4pt}
|
113 |
|
|
\renewcommand{\footrulewidth}{0.4pt}
|
114 |
|
|
|
115 |
|
|
\def\deftablecolora{blue!10!white}
|
116 |
|
|
\def\deftablecolorb{white}
|
117 |
|
|
|
118 |
|
|
\begin{document}
|
119 |
|
|
|
120 |
|
|
|
121 |
|
|
%\maketitle
|
122 |
|
|
%\thispagestyle{empty}
|
123 |
|
|
|
124 |
|
|
\begin{titlepage}
|
125 |
|
|
\begin{center}
|
126 |
|
|
|
127 |
|
|
\vspace{6.0cm}
|
128 |
|
|
\begin{center}
|
129 |
|
|
\includegraphics[height=1.0cm]{../Fig/Eps/tut_logo.eps}
|
130 |
|
|
\end{center}
|
131 |
|
|
\textsc{Faculty of Computing and Electrical Engineering}\\[1.0cm]
|
132 |
|
|
\textsc{Department of Computer Systems}\\[1.0cm]
|
133 |
|
|
%\textsc{\LARGE Tampere University of Technology}\\[1.0cm]
|
134 |
|
|
%\textsc{\Large Faculty of Computing and Electrical Engineering}\\[1.0cm]
|
135 |
|
|
%\textsc{\Large Department of Computer Systems}\\[1.0cm]
|
136 |
|
|
|
137 |
|
|
\vspace{6.0cm}
|
138 |
|
|
\hrule
|
139 |
|
|
\vspace{0.4cm}
|
140 |
|
|
{ \huge \bfseries Heterogeneous IP Block Interconnection (HIBI) \\ version 3 \\ [0.5cm]Reference Manual}
|
141 |
|
|
\vspace{0.4cm}
|
142 |
|
|
\hrule
|
143 |
|
|
|
144 |
|
|
%\vspace{2.0cm}
|
145 |
|
|
|
146 |
|
|
\vfill
|
147 |
|
|
|
148 |
|
|
\begin{minipage}{0.4\textwidth}
|
149 |
|
|
\begin{flushleft} \large
|
150 |
|
|
\emph{Author:}\\
|
151 |
|
|
Erno Salminen, \\Timo Hämäläinen
|
152 |
|
|
\end{flushleft}
|
153 |
|
|
\end{minipage}
|
154 |
|
|
\begin{minipage}{0.4\textwidth}
|
155 |
|
|
\begin{flushright} \large
|
156 |
|
|
\emph{Updated:} \\
|
157 |
|
|
\today
|
158 |
|
|
\end{flushright}
|
159 |
|
|
\end{minipage}
|
160 |
|
|
|
161 |
|
|
\end{center}
|
162 |
|
|
\end{titlepage}
|
163 |
|
|
|
164 |
|
|
|
165 |
|
|
%\title{HIBI data sheet - September 2011}
|
166 |
|
|
%\author{Erno Salminen}
|
167 |
|
|
%\begin{document}
|
168 |
|
|
% \onecolumn
|
169 |
|
|
%\include{cover}
|
170 |
|
|
|
171 |
|
|
|
172 |
|
|
\setcounter{secnumdepth}{-1}
|
173 |
|
|
|
174 |
|
|
|
175 |
|
|
% Add some space between lines
|
176 |
|
|
\linespread{1.25}\normalsize
|
177 |
|
|
|
178 |
|
|
\tableofcontents
|
179 |
|
|
|
180 |
|
|
|
181 |
|
|
\newpage \thispagestyle{empty}
|
182 |
|
|
\listoffigures
|
183 |
|
|
\listoftables
|
184 |
|
|
|
185 |
|
|
% \twocolumn
|
186 |
|
|
|
187 |
|
|
\newpage \thispagestyle{empty}
|
188 |
|
|
\setcounter{secnumdepth}{2}
|
189 |
|
|
|
190 |
|
|
|
191 |
|
|
\section{Introduction}
|
192 |
|
|
\label{ch:hibi}
|
193 |
|
|
|
194 |
|
|
|
195 |
|
|
This data sheet presents the third version of \textit{Heterogeneous IP
|
196 |
|
|
Block Interconnection} (HIBI). HIBI is intended for integrating
|
197 |
|
|
coarse-grain components such as intellectual property blocks that have
|
198 |
|
|
size of thousands of gates, see \cite{salminen04} for examples.
|
199 |
|
|
Topology, arbitration and data transfers are presented first. After
|
200 |
|
|
that, data buffering and the structure of wrapper component are
|
201 |
|
|
discussed. Finally, the developed runtime configuration is presented
|
202 |
|
|
followed by comparison to the previous version of HIBI.
|
203 |
|
|
|
204 |
|
|
|
205 |
|
|
|
206 |
|
|
HIBI is a communication network designed for System-on-Chips. It can
|
207 |
|
|
be used both in FPGA and ASIC designs (field-programmable gate-array,
|
208 |
|
|
application-specific integrated circuit). Fig.~\ref{fig:soc_concept}
|
209 |
|
|
shows an example SoC at conceptual level. There are many different
|
210 |
|
|
types of IP blocks (intellectual property), namely CPU (central
|
211 |
|
|
processing unit) for executing software, memories and IP blocks that
|
212 |
|
|
are either fixed function accelerators or interfaces to external
|
213 |
|
|
components. All these are connected using an on-chip network.
|
214 |
|
|
|
215 |
|
|
|
216 |
|
|
\begin{figure} [b]
|
217 |
|
|
\begin{center}
|
218 |
|
|
{\includegraphics[width=0.60\textwidth]{../Fig/Eps/fig_soc_concept.eps}}
|
219 |
|
|
\caption{Conceptual structure of system-on-chip}
|
220 |
|
|
\label{fig:soc_concept}
|
221 |
|
|
\end{center}
|
222 |
|
|
\end{figure}
|
223 |
|
|
|
224 |
|
|
\subsection{Main points}
|
225 |
|
|
The major design choices for HIBI were
|
226 |
|
|
\begin{itemize}
|
227 |
|
|
\item IP-block granularity for functional units
|
228 |
|
|
\item Application independent interface to allow re-use of processors and IP-blocks
|
229 |
|
|
\item Communication and computation separated
|
230 |
|
|
\item Communication network used in all transfers, no ad-hoc wires between IPs
|
231 |
|
|
\item Support local clock domains for IP granularity
|
232 |
|
|
\end{itemize}
|
233 |
|
|
|
234 |
|
|
A parameterizable HW component, called HIBI wrapper, is used to
|
235 |
|
|
construct modular, hierarchical bus structures with distributed
|
236 |
|
|
arbitration and multiple clock domains as shown in
|
237 |
|
|
Fig.~\ref{fig:hierarchy} (explained later in detail). This simplifies
|
238 |
|
|
design and allows reuse since the same wrapper can always be utilized.
|
239 |
|
|
Configuration takes place both at synthesis time (e.g. data width and
|
240 |
|
|
buffer sizes) and on runtime (arbitration parameters).
|
241 |
|
|
|
242 |
|
|
In addition, since we are targeting also FPGAs, there are some additional constraints
|
243 |
|
|
\begin{itemize}
|
244 |
|
|
\item keep the number of wires low - to avoid exhausting routing resources
|
245 |
|
|
\item avoid global connections - to avoid long combinatorial routing delays
|
246 |
|
|
\item avoid 3-state wires - to simplify testing and synthesis (most FPGAs allow three-state logic only in I/O pins)
|
247 |
|
|
\end{itemize}
|
248 |
|
|
|
249 |
|
|
|
250 |
|
|
\subsection{Versions}
|
251 |
|
|
The development of HIBI \cite{kuusilinna98, lahtinen02, lahtinen04,
|
252 |
|
|
salminen10} started in 1997 in Tampere University of Technology.
|
253 |
|
|
Currently, there are 3 versions of HIBI, denoted as v.1--v.3. However,
|
254 |
|
|
certain basics have remained unchanged. Hence, in the remainder the
|
255 |
|
|
version number is omitted unless it is necessary.
|
256 |
|
|
|
257 |
|
|
In version 2, the biggest changes were removing tri-state logic and
|
258 |
|
|
increasing modularity and configurability.
|
259 |
|
|
|
260 |
|
|
For version 3, address decoder logic was modified to simplify
|
261 |
|
|
usage. Furthermore, the tx and rx state machines were re-factored,
|
262 |
|
|
which also necessitated minor change in bus timing. These latter FSM
|
263 |
|
|
changes do not affect the IP, though.
|
264 |
|
|
|
265 |
|
|
|
266 |
|
|
|
267 |
|
|
|
268 |
|
|
\section {HIBI topology}
|
269 |
|
|
|
270 |
|
|
\begin{figure*}
|
271 |
|
|
\begin{center}
|
272 |
|
|
{\includegraphics[width=0.85\textwidth]{../Fig/Eps/fig_topo_hibi_hierarchy.eps}}
|
273 |
|
|
\caption{Example of a hierarchical HIBI network with multiple clock domains and bus segments}
|
274 |
|
|
\label{fig:hierarchy}
|
275 |
|
|
\end{center}
|
276 |
|
|
\end{figure*}
|
277 |
|
|
|
278 |
|
|
|
279 |
|
|
The topology in HIBI is not fixed, but configurable by the
|
280 |
|
|
designer. HIBI network consists of wrappers, bus segments, and
|
281 |
|
|
bridges. These are the basic building blocks from which the whole
|
282 |
|
|
network is constructed and configured. All wrappers in the system are
|
283 |
|
|
instantiated from the same parameterizable HDL (HW description
|
284 |
|
|
language) entity and bridges are constructed by connecting two
|
285 |
|
|
wrappers together. If the connected segments use different data
|
286 |
|
|
widths, the bridges are responsible for the data width adaptation.
|
287 |
|
|
|
288 |
|
|
All wrappers can act both as a \textit{master} and a
|
289 |
|
|
\textit{slave}. Masters can initiate transfers and slaves can only
|
290 |
|
|
respond. In many buses, most units operate in one mode only and only
|
291 |
|
|
few in both modes. In the most simple case, there is only one segment and
|
292 |
|
|
the topology is hence a single shared bus. However, HIBI network can
|
293 |
|
|
have multiple segments which form a hierarchical bus
|
294 |
|
|
structure. Segments are connected together using bridges. Bridges
|
295 |
|
|
increase latency but, on the other hand, hierarchical structure allows
|
296 |
|
|
multiple parallel transactions. Bridges are simply constructed from 2
|
297 |
|
|
wrappers.
|
298 |
|
|
|
299 |
|
|
For the IP, the wrapper offers FIFO-based (first in, first out)
|
300 |
|
|
interface, as depicted in Fig. On the network side, all signals inside a
|
301 |
|
|
segment are shared between wrappers and no dedicated point-to-point
|
302 |
|
|
signals are used. Arbitration decides which wrapper (or bridge)
|
303 |
|
|
controls the segment and the utilized arbitration algorithms
|
304 |
|
|
are distributed to wrappers without any central controller.
|
305 |
|
|
|
306 |
|
|
\subsection{Example of hierarchical topology}
|
307 |
|
|
|
308 |
|
|
Bus performance can be scaled up by using bridges. Segments having
|
309 |
|
|
only simple peripheral devices can have a slow and narrow bus while
|
310 |
|
|
the main processing parts have higher capacity buses.
|
311 |
|
|
|
312 |
|
|
Fig.~\ref{fig:hierarchy} depicts an irregular HIBI network. The
|
313 |
|
|
example has a point-to-point link ($Seg A$), hierarchical bus ($Seg B$
|
314 |
|
|
and $SegC$), and multibus topology ($Seg C$ and $SegD$). Furthermore,
|
315 |
|
|
$Seg B$ is wider than other segments and thus offers greater
|
316 |
|
|
bandwidth. In the multibus configuration, each IP must decide which
|
317 |
|
|
bus to use while sending. Note that $Seg A$ could be implemented
|
318 |
|
|
without wrappers since there is no need for arbitration.
|
319 |
|
|
|
320 |
|
|
The example shows four clock domains. Agents in $Seg A$ and $SegB$ are
|
321 |
|
|
inside one domain and HIBI wrappers on $Seg C$ are in one domain.
|
322 |
|
|
However, two IPs in the top right corner use a different clock than the
|
323 |
|
|
wrappers of $Seg C$. The IPs in the bottom right corner and all
|
324 |
|
|
wrappers in $Seg D$ are in one domain. The number of clock domains is
|
325 |
|
|
not otherwise restricted but all wrappers in one bus segment must use
|
326 |
|
|
the same clock. Handshaking between the clock domains is done in the
|
327 |
|
|
IP-wrapper interface or inside the bridge \cite{kulmala06b,
|
328 |
|
|
kulmala06e}. This allows the construction of GALS systems. The
|
329 |
|
|
example shows only one bridge but HIBI does not restrict either the
|
330 |
|
|
number of bridges or hierarchy levels in contrast to many bus
|
331 |
|
|
architectures.
|
332 |
|
|
|
333 |
|
|
\subsection{Switching}
|
334 |
|
|
Transfers inside a bus segment are circuit-switched and use a common
|
335 |
|
|
clock due to (current) implementation of the distributed arbitration.
|
336 |
|
|
However, HIBI bridges utilize switching principle that resembles
|
337 |
|
|
packet-switching so that bus segments are not circuit-switched
|
338 |
|
|
together. Instead, the data is stored inside the bridge until it gets
|
339 |
|
|
an access to the other segment. The data is forwarded to next segment
|
340 |
|
|
as soon as possible like in wormhole routing. However, no guarantees
|
341 |
|
|
are given for the minimum length of continuous transfer. If the
|
342 |
|
|
bridge cannot buffer all the data, the transfer is interrupted and the
|
343 |
|
|
source segment is free for other transfers. The interrupted wrapper
|
344 |
|
|
will continue the transfer on its next turn. It is also possible that
|
345 |
|
|
a bridge buffers parts from multiple transfers.
|
346 |
|
|
|
347 |
|
|
|
348 |
|
|
|
349 |
|
|
\section {Data transfer operations}
|
350 |
|
|
|
351 |
|
|
In HIBI, all transfers are bursts. In practice, there is always 1
|
352 |
|
|
address word followed by n data words. The maximum n is a wrapper-specific
|
353 |
|
|
arbitration parameter. HIBI v.2 used multiplexed address and data
|
354 |
|
|
lines, but HIBI v.3 allows transmitting them in parallel. Due to
|
355 |
|
|
multiplexed addr/data lines, it is beneficial to send many data words to a
|
356 |
|
|
single address. This is quite different from ``traditional'' memory
|
357 |
|
|
accesses, with address and data at the same time. Hence, the
|
358 |
|
|
destination IP should keep track of received data count, e.g. TUT's
|
359 |
|
|
SDRAM controller can do this to avoid excess transmitting addr + data
|
360 |
|
|
pairs.
|
361 |
|
|
|
362 |
|
|
The transfers are pipelined with arbitration, and hence the next
|
363 |
|
|
transfer can start immediately when the previous ends. The protocol on
|
364 |
|
|
the bus side is optimized so that no wait cycles are allowed
|
365 |
|
|
during a transfer. This means that if the sender runs out of data or the
|
366 |
|
|
receiver does not accept it fast enough, the transfer is
|
367 |
|
|
interrupted. On the next arbitration turn, the wrapper continues it
|
368 |
|
|
automatically. Note that IP may transfer data at pace it wishes. IP
|
369 |
|
|
has only to ensure that there is space in TX FIFO while writing and
|
370 |
|
|
that RX FIFO is not empty while reading.
|
371 |
|
|
|
372 |
|
|
In order to increase bus utilization, HIBI uses so called
|
373 |
|
|
split-transactions in read operation. It means that single read
|
374 |
|
|
operation is split into two phases: request and response. The bus
|
375 |
|
|
segment is released while the addressed IP handles the read request
|
376 |
|
|
and prepares its response. The other wrappers may use bus during that
|
377 |
|
|
period and this increases the overall performance, although a single
|
378 |
|
|
read becomes a little slower due to an additional arbitration round.
|
379 |
|
|
|
380 |
|
|
\begin{figure}
|
381 |
|
|
\begin{center}
|
382 |
|
|
{\includegraphics[width=0.7\textwidth]{../Fig/Eps/fig_basic_tx.eps}}
|
383 |
|
|
\caption{Example of read and write operations.}
|
384 |
|
|
\label{fig:basic_tx}
|
385 |
|
|
\end{center}
|
386 |
|
|
\end{figure}
|
387 |
|
|
|
388 |
|
|
\begin{figure}
|
389 |
|
|
\begin{center}
|
390 |
|
|
{\includegraphics[width=0.4\textwidth]{../Fig/Eps/fig_basic_tx2.eps}}
|
391 |
|
|
\caption{Basic transactions are write and read.}
|
392 |
|
|
\label{fig:basic_tx2}
|
393 |
|
|
\end{center}
|
394 |
|
|
\end{figure}
|
395 |
|
|
|
396 |
|
|
|
397 |
|
|
Write operation
|
398 |
|
|
\begin{itemize}
|
399 |
|
|
\item Includes destination address
|
400 |
|
|
\item Data is sent in words (=HIBI bus width)
|
401 |
|
|
\item Several words can follow: all will be sent to the same destination address
|
402 |
|
|
\end{itemize}
|
403 |
|
|
Read operation
|
404 |
|
|
\begin{itemize}
|
405 |
|
|
\item Includes exactly two words: destination address and return address (where to put the data)
|
406 |
|
|
\item Data is received in words
|
407 |
|
|
\item Several words can be received (all to same return address)
|
408 |
|
|
\begin{itemize}
|
409 |
|
|
\item No handshaking: data is transmitted/received when bus, sender, or receiver are available
|
410 |
|
|
\item No acknowledgements or flow control
|
411 |
|
|
\end{itemize}
|
412 |
|
|
\end{itemize}
|
413 |
|
|
|
414 |
|
|
|
415 |
|
|
Figs.~\ref{fig:basic_tx} and~\ref{fig:basic_tx2} depict the two basic
|
416 |
|
|
transfers: sending the read request, write, and the response to
|
417 |
|
|
read. IP can send multiple read requests before the previous ones have
|
418 |
|
|
completed. It is the responsibility of the requestor to keep track of
|
419 |
|
|
which response belongs to which request. This can be implemented with
|
420 |
|
|
appropriate use of return addresses. The reader does not get data any
|
421 |
|
|
faster but the advantage is that the shared medium is available for
|
422 |
|
|
other agents in the middle of the transmission process and
|
423 |
|
|
consequently the achieved total throughput increases. In
|
424 |
|
|
packet-switched networks the split-transactions are commonly used and
|
425 |
|
|
also in modern bus protocols, such as AMBA.
|
426 |
|
|
|
427 |
|
|
Since there is exactly one path between each source and destination,
|
428 |
|
|
all data is guaranteed to arrive in-order and hence no reordering
|
429 |
|
|
buffers are needed at the receiver. Data can be sent with different
|
430 |
|
|
relative priorities. High priority data, such as control messages,
|
431 |
|
|
bypass the normal data transfers inside the wrappers and bridges
|
432 |
|
|
resulting in smaller latency. This does not change the timing of bus
|
433 |
|
|
reservations, but it selects what is transferred first.
|
434 |
|
|
|
435 |
|
|
\subsection{HIBI Basic Transaction Motivation}
|
436 |
|
|
HIBI was motivated by streaming applications where continuous flow of
|
437 |
|
|
data is transmitted between IPs. Destinations are ports rather than
|
438 |
|
|
random accessed memory locations. Hence, HIBI is not natively a
|
439 |
|
|
processor memory bus but can be used for it as well.
|
440 |
|
|
|
441 |
|
|
HIBI does not implement end-to-end flow control but the IPs must do
|
442 |
|
|
it explicitly. The FIFO buffers on the rx and tx sides may get full if
|
443 |
|
|
the receiver does not eject data fast enough, and this will throttle
|
444 |
|
|
the transmitter as well. The wrappers take care of retransmission at
|
445 |
|
|
the link level. (HIBI v.1 dropped data if the receiving buffer got
|
446 |
|
|
full but usage of v.1 is not recommended anymore).
|
447 |
|
|
|
448 |
|
|
|
449 |
|
|
\begin{figure*}
|
450 |
|
|
\begin{center}
|
451 |
|
|
{\includegraphics[width=0.8\textwidth]{../Fig/Eps/fig_tx_steps.eps}}
|
452 |
|
|
\caption{Logical steps that IP does during transaction.}
|
453 |
|
|
\label{fig:tx_steps}
|
454 |
|
|
\end{center}
|
455 |
|
|
\end{figure*}
|
456 |
|
|
|
457 |
|
|
|
458 |
|
|
Fig.~\ref{fig:tx_steps} shows the steps that IP needs to take when
|
459 |
|
|
communicating using HIBI. On the left, IP sends data when the TX FIFO
|
460 |
|
|
is not full. It must assign data, address valid (strobe), command, and
|
461 |
|
|
write enable signals at the same time. When receiving data, IP first
|
462 |
|
|
checks whether the incoming value is an address or a data word. This is done by
|
463 |
|
|
examining the address valid signal. One word is removed from the FIFO
|
464 |
|
|
on every clock cycle when receiver assigns read enable signal. Next,
|
465 |
|
|
IP must check whether the operation is a write or a read. In case of write, it
|
466 |
|
|
stores the incoming data to location defined by the address. In case
|
467 |
|
|
of read, the second word denotes the return address. It is the
|
468 |
|
|
address, where the read data word must be transmitted.
|
469 |
|
|
|
470 |
|
|
\section{Addressing}
|
471 |
|
|
|
472 |
|
|
All IP-blocks have unique address and register space defined at design
|
473 |
|
|
time and every transfer starts with single destination address.
|
474 |
|
|
Source identification is not included in the basic transfer. Hence, either
|
475 |
|
|
|
476 |
|
|
a) Use data payload to define source, e.g. first word in a data packet
|
477 |
|
|
|
478 |
|
|
b) Use unique address inside IP block for each source (IP knows from
|
479 |
|
|
the destination address the sender)
|
480 |
|
|
|
481 |
|
|
Every wrapper has a set of addresses and they are set with a VHDL generic
|
482 |
|
|
(automatic by Kactus). Wrappers may have varying address space sizes,
|
483 |
|
|
e.g. simple UART has only 2 addresses whereas memory has 16K
|
484 |
|
|
addresses. Incoming addresses go through the receiving wrapper to the
|
485 |
|
|
receiving IP and it can identify the incoming data by its address. For
|
486 |
|
|
example, the uppermost bits define which IP is addressed and the
|
487 |
|
|
lowermost define the register of that IP.
|
488 |
|
|
|
489 |
|
|
There are two ways to set addresses:
|
490 |
|
|
1. manually
|
491 |
|
|
|
492 |
|
|
2. A generator script in Kactus tool does this automatically according
|
493 |
|
|
to system specification
|
494 |
|
|
|
495 |
|
|
IP may write arbitrarily long bursts to wrapper. Perhaps only one
|
496 |
|
|
address in the beginning followed by arbitrary number of data
|
497 |
|
|
words. Moreover, IP writes data in arbitrary pace to wrapper. There
|
498 |
|
|
can be any number of idle cycles between data words. Therefore, the
|
499 |
|
|
bursts sent by the IP do not necessarily have the same length in the
|
500 |
|
|
bus (between wrappers). For example, wrapper may split long IP-transfer
|
501 |
|
|
into multiple bus transfers if the arbitration algorithm gives
|
502 |
|
|
ownership to another wrapper in the middle. Each part of the transfer
|
503 |
|
|
starts with the same address as previous. On the other hand, a
|
504 |
|
|
wrapper may send many short IP-transfers consecutively at one turn.
|
505 |
|
|
|
506 |
|
|
These properties have two consequences:
|
507 |
|
|
|
508 |
|
|
1. Bursts from multiple source IPs will be interleaved
|
509 |
|
|
|
510 |
|
|
2. Destination may get different number of addresses than sender.
|
511 |
|
|
|
512 |
|
|
Note that the destination IP does not know the sender unless it is
|
513 |
|
|
separately encoded into data or address.
|
514 |
|
|
|
515 |
|
|
|
516 |
|
|
\subsection{HIBI destination addresses and channels}
|
517 |
|
|
|
518 |
|
|
In HIBI v.2, all transfers are bursts, i.e. address is transmitted
|
519 |
|
|
only in the beginning of the transfer and it is followed by one or
|
520 |
|
|
more data words. The maximum burst length is wrapper-specific. HIBI
|
521 |
|
|
uses mainly two-level addressing scheme: the upper bits of the address
|
522 |
|
|
identify the target terminal (e.g. $destination_0$) whereas the lower
|
523 |
|
|
bits define the additional identifier. This identifier can be used
|
524 |
|
|
either as an address to local memory, to select the correct reception
|
525 |
|
|
channel on DMA, to identify the source of the data, or to select
|
526 |
|
|
requested service. Certain packet-switched networks (at least those
|
527 |
|
|
implemented in this work) allow only one address per terminal. In that
|
528 |
|
|
case, the second level address must increase the header length.
|
529 |
|
|
|
530 |
|
|
|
531 |
|
|
HIBI destination addresses are
|
532 |
|
|
|
533 |
|
|
1. internal registers
|
534 |
|
|
|
535 |
|
|
2. ports (to/from IPs internal logic)
|
536 |
|
|
|
537 |
|
|
3. IPs memory locations transparent to outside
|
538 |
|
|
|
539 |
|
|
Burst transfers use channels (or ports) and IP block must perform
|
540 |
|
|
addressing (increment) internally since all data is sent to one
|
541 |
|
|
address. If IP's memory is transparent, the address seen outside
|
542 |
|
|
includes also IP-block address (e.g. in address 0xB100, 0xB000 defines
|
543 |
|
|
the target IP and 0x100 internal memory).
|
544 |
|
|
|
545 |
|
|
|
546 |
|
|
\begin{figure}
|
547 |
|
|
\begin{center}
|
548 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/fig_chan_addr.eps}}
|
549 |
|
|
\caption{Relation between addresses and channels.}
|
550 |
|
|
\label{fig:chan_addr}
|
551 |
|
|
\end{center}
|
552 |
|
|
\end{figure}
|
553 |
|
|
|
554 |
|
|
|
555 |
|
|
|
556 |
|
|
HIBI transfers can be abstracted as channels at IP-block side (but not
|
557 |
|
|
formally specified how). Easiest way to separate channels is to use
|
558 |
|
|
unique HIBI addresses. It is an IP/system-level design issue to give
|
559 |
|
|
meaning to the channels. For example, accelerator receives data from
|
560 |
|
|
CPU0 via channel 0 and from CPU1 via channel 1 and so on. Basic HIBI
|
561 |
|
|
transactions are used to handle possible flow control and handshaking
|
562 |
|
|
in addition to transfers. Fig.~\ref{fig:chan_addr} shows an example
|
563 |
|
|
with 6 channels (addressing style of HIBI v.2).
|
564 |
|
|
|
565 |
|
|
Note that all incoming channels 4-6 have the same 4 upper bits in
|
566 |
|
|
their addresses. In other words, the example uses a convention that
|
567 |
|
|
the base address of IP1 is 0xC00 and therefore its uppermost address
|
568 |
|
|
is implicitly 0xCFF. The channels can be easily distinguished from the
|
569 |
|
|
lowest address bits. In HIBI v.3 the addressing is defined using two
|
570 |
|
|
parameters: start and end address. Designer can use the same addresses
|
571 |
|
|
as in HIBI v.2 based systems, but this scheme allows more freedom in
|
572 |
|
|
address definitions, which is especially beneficial in hierarchical
|
573 |
|
|
systems.
|
574 |
|
|
|
575 |
|
|
\subsection{Implementing flow control}
|
576 |
|
|
|
577 |
|
|
Flow control and handshaking must be implemented in IP-blocks. In practice, this leads to IP-block specific methods which must be carefully specified at design time.
|
578 |
|
|
Minimum issues to be agreed
|
579 |
|
|
\begin{enumerate}
|
580 |
|
|
\item Sender identification (e.g. unique channel address ties IP block and purpose together)
|
581 |
|
|
\item Transfer size
|
582 |
|
|
\item Size unit in addressing (bytes/words)
|
583 |
|
|
\item Are byte enables utilized
|
584 |
|
|
\item Messages for non-posted transactions (Acknowledgements to
|
585 |
|
|
write/read)
|
586 |
|
|
\end{enumerate}
|
587 |
|
|
|
588 |
|
|
\subsection{Example: Overlapping and breaking transfers}
|
589 |
|
|
|
590 |
|
|
It was noted that the transfers may split due to arbitration. Example
|
591 |
|
|
in Fig.~\ref{fig:addr_interleaving} clarifies the phenomenon. Let us
|
592 |
|
|
assume that IP 1 and IP 2 send data to IP 3. We notice that IP 1 gets
|
593 |
|
|
the first turn on the bus and its first two data words arrive at IP
|
594 |
|
|
3. However, after that IP 3 gets two consecutive words from IP 2, then
|
595 |
|
|
from IP 1 and so on. Note that in realistic case, the arbitration
|
596 |
|
|
happens less frequently but the example highlights the issue.
|
597 |
|
|
|
598 |
|
|
\begin{figure}
|
599 |
|
|
\begin{center}
|
600 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/fig_addr_interleaving.eps}}
|
601 |
|
|
\caption{The transfers may get interleaved due to arbitration.}
|
602 |
|
|
\label{fig:addr_interleaving}
|
603 |
|
|
\end{center}
|
604 |
|
|
\end{figure}
|
605 |
|
|
|
606 |
|
|
|
607 |
|
|
\begin{figure*}
|
608 |
|
|
\begin{center}
|
609 |
|
|
{\includegraphics[width=0.65\textwidth]{../Fig/Eps/fig_hibi_wrapper.eps}}
|
610 |
|
|
\caption{Structure of HIBI v.2 wrapper and configuration memory}
|
611 |
|
|
\label{fig:wrapper}
|
612 |
|
|
\end{center}
|
613 |
|
|
\end{figure*}
|
614 |
|
|
|
615 |
|
|
|
616 |
|
|
As a conclusion
|
617 |
|
|
\begin{enumerate}
|
618 |
|
|
\item Data is transferred in order through FIFO
|
619 |
|
|
\item If tx is interrupted in bus, wrapper re-sends address and
|
620 |
|
|
continues tx of rest of data to destination
|
621 |
|
|
\item Sender tx FIFO can not be cleared once written
|
622 |
|
|
\item Receiver can identify to which channel data is coming based on
|
623 |
|
|
address
|
624 |
|
|
\end{enumerate}
|
625 |
|
|
|
626 |
|
|
\section{Wrapper structure}
|
627 |
|
|
|
628 |
|
|
HIBI network is constructed using parameterizable building blocks
|
629 |
|
|
called wrappers. The wrappers take care of arbitration, link-level
|
630 |
|
|
transmission, data buffering, and optional clock-domain crossing. All
|
631 |
|
|
signals on both sides of the wrapper are unidirectional. For example,
|
632 |
|
|
there are separate multibit signals data\_in and data\_out. Let us
|
633 |
|
|
first consider the bus side, i.e. the signals between wrappers.
|
634 |
|
|
|
635 |
|
|
The structure of the HIBI v.2 wrapper is depicted in Fig
|
636 |
|
|
\ref{fig:wrapper}. The modular wrapper structure can be tuned to
|
637 |
|
|
better meet the application requirements by using different versions
|
638 |
|
|
of the internal units or leaving out properties that are not needed in
|
639 |
|
|
a particular application.
|
640 |
|
|
|
641 |
|
|
On IP side, there can be separate interfaces for every data priority
|
642 |
|
|
or they can be multiplexed into one interface. Furthermore, the power
|
643 |
|
|
control signals can be routed out of the wrapper if the IP block can
|
644 |
|
|
utilize them.
|
645 |
|
|
|
646 |
|
|
|
647 |
|
|
The main parts are buffers for transferring and receiving data and the
|
648 |
|
|
corresponding controllers. The transfer controller takes care of
|
649 |
|
|
distributed arbitration. The configuration memory stores the
|
650 |
|
|
arbitration parameters. Relative data priority is implemented by
|
651 |
|
|
adding extra FIFOs. A (de)multiplexer is placed between the FIFOs and
|
652 |
|
|
the corresponding controller so that the controller operates only on a
|
653 |
|
|
single FIFO interface. The separate (de)multiplexer allows adding
|
654 |
|
|
FIFOs to support priorities in excess of two without changing the
|
655 |
|
|
control. Currently, transmit multiplexer uses pre-emptive scheduling.
|
656 |
|
|
|
657 |
|
|
|
658 |
|
|
|
659 |
|
|
HIBI v.2 has multiplexed address and data lines whereas HIBI v.1 uses
|
660 |
|
|
separate address and data lines. Multiplexing decreases implementation
|
661 |
|
|
area because signal lines are removed and less buffering capacity is
|
662 |
|
|
needed for the addresses. This causes overhead in control logic but
|
663 |
|
|
that is less than the saving in buffering. Having fewer wires allows
|
664 |
|
|
wider spacing between wires and hence lower coupling capacitance. On
|
665 |
|
|
the other hand, the saved wiring area can be used for wider data
|
666 |
|
|
transfers to increase the available bandwidth. The HIBI protocol does
|
667 |
|
|
not require any specific control signals, but message-passing is
|
668 |
|
|
utilized when needed. HIBI v.1 assumes strictly non-blocking transfers
|
669 |
|
|
and omits handshake signals to minimize transfer latency but one
|
670 |
|
|
handshake signal \textit{Full} was added to HIBI v.2 to avoid FIFO
|
671 |
|
|
overflow at the receiver. As a result, blocking models of computation
|
672 |
|
|
can be used in system design and, in addition, the depths of FIFOs can
|
673 |
|
|
be considerably smaller than in HIBI v.1.
|
674 |
|
|
|
675 |
|
|
\subsection{Bus-side signals}
|
676 |
|
|
|
677 |
|
|
All outputs from wrappers are ``ORed'' together and OR-gates' outputs
|
678 |
|
|
are connected to all wrappers' inputs. This scheme avoids the
|
679 |
|
|
tri-state logic that was used in HIBI v.1.
|
680 |
|
|
Table~\ref{table:bus_signals} lists the bus side signals and
|
681 |
|
|
Fig.~\ref{fig:3_wrappers} illustrates the connection between wrapper
|
682 |
|
|
and OR-gates. The cycle-accurate bus timing is omitted from this user
|
683 |
|
|
guide for brevity. All bus side outputs come directly from register
|
684 |
|
|
except the handshaking signal full.
|
685 |
|
|
|
686 |
|
|
|
687 |
|
|
\begin{table*}
|
688 |
|
|
\caption {The signals at bus side, i.e. between the wrappers, in
|
689 |
|
|
\label{table:bus_signals}
|
690 |
|
|
v.2 and v.3 }
|
691 |
|
|
\begin{center}
|
692 |
|
|
\begin{tabular}{l | l | l | l}
|
693 |
|
|
\hline
|
694 |
|
|
Signal & Width & Dir. & Meaning \\
|
695 |
|
|
\hline \hline
|
696 |
|
|
data & generic & i+o & Data and address are multiplexed into single set of wires \\
|
697 |
|
|
av & 1 & i+o & Address valid. Notifies when address is transmitted \\
|
698 |
|
|
cmd & 3 & i+o & Command: read or write, data or configuration etc. \\
|
699 |
|
|
full & 1 & i+o & Target wrapper is full and cannot accept the data. Current transfer will be repeated later \\
|
700 |
|
|
lock & 1 & i+o & Bus is reserved \\
|
701 |
|
|
\hline
|
702 |
|
|
\end{tabular}
|
703 |
|
|
\end{center}
|
704 |
|
|
\end{table*}
|
705 |
|
|
|
706 |
|
|
|
707 |
|
|
\begin{figure*}
|
708 |
|
|
\begin{center}
|
709 |
|
|
{\includegraphics[width=0.65\textwidth]{../Fig/Eps/fig_hibi_3_wrappers.eps}}
|
710 |
|
|
\caption{Connection between the wrappers and the OR-gates on the bus side}
|
711 |
|
|
\label{fig:3_wrappers}
|
712 |
|
|
\end{center}
|
713 |
|
|
\end{figure*}
|
714 |
|
|
|
715 |
|
|
The number of data bits can be freely chosen. This is beneficial, for
|
716 |
|
|
example, when error correcting or detecting codes are added to data
|
717 |
|
|
and the resulting total data width is not equal to any power of two.
|
718 |
|
|
Active master asserts $Lock$ signal when it reserves the bus.
|
719 |
|
|
Handshaking is done with the $Full$ signal. When $Full$ is asserted,
|
720 |
|
|
the data word on the bus must be retransmitted by the wrapper. To
|
721 |
|
|
improve modularity, all signals are shared by all wrappers within a
|
722 |
|
|
segment and no point-to-point signaling is required. Consequently, the
|
723 |
|
|
interface of a wrapper does not depend on the number of agents and the
|
724 |
|
|
wrapper can be reused more easily. An OR network was selected for bus
|
725 |
|
|
signal resolution.
|
726 |
|
|
|
727 |
|
|
The HIBI implementation pays special attention to minimizing the
|
728 |
|
|
transfer latency by removing empty cycles from the arbitration process
|
729 |
|
|
by pipelining. Empty cycles are here defined as cycles when at least
|
730 |
|
|
one wrapper has data to send but the bus segment is not reserved. An
|
731 |
|
|
optimized protocol allows lower frequency, and hence lower power, for
|
732 |
|
|
certain performance level than inefficient protocol. Empty cycles
|
733 |
|
|
appear also when bus utilization is low as distributed round-robin
|
734 |
|
|
arbitration takes one cycle per agent. If only one agent is
|
735 |
|
|
transmitting, it has to wait a whole round-robin cycle between
|
736 |
|
|
transfers. In such cases, the priority-based arbitration is useful.
|
737 |
|
|
|
738 |
|
|
|
739 |
|
|
\subsection{IP-side signals}
|
740 |
|
|
The signals at IP interface are mostly the same signals as in the bus side. Interface signals are connected to FIFO buffers inside the wrapper and all output signals of the wrapper come from registers.
|
741 |
|
|
|
742 |
|
|
Most signals are driven by both IP and wrapper
|
743 |
|
|
\begin{itemize}
|
744 |
|
|
\item Command
|
745 |
|
|
\item Address / Address valid
|
746 |
|
|
\item Data
|
747 |
|
|
\begin{itemize}
|
748 |
|
|
\item May have high (message) and low (data) priorities (depends on wrapper type)
|
749 |
|
|
\item Priority is defined by transmitting IP-block (source)
|
750 |
|
|
\end{itemize}
|
751 |
|
|
\end{itemize}
|
752 |
|
|
|
753 |
|
|
On the other hand, the FIFO access control signals depend on the
|
754 |
|
|
direction. Both control signals Write enable and Read enable are
|
755 |
|
|
driven by the IP. The status signals are driven by the wrapper. There are
|
756 |
|
|
always at least two status signals FIFO full and FIFO empty. In
|
757 |
|
|
addition, the FIFO buffers developed for HIBI offer two others: One
|
758 |
|
|
data left at FIFO and One place left at FIFO, which may simplify the
|
759 |
|
|
logic of the IP.
|
760 |
|
|
|
761 |
|
|
The address signals at IP side offer a few choices that are described next.
|
762 |
|
|
|
763 |
|
|
\begin{figure*}
|
764 |
|
|
\begin{center}
|
765 |
|
|
{\includegraphics[width=0.6\textwidth]{../Fig/Eps/fig_ip_signals.eps}}
|
766 |
|
|
\caption{The signals between IP and wrapper}
|
767 |
|
|
\label{fig:ip_signals}
|
768 |
|
|
\end{center}
|
769 |
|
|
\end{figure*}
|
770 |
|
|
|
771 |
|
|
Fig.~\ref{fig:ip_signals} depicts the signals between IP and wrapper and
|
772 |
|
|
Table~\ref{table:ip_signals} lists their details.
|
773 |
|
|
|
774 |
|
|
\begin{table*}
|
775 |
|
|
\caption {The signals at wrapper's IP interface}
|
776 |
|
|
\label{table:ip_signals}
|
777 |
|
|
\begin{center}
|
778 |
|
|
\begin{tabular}{l | l | l | l}
|
779 |
|
|
\hline
|
780 |
|
|
Signal & Width & Dir. & Meaning \\
|
781 |
|
|
\hline \hline
|
782 |
|
|
rst\_n & 1 & i & Active low reset \\
|
783 |
|
|
clk & 1 & i & Clock, active on rising edge. Same for all wrappers inside one segment \\
|
784 |
|
|
data & generic & i+o & Data and address are multiplexed into single set of wires \\
|
785 |
|
|
av & 1 & i+o & Address valid. Notifies when address is transmitted \\
|
786 |
|
|
cmd & 3 & i+o & Command: read or write, data or configuration etc. \\
|
787 |
|
|
re & 1 & i & Read enable. Wrapper can remove the first data from FIFO \\
|
788 |
|
|
we & 1 & i & Write enable. Adds the data from IP to TX FIFO \\
|
789 |
|
|
full & 1 & o & TX FIFO is full \\
|
790 |
|
|
empty & 1 & o & RX FIFO is empty \\
|
791 |
|
|
one\_p & 1 & o & TX FIFO has one place left, i.e. almost full \\
|
792 |
|
|
one\_d & 1 & o & RX FIFO has one data left, i.e. almost empty \\
|
793 |
|
|
\hline
|
794 |
|
|
\end{tabular}
|
795 |
|
|
\end{center}
|
796 |
|
|
\end{table*}
|
797 |
|
|
|
798 |
|
|
|
799 |
|
|
\subsection{Variants of IP interface}
|
800 |
|
|
There are 4 variants of the IP interface depending on how to handle
|
801 |
|
|
|
802 |
|
|
a) high/low priority data: one or two interfaces
|
803 |
|
|
|
804 |
|
|
b) address and data: separate interfaces or one multiplexed
|
805 |
|
|
|
806 |
|
|
The different wrappers are denoted with postfix $\_r<x>$
|
807 |
|
|
|
808 |
|
|
r1: a) 2 interfaces hi+lo; b) muxed a/d
|
809 |
|
|
|
810 |
|
|
r2: a) 1 interface hi/lo; b) separate a+d
|
811 |
|
|
|
812 |
|
|
r3: a) 2 interfaces hi+lo; b) separate a+d
|
813 |
|
|
|
814 |
|
|
r4: a) 1 interface hi/lo; b) muxed a/d
|
815 |
|
|
|
816 |
|
|
Since these options affect only the IP side, different wrapper types
|
817 |
|
|
can co-exist in the same system, and the wrappers' bus side interface
|
818 |
|
|
is always the same. Furthermore, the addresses work directly between
|
819 |
|
|
wrapper types. However, hi-priority data cannot bypass lo-prior data
|
820 |
|
|
in wrapper types r2 and r4. However, all data is always transmitted
|
821 |
|
|
|
822 |
|
|
For example, Nios subsystems utilize commonly r4 but SDRAM utilizes
|
823 |
|
|
r3. This is because SDRAM ctrl distinguishes DMA configuration and
|
824 |
|
|
memory data traffic with priority of incoming data. It also prevents
|
825 |
|
|
dead-lock. Fig.~\ref{fig:ip_interface_variants} depicts variants of
|
826 |
|
|
wrapper's IP side signals. Interface type r1 is the ``native''
|
827 |
|
|
interface that is used inside all other variants.
|
828 |
|
|
|
829 |
|
|
\begin{figure*}
|
830 |
|
|
\begin{center}
|
831 |
|
|
{\includegraphics[width=0.8\textwidth]{../Fig/Eps/fig_ip_interface_variants.eps}}
|
832 |
|
|
\caption{There are 4 variants of IP interface. There are two
|
833 |
|
|
selectable features, namely separations of hi/lo-prior data and
|
834 |
|
|
separate/multiplexed addressing.}
|
835 |
|
|
\label{fig:ip_interface_variants}
|
836 |
|
|
\end{center}
|
837 |
|
|
\end{figure*}
|
838 |
|
|
|
839 |
|
|
\subsection{Signal naming in VHDL}
|
840 |
|
|
The side and direction are marked into signal name in HIBI wrapper VHDL, for example
|
841 |
|
|
\begin{enumerate}
|
842 |
|
|
\item agent\_data\_in, agent\_data\_out,
|
843 |
|
|
\item bus\_data\_in, bus\_data\_out
|
844 |
|
|
\end{enumerate}
|
845 |
|
|
Fig.~\ref{fig:sgn_naming} clarifies the naming scheme.
|
846 |
|
|
|
847 |
|
|
\begin{figure*}
|
848 |
|
|
\begin{center}
|
849 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/fig_sgn_naming.eps}}
|
850 |
|
|
\caption{The naming convention of ports}
|
851 |
|
|
\label{fig:sgn_naming}
|
852 |
|
|
\end{center}
|
853 |
|
|
\end{figure*}
|
854 |
|
|
|
855 |
|
|
\subsection{Cycle-accurate timing}
|
856 |
|
|
|
857 |
|
|
For brevity, only the IP side timing is explained. It is actually very simple.
|
858 |
|
|
The timing when transmitting is depicted in Fig.~\ref{subfig:tx_timing}:
|
859 |
|
|
1) IP checks that tx FIFO is not full
|
860 |
|
|
2) IP sets data, command, addr/av, and write\_enable=1 for one clk cycle
|
861 |
|
|
|
862 |
|
|
\begin{figure*}
|
863 |
|
|
\begin{center}
|
864 |
|
|
\subfigure[IP sends.]{\includegraphics[width=0.95\textwidth]{../Fig/Eps/fig_tx_timing.eps}
|
865 |
|
|
\label{subfig:tx_timing}}
|
866 |
|
|
\subfigure[IP receives data]{\includegraphics[width=0.95\textwidth]{../Fig/Eps/fig_rx_timing.eps}
|
867 |
|
|
\label{subfig:rx_timing}}
|
868 |
|
|
\caption{Examples of timing at IP interface.}
|
869 |
|
|
\label{fig:interface_timing}
|
870 |
|
|
\end{center}
|
871 |
|
|
\end{figure*}
|
872 |
|
|
|
873 |
|
|
The timing when receiving is depicted in Fig.~\ref{subfig:rx_timing}:
|
874 |
|
|
1) IP checks that rx FIFO is not empty
|
875 |
|
|
2) IP captures data, command, and addr/av
|
876 |
|
|
3) IP sets read\_enable=1 for one clk cycle
|
877 |
|
|
|
878 |
|
|
|
879 |
|
|
|
880 |
|
|
Notes on signal timing
|
881 |
|
|
\begin{enumerate}
|
882 |
|
|
\item Very easy to write/read on every other cycle
|
883 |
|
|
\item Almost as easy to write/read on every cycle. Needs a bit more
|
884 |
|
|
care with checking empty and full
|
885 |
|
|
\item IP may keep we=1 and re=1 continuously and just change/store
|
886 |
|
|
data according to full/empty
|
887 |
|
|
\item Signal FIFO full comes from register. It goes high on the next
|
888 |
|
|
cycle after the write, if at all. In the Tx example, writing value
|
889 |
|
|
0xacdc filled the FIFO
|
890 |
|
|
\item Setting we=1 when FIFO is full has no effect
|
891 |
|
|
\item Setting re=1 when FIFO is empty has no effect
|
892 |
|
|
\item Received data, addr/av and command appear to interface, if FIFO
|
893 |
|
|
was empty before. IP can use them directly. They are ``removed'' only
|
894 |
|
|
when read enable is activated. Checking empty==0 ensures validity
|
895 |
|
|
\item Data and command values are undefined when FIFO is empty. Most
|
896 |
|
|
likely the old values remain
|
897 |
|
|
\end{enumerate}
|
898 |
|
|
|
899 |
|
|
A simple example of VHDL code can be found in SVN
|
900 |
|
|
/release\_1/lib/hw\_lib/ips/computation/image\_xor/tb/tb\_image\_xor\_linemaker.vhd
|
901 |
|
|
It shows how to send address and data.
|
902 |
|
|
|
903 |
|
|
Fig.~\ref{fig:ip_fsm} shows the simple example FSM of the IP.
|
904 |
|
|
\begin{figure*}
|
905 |
|
|
\begin{center}
|
906 |
|
|
{\includegraphics[width=0.9\textwidth]{../Fig/Eps/fig_ip_fsm.eps}}
|
907 |
|
|
\caption{Example FSM of an IP}
|
908 |
|
|
\label{fig:ip_fsm}
|
909 |
|
|
\end{center}
|
910 |
|
|
\end{figure*}
|
911 |
|
|
|
912 |
|
|
Sometimes the output registers of the IP may cause unexpected behavior
|
913 |
|
|
for novices. Even if FIFO appears ``not full'', IP cannot necessarily
|
914 |
|
|
write new data. That happens if it was already writing and there was
|
915 |
|
|
only one place left at the FIFO. Hence, remember to check if IP is
|
916 |
|
|
already writing!
|
917 |
|
|
|
918 |
|
|
The following code snippet should clarify correct writing
|
919 |
|
|
\begin{lstlisting}[language=vhdl, style=console, basicstyle=\footnotesize,
|
920 |
|
|
title={Example code of IP's sending control}]
|
921 |
|
|
if (we_r = '1' and one_p_in = '1') or full_in = '1' then
|
922 |
|
|
we_r <= '0'; -- FIFO is becoming or already full
|
923 |
|
|
else
|
924 |
|
|
we_r <= '1'; -- There is room in FIFO
|
925 |
|
|
data_r <= new_value;
|
926 |
|
|
end if;
|
927 |
|
|
\end{lstlisting}
|
928 |
|
|
|
929 |
|
|
|
930 |
|
|
|
931 |
|
|
|
932 |
|
|
HIBI wrapper shows the data as soon as it comes from the bus. Same
|
933 |
|
|
data might get used (counted) twice, if IP only checks the empty
|
934 |
|
|
signal. Remember to check if IP is already reading! The following
|
935 |
|
|
code snippet should clarify correct reading
|
936 |
|
|
|
937 |
|
|
\begin{lstlisting}[language=vhdl, style=console, basicstyle=\footnotesize,
|
938 |
|
|
title={Example code of IP's reception handling}]
|
939 |
|
|
if (re_r = '1' and one_d_in = '1') or empty_in = '1' then
|
940 |
|
|
re_r <= '0'; -- Stop reading
|
941 |
|
|
else
|
942 |
|
|
re_r <= '1'; -- Start or continue reading
|
943 |
|
|
end if;
|
944 |
|
|
|
945 |
|
|
if re_r = '1' then
|
946 |
|
|
if hibi_av_in = '1' then
|
947 |
|
|
-- handle the incoming address
|
948 |
|
|
else
|
949 |
|
|
-- handle the incoming data
|
950 |
|
|
end if;
|
951 |
|
|
end if;
|
952 |
|
|
\end{lstlisting}
|
953 |
|
|
|
954 |
|
|
Common pitfalls
|
955 |
|
|
\begin{itemize}
|
956 |
|
|
\item Not noticing that tx FIFO fills while writing. Consequence: Some
|
957 |
|
|
data are lost (not written to FIFO)
|
958 |
|
|
\item Write enable remains 1 for one cycle too long. Undefined data
|
959 |
|
|
written to FIFO, or the same data is written twice. In both of
|
960 |
|
|
above, the likely cause is not accounting for the output register of the
|
961 |
|
|
IP
|
962 |
|
|
\item Not noticing that rx FIFO goes empty while reading. Data
|
963 |
|
|
consumed by IP is undefined
|
964 |
|
|
\item Read enable remains 1 for one cycle too long. Next data is
|
965 |
|
|
accidentally read away from the FIFO unless FIFO was empty
|
966 |
|
|
\item Not noticing that rx data changes only after the clock edge when
|
967 |
|
|
re=1. IP uses the same data twice
|
968 |
|
|
\end{itemize}
|
969 |
|
|
|
970 |
|
|
|
971 |
|
|
|
972 |
|
|
|
973 |
|
|
|
974 |
|
|
\section{Arbitration}
|
975 |
|
|
A distinct feature in HIBI is that arbitration is distributed to
|
976 |
|
|
wrappers, meaning that they can decide the correct time to access the
|
977 |
|
|
bus by themselves. Therefore, no central arbiter is required. In
|
978 |
|
|
practice, the bus is ``offered'' to one wrapper on each cycle. The wrapper
|
979 |
|
|
reserves the bus using signal lock if it has data to send.
|
980 |
|
|
|
981 |
|
|
Multiple policies are supported
|
982 |
|
|
\begin{enumerate}
|
983 |
|
|
\item Fixed priority, Round-robin
|
984 |
|
|
\item Dynamically adaptive arbitration (DAA)
|
985 |
|
|
\item Time-division multiple access (TDMA)
|
986 |
|
|
\item Random
|
987 |
|
|
\item Combination of above
|
988 |
|
|
\end{enumerate}
|
989 |
|
|
|
990 |
|
|
A scheme called Dynamically Adaptive Arbitration (DAA) was presented
|
991 |
|
|
in \cite{kulmala08b}. In most cases, designers should use round-robin
|
992 |
|
|
or DAA. If there is minor performance bottleneck, one can easily
|
993 |
|
|
configure the arbitration parameters.
|
994 |
|
|
|
995 |
|
|
\begin{figure*}
|
996 |
|
|
\begin{center}
|
997 |
|
|
{\includegraphics[width=0.8\textwidth]{../Fig/Eps/fig_arb_example.eps}}
|
998 |
|
|
\caption{Example timing in 3 arbitration policies.}
|
999 |
|
|
\label{fig:arb_example}
|
1000 |
|
|
\end{center}
|
1001 |
|
|
\end{figure*}
|
1002 |
|
|
|
1003 |
|
|
|
1004 |
|
|
Fig.~\ref{fig:arb_example} shows an example of different policies. A
|
1005 |
|
|
two-level arbitration scheme, a combination of time division multiple
|
1006 |
|
|
access (TDMA) and competition, is used in HIBI. In TDMA, time is
|
1007 |
|
|
divided into repeating time frames. Inside frames, agents are provided
|
1008 |
|
|
time slots when they are guaranteed an access to the communication
|
1009 |
|
|
channel. This way the throughput of each wrapper can be guaranteed.
|
1010 |
|
|
The worst-case response time for a bus access through TDMA is the
|
1011 |
|
|
interval of the adjacent time slots. TDMA in HIBI supports two flavors
|
1012 |
|
|
for handling the slots when there is no data to send: keeping them or
|
1013 |
|
|
releasing the bus for competition.
|
1014 |
|
|
\begin{figure*}
|
1015 |
|
|
\begin{center}
|
1016 |
|
|
\subfigure[Low contention (send probability $\sim$4\% per agent).]{\includegraphics[width=0.85\textwidth]{../Fig/Eps/fig_arb_recfg_lowcontention_v2.eps}
|
1017 |
|
|
\label{subfig:wave_arb_lowcont}}
|
1018 |
|
|
\subfigure[High contention (send probability $\sim$30\% per agent).]{\includegraphics[width=0.85\textwidth]{../Fig/Eps/fig_arb_recfg_highcontention_v2.eps}
|
1019 |
|
|
\label{subfig:wave_arb_highcont}}
|
1020 |
|
|
\caption{Various arbitration schemes for 8-agent single bus and
|
1021 |
|
|
uniform random traffic. The differences become evident on highly
|
1022 |
|
|
utilized bus.}
|
1023 |
|
|
\label{fig:wave_arb}
|
1024 |
|
|
\end{center}
|
1025 |
|
|
\end{figure*}
|
1026 |
|
|
|
1027 |
|
|
|
1028 |
|
|
Competition is based either on round-robin or non-pre-emptive priority
|
1029 |
|
|
arbitration. The second level mechanism is used to arbitrate the
|
1030 |
|
|
unassigned or unused time slots. If the agent does not have anything
|
1031 |
|
|
to send in the beginning of its time slot, the time slot can be given
|
1032 |
|
|
away to allow maximal bus utilization. Priority arbitration as a
|
1033 |
|
|
second level method attempts to guarantee a small latency for high
|
1034 |
|
|
priority agents whereas round-robin provides a fair arbitration
|
1035 |
|
|
scheme. When the bus is freed and priority scheme is utilized, the
|
1036 |
|
|
agent with the highest priority can reserve the bus on the first
|
1037 |
|
|
cycle. If the bus has been idle for two cycles, the agent with the
|
1038 |
|
|
second highest priority may reserve it and so on. The maximum
|
1039 |
|
|
transfer length is restricted with runtime configurable parameter
|
1040 |
|
|
$max\_send$. For round-robin, the maximum wait time for accessing the
|
1041 |
|
|
bus is obtained by summing all $max\_send$ values. For priority-based
|
1042 |
|
|
arbitration, the maximum wait time can be defined only for the two
|
1043 |
|
|
highest priorities. This means that the low-priority agents may
|
1044 |
|
|
suffer starvation and system may end up in deadlock. Therefore, using
|
1045 |
|
|
only priority arbitration is not recommended.
|
1046 |
|
|
|
1047 |
|
|
|
1048 |
|
|
\subsection{Detailed timing example}
|
1049 |
|
|
|
1050 |
|
|
Fig.~\ref{fig:wave_arb} shows the differences in various arbitration
|
1051 |
|
|
policies and two traffic loads (low and high contention). HIBI is
|
1052 |
|
|
configured as single bus with 8 agents. Agent 0 performs dynamic
|
1053 |
|
|
reconfiguration (time instants $i-v$) and other agents generate
|
1054 |
|
|
uniformly distributed random traffic. The reconfiguration changes the
|
1055 |
|
|
arbitration policy at runtime. The exact configuration procedure is
|
1056 |
|
|
explained in more detail later. %in Section\ref{ch:hibi:reconf}
|
1057 |
|
|
The utilized arbitration policies are
|
1058 |
|
|
\begin{enumerate}[i)]
|
1059 |
|
|
\item round-robin
|
1060 |
|
|
\item combination of priority and round-robin
|
1061 |
|
|
\item priority
|
1062 |
|
|
\item random
|
1063 |
|
|
\item round-robin (again).
|
1064 |
|
|
\end{enumerate}
|
1065 |
|
|
Round-robin offers fair arbitration (each agent has its share) whereas
|
1066 |
|
|
priority favors the highest priority agents and leads to starvation of
|
1067 |
|
|
others. Their combination switches between them at user-defined
|
1068 |
|
|
intervals. Arbitration policy does not play a major role when bus is
|
1069 |
|
|
lightly loaded, as illustrated in Fig.~\ref{subfig:wave_arb_lowcont}.
|
1070 |
|
|
The differences are clear with higher load,
|
1071 |
|
|
Fig.~\ref{subfig:wave_arb_highcont}.
|
1072 |
|
|
|
1073 |
|
|
\subsection{Performance implications}
|
1074 |
|
|
|
1075 |
|
|
\begin{figure*}
|
1076 |
|
|
\begin{center}
|
1077 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/gra_hibi_arb_rel_perf.eps}}
|
1078 |
|
|
\caption{Relative performance of arbitration algorithms in MPEG-4
|
1079 |
|
|
encoding \cite{kulmala08b}}
|
1080 |
|
|
\label{fig:hibi_arb_rel_perf}
|
1081 |
|
|
\end{center}
|
1082 |
|
|
\end{figure*}
|
1083 |
|
|
|
1084 |
|
|
% !!! ks. myös $http://ieeexplore.ieee.org/iel5/10626/33561/01594751.pdf$
|
1085 |
|
|
|
1086 |
|
|
Various arbitration methods of HIBI were compared in
|
1087 |
|
|
\cite{kulmala08b}. The test case was MPEG-4 encoding on MPSoC. HIBI
|
1088 |
|
|
has $6$ arbitrated components: $4$ CPUs, SDRAM, and performance
|
1089 |
|
|
monitor; all operating at 50~MHz frequency. The maximum transfer
|
1090 |
|
|
length was varied from 5 words (denoted as $tx=5$) to non-limited.
|
1091 |
|
|
Transfer length has major impact but all lengths of 50 words or over
|
1092 |
|
|
($tx>49$) resulted in equal performance. The bus frequency was set to
|
1093 |
|
|
$1, 2, 5$, or 50~MHz in order to achieve varying bus utilization
|
1094 |
|
|
($75\%, 56\%, 26\%$, and $3\%$, respectively) with single application.
|
1095 |
|
|
The best and worst algorithms vary case by case but DAA performed well
|
1096 |
|
|
in general.
|
1097 |
|
|
|
1098 |
|
|
Fig.~\ref{fig:hibi_arb_rel_perf} plots the relative encoding
|
1099 |
|
|
performance between the worst and best algorithms. The curves denote
|
1100 |
|
|
different transfer lengths, and $1.0$ is the best algorithm for each
|
1101 |
|
|
case. Tx lengths over $49$ are joined for clarity because they yield
|
1102 |
|
|
practically the same results. With short transfers, the worst
|
1103 |
|
|
algorithm at 1~MHz HIBI ($75\%$ utilization) offers only $0.62\times$ the
|
1104 |
|
|
performance of the best, at 2~MHz $0.73\times$, at 5~MHz $0.98\times$, and at
|
1105 |
|
|
50~MHz there are no differences.
|
1106 |
|
|
|
1107 |
|
|
\section{Commands}
|
1108 |
|
|
|
1109 |
|
|
|
1110 |
|
|
Source IP sets the command and most commands are forwarded to the receiving IP.
|
1111 |
|
|
The most common commands are:
|
1112 |
|
|
\begin{itemize}
|
1113 |
|
|
\item Write data - regular send operation, so called posted write
|
1114 |
|
|
\item Read request - split-transaction, the requested data is returned
|
1115 |
|
|
later with regular write command
|
1116 |
|
|
\end{itemize}
|
1117 |
|
|
The other, less common commands are
|
1118 |
|
|
\begin{itemize}
|
1119 |
|
|
\item Idle - IPs never use this command, but this appears on the bus
|
1120 |
|
|
when no-one sends anything
|
1121 |
|
|
\item High priority - bypasses normal data in the wrappers, otherwise
|
1122 |
|
|
just like regular operation, can be added to many commands
|
1123 |
|
|
\item Write and read config - access the configuration memories inside
|
1124 |
|
|
the wrappers. Not forwarded to the IP at the receiving end
|
1125 |
|
|
\item Multicast - send the same data to multiple targets (only in HIBI
|
1126 |
|
|
v.2)
|
1127 |
|
|
\item Non-posted write - Receiver IP must provide some response (ACK
|
1128 |
|
|
or NACK) (v.3 only)
|
1129 |
|
|
\item Linked read + conditional write - to perform
|
1130 |
|
|
read-modify-write (v.3 only)
|
1131 |
|
|
\item Exclusive access - reserve the whole path to the destination,
|
1132 |
|
|
read, write, and remove the lock (v.3 only)
|
1133 |
|
|
\end{itemize}
|
1134 |
|
|
|
1135 |
|
|
HIBI v.3 has 5 command bits and v.2 had only 3 bits, see
|
1136 |
|
|
Tables~\ref{table:hibi_v3_cmd} and~\ref{table:hibi_v2_cmd}.
|
1137 |
|
|
|
1138 |
|
|
\begin{table*}
|
1139 |
|
|
\caption {The command codes in HIBI v.3}
|
1140 |
|
|
\label{table:hibi_v3_cmd}
|
1141 |
|
|
\begin{center}
|
1142 |
|
|
\begin{tabular}{l | l | r |l}
|
1143 |
|
|
\hline
|
1144 |
|
|
Cmd & Code & Code & Meaning \\
|
1145 |
|
|
& [4:0] & [decimal]& \\
|
1146 |
|
|
\hline \hline
|
1147 |
|
|
idle & 0 0000 & 0 & Appears on the bus when it is free \\
|
1148 |
|
|
<reserved> & 0 0001 & 1 & not used, most unused codes hidden from the table \\
|
1149 |
|
|
wr data & 0 0010 & 2 & Regular write \\
|
1150 |
|
|
wr data hi-prior & 0 0011 & 3 & - `` - w/ high priority \\
|
1151 |
|
|
\hline
|
1152 |
|
|
rd data & 0 0100 & 4 & Request of the split-transaction \\
|
1153 |
|
|
rd data hi-prior & 0 0101 & 5 & - `` - w/ high priority \\
|
1154 |
|
|
rd data linked & 0 0110 & 6 & \\
|
1155 |
|
|
rd d. linked hi-p& 0 0111 & 7 & - `` - w/ high priority \\
|
1156 |
|
|
\hline
|
1157 |
|
|
|
1158 |
|
|
wr data non-post & 0 1000 & 8 & Write that expects response\\
|
1159 |
|
|
wr d. non-post hi-p& 0 1001 & 9 & - `` - w/ high priority \\
|
1160 |
|
|
wr conditional & 0 1010 & 10 & Write that follows rd linked \\
|
1161 |
|
|
wr cond. hi-p & 0 1011 & 11 & - `` - w/ high priority \\
|
1162 |
|
|
\hline
|
1163 |
|
|
|
1164 |
|
|
% <reserved> & 0 1100 & 12 & not used \\
|
1165 |
|
|
excl. lock & 0 1101 & 13 & Locks the path to the destination \\
|
1166 |
|
|
% <reserved> & 0 1110 & 14 & not used \\
|
1167 |
|
|
excl. wr & 0 1111 & 15 & Exclusive write, must follow excl.lock \\
|
1168 |
|
|
\hline
|
1169 |
|
|
% <reserved> & 1 0000 & 16 & not used \\
|
1170 |
|
|
excl. rd & 1 0001 & 17 & Exclusive read request, must follow excl.lock \\
|
1171 |
|
|
% <reserved> & 1 0010 & 18 & not used \\
|
1172 |
|
|
excl. release & 1 0011 & 19 & Removes the lock from the path\\
|
1173 |
|
|
\hline
|
1174 |
|
|
% <reserved> & 1 0100 & 20 & not used \\
|
1175 |
|
|
wr config & 1 0101 & 21 & \\
|
1176 |
|
|
% <reserved> & 1 0110 & 22 & not used \\
|
1177 |
|
|
rd config & 1 0111 & 23 & \\
|
1178 |
|
|
\hline
|
1179 |
|
|
<reserved> & 1 1xxx & 24-31 & not used \\
|
1180 |
|
|
\hline
|
1181 |
|
|
|
1182 |
|
|
|
1183 |
|
|
\hline
|
1184 |
|
|
\end{tabular}
|
1185 |
|
|
\end{center}
|
1186 |
|
|
\end{table*}
|
1187 |
|
|
|
1188 |
|
|
|
1189 |
|
|
\begin{table*}
|
1190 |
|
|
\caption {The command codes in HIBI v.2}
|
1191 |
|
|
\label{table:hibi_v2_cmd}
|
1192 |
|
|
\begin{center}
|
1193 |
|
|
\begin{tabular}{l | l | l}
|
1194 |
|
|
\hline
|
1195 |
|
|
Cmd & Code [2:0] & Meaning \\
|
1196 |
|
|
\hline \hline
|
1197 |
|
|
idle & 000 & Appears on the bus when it is free \\
|
1198 |
|
|
wr config data & 001 & Updates config mem inside the wrapper \\
|
1199 |
|
|
wr data & 010 & Regular write \\
|
1200 |
|
|
wr data hi-prior & 011 & High-priority data bypasses the regular one \\
|
1201 |
|
|
\hline
|
1202 |
|
|
rd data & 100 & Request of the split-transaction \\
|
1203 |
|
|
rd config data & 101 & Requests a value from wrapper's config mem \\
|
1204 |
|
|
multicast data & 110 & Sends to all wrappers whose uppermost addr bits match \\
|
1205 |
|
|
multicast config & 111 & Same as above for high-priority data\\
|
1206 |
|
|
\hline
|
1207 |
|
|
\end{tabular}
|
1208 |
|
|
\end{center}
|
1209 |
|
|
\end{table*}
|
1210 |
|
|
|
1211 |
|
|
|
1212 |
|
|
\section {Buffering and signaling}
|
1213 |
|
|
The model of computation used in HIBI design approach assumes bounded
|
1214 |
|
|
first-in-first-out (FIFO) buffers between processes. A simple FIFO
|
1215 |
|
|
interface can be adapted to other interfaces such as the OCP
|
1216 |
|
|
(Open Core Protocol)\cite{ocp03}.
|
1217 |
|
|
% The basic principle of OCP is shown in
|
1218 |
|
|
% Fig \ref{fig:hibi_ocp}.
|
1219 |
|
|
% Transfers are initiated by $masters$ and $slaves$
|
1220 |
|
|
% only respond to requests. The OCP transfers are translated to underlying
|
1221 |
|
|
% network protocol, in this case HIBI, and back by OCP wrappers.
|
1222 |
|
|
% \begin{figure} [t]
|
1223 |
|
|
% \begin{center}
|
1224 |
|
|
% \includegraphics[width=0.7\textwidth]{../Fig/Eps/fig_hibi_ocp.eps}
|
1225 |
|
|
% \caption{Using OCP with HIBI. The OCP interface is located between IP and HIBI wrapper}
|
1226 |
|
|
% \label{fig:hibi_ocp}
|
1227 |
|
|
% \end{center}
|
1228 |
|
|
% \end{figure}
|
1229 |
|
|
Consequently, IP components use only OCP protocol and are isolated
|
1230 |
|
|
from the actual network implementation. Ideally, network can be
|
1231 |
|
|
chosen freely without affecting the IPs. However, not all features of
|
1232 |
|
|
HIBI, such as relative data priorities or dynamic reconfiguration, can
|
1233 |
|
|
be used with OCP directly but only the basic transfers.
|
1234 |
|
|
|
1235 |
|
|
To avoid excess buffering or retransfers, the received data must be
|
1236 |
|
|
read from the FIFO as soon as possible, for example by using a direct
|
1237 |
|
|
memory access controller. As a result, the receiver buffer space is
|
1238 |
|
|
not dictated by the \emph{amount} of transferred data, but the
|
1239 |
|
|
\emph{latency} of reading data from the wrapper. This scheme resembles
|
1240 |
|
|
wormhole routing, but the links are not reserved if the receiver is
|
1241 |
|
|
stalled.
|
1242 |
|
|
|
1243 |
|
|
\section {Configuration}
|
1244 |
|
|
|
1245 |
|
|
HIBI is both modular and configurable. At design time, structural and
|
1246 |
|
|
functional settings are made, whereas at run-time, one can modify data
|
1247 |
|
|
transfer properties (arbitration types, wrapper specific QoS
|
1248 |
|
|
settings).
|
1249 |
|
|
|
1250 |
|
|
Fig.~\ref{fig:cfg_mem} shows the structure of the configuration
|
1251 |
|
|
memory.
|
1252 |
|
|
\begin{figure}
|
1253 |
|
|
\begin{center}
|
1254 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/fig_cfg_mem.eps}}
|
1255 |
|
|
\caption{Structure of the wrapper's configuration memory}
|
1256 |
|
|
\label{fig:cfg_mem}
|
1257 |
|
|
\end{center}
|
1258 |
|
|
\end{figure}
|
1259 |
|
|
|
1260 |
|
|
\subsection{Generic parameters in VHDL}
|
1261 |
|
|
HIBI has a large set of generic parameters. They are categorized as
|
1262 |
|
|
follows
|
1263 |
|
|
\begin{enumerate}
|
1264 |
|
|
\item Structural
|
1265 |
|
|
\begin{itemize}
|
1266 |
|
|
\item Widths of interface ports: data, command, debug port
|
1267 |
|
|
\item Widths of internal signals: address, wrapper identifier field,
|
1268 |
|
|
counters
|
1269 |
|
|
\item Sizes of tx and rx FIFOs, both lo and hi priorities
|
1270 |
|
|
\item Use 0, 2, 3 etc.
|
1271 |
|
|
\item Run-time configuration: number of cfg pages, num of
|
1272 |
|
|
app-specific extra registers
|
1273 |
|
|
\end{itemize}
|
1274 |
|
|
\item Synchronization
|
1275 |
|
|
\begin{itemize}
|
1276 |
|
|
\item Type of the synchronizing FIFO buffers
|
1277 |
|
|
\item Relative frequencies of IP and bus
|
1278 |
|
|
\end{itemize}
|
1279 |
|
|
\item Functional
|
1280 |
|
|
\begin{itemize}
|
1281 |
|
|
\item Identifier, own address
|
1282 |
|
|
\item For bridges: base identifier, inverted address space
|
1283 |
|
|
\item Arbitration: type, priority, how many words to send at one turn,
|
1284 |
|
|
number of agents in the same segment
|
1285 |
|
|
\item For TDMA: number of time slots, how to handle unused slots
|
1286 |
|
|
(keep/give away)
|
1287 |
|
|
\item Enable/disable multicast functionality
|
1288 |
|
|
\item Enable/disable runtime configuration functionality (affects
|
1289 |
|
|
structure=area as well)
|
1290 |
|
|
\end{itemize}
|
1291 |
|
|
\end{enumerate}
|
1292 |
|
|
|
1293 |
|
|
Table~\ref{tab:generics} lists all the generics. Certain parameters
|
1294 |
|
|
are system-wide settings, for example the width of the command. Some
|
1295 |
|
|
are segment-wide, for example bus clock, data width, and number of
|
1296 |
|
|
wrappers in that segment. The rest are instance-specific, for example
|
1297 |
|
|
buffer sizes and priorities.
|
1298 |
|
|
\begin{table*}
|
1299 |
|
|
\caption{Properties of HIBI v.1 and v.2.}
|
1300 |
|
|
\label{tab:generics}
|
1301 |
|
|
\begin{center}
|
1302 |
|
|
\includegraphics[width=0.95\textwidth]{../Fig/Eps/tab_generics.eps}
|
1303 |
|
|
\end{center}
|
1304 |
|
|
\end{table*}
|
1305 |
|
|
|
1306 |
|
|
|
1307 |
|
|
\subsection{Clocking}
|
1308 |
|
|
HIBI can support many clock domains. The border is either between IP
|
1309 |
|
|
and wrapper, or in the middle of a bridge. There are five options:
|
1310 |
|
|
\begin{enumerate}
|
1311 |
|
|
\item Fully synchronous
|
1312 |
|
|
\item Synchronous multi-clk: Clock frequencies are integer-multiples
|
1313 |
|
|
of each other. Clocks are in the same phase. Easy to use with FPGA's
|
1314 |
|
|
PLLs
|
1315 |
|
|
\item GALS: No assumptions about relations (phase, speed) between
|
1316 |
|
|
clocks. Has longer synch. latency than synch.multiclock.
|
1317 |
|
|
\item Gray FIFO: FIFO depth limited to power of two ($=2^n$)
|
1318 |
|
|
\item Mixed clock pausible
|
1319 |
|
|
\end{enumerate}
|
1320 |
|
|
|
1321 |
|
|
The method must be decided at synthesis time.
|
1322 |
|
|
|
1323 |
|
|
\subsection {Runtime reconfiguration}
|
1324 |
|
|
\label{ch:hibi:reconf}
|
1325 |
|
|
Wrapper has config memory that stores all information for
|
1326 |
|
|
distributed arbitration. It can be synthesized in many ways:
|
1327 |
|
|
\begin{itemize}
|
1328 |
|
|
\item Permanent: ROM, 1 page
|
1329 |
|
|
\item Partial run-time configurable: ROM with several pages
|
1330 |
|
|
\item Full run-time configurable: RAM, with pages
|
1331 |
|
|
\item Kactus supports currently 1-page ROM
|
1332 |
|
|
\end{itemize}
|
1333 |
|
|
|
1334 |
|
|
|
1335 |
|
|
|
1336 |
|
|
HIBI allows the runtime configuration of all arbitration parameters to
|
1337 |
|
|
maximize performance. This is achieved so that one of the agents (e.g.
|
1338 |
|
|
system controller CPU) writes the new configuration values to all
|
1339 |
|
|
wrappers. The configuration values are sent through the regular data
|
1340 |
|
|
lines. During the normal operation, i.e. when the configuration is
|
1341 |
|
|
not changed, the controller CPU can perform its computation tasks. In
|
1342 |
|
|
the best case, other PEs can continue their transfers even if HIBI is
|
1343 |
|
|
being configured. However, some operations, such as swapping
|
1344 |
|
|
priorities of two wrappers, necessitate disabling other transfers
|
1345 |
|
|
momentarily.
|
1346 |
|
|
|
1347 |
|
|
|
1348 |
|
|
The structure of the configuration memory is illustrated at the bottom
|
1349 |
|
|
of Fig.~\ref{fig:wrapper}. It includes multiple configuration pages
|
1350 |
|
|
for storing the parameter values, a register storing the number of
|
1351 |
|
|
currently active page, clock cycle counter, and logic that checks the
|
1352 |
|
|
start and end times of the time slots. The receive controller
|
1353 |
|
|
takes care of writing new configuration values whereas the
|
1354 |
|
|
configuration values and time slot signals are fed to the transfer
|
1355 |
|
|
controller. Configuration values can be written to non-active pages
|
1356 |
|
|
before they are used to minimize the risk of conflict when the
|
1357 |
|
|
configuration is performed.
|
1358 |
|
|
|
1359 |
|
|
|
1360 |
|
|
|
1361 |
|
|
|
1362 |
|
|
For very regular traffic, the TDMA slots can be set to minimize the
|
1363 |
|
|
latency, i.e. slot starts shortly after the availability of data. For
|
1364 |
|
|
TDMA, each wrapper has an internal cycle counter to decide correct
|
1365 |
|
|
times to access the bus. For this reason, wrappers in one bus segment
|
1366 |
|
|
must be synchronized. When data is produced with varying time
|
1367 |
|
|
intervals or quantities, the time slots cannot be optimally located.
|
1368 |
|
|
By runtime reconfiguration, the cycle counters can be reset to an
|
1369 |
|
|
arbitrary clock cycle value within the time frame to keep time slots
|
1370 |
|
|
in the correct place with respect to data availability. Also the
|
1371 |
|
|
length and owner of the slots can be changed. The resynchronization
|
1372 |
|
|
can be triggered explicitly from software or automatically by a
|
1373 |
|
|
specific monitor unit, which monitors how effectively time slots are
|
1374 |
|
|
used and starts the reconfiguration if needed \cite{kangas02}.
|
1375 |
|
|
Roughly 10 \% improvement in HIBI v.1 throughput in video encoding due
|
1376 |
|
|
to dynamic reconfiguration was reported in \cite{lahtinen02}. Larger
|
1377 |
|
|
gains are expected when several applications are executed on a single
|
1378 |
|
|
platform. Reconfiguration was used in \cite{kulmala08b} to speed-up
|
1379 |
|
|
the exploration on FPGA. It allowed notably fewer synthesis runs, each
|
1380 |
|
|
of which took several hours.
|
1381 |
|
|
|
1382 |
|
|
As a new feature in HIBI v.2, the second-level arbitration method can
|
1383 |
|
|
be changed at runtime between priority and round-robin or both of them
|
1384 |
|
|
can be disabled. When the second-level arbitration is disabled, only
|
1385 |
|
|
the basic TDMA is used and the slot owner reserves the bus always for
|
1386 |
|
|
the whole allocated time slot. Similarly, only the second-level
|
1387 |
|
|
arbitration is utilized when no time slots are allocated.
|
1388 |
|
|
|
1389 |
|
|
\begin{figure*} [t]
|
1390 |
|
|
\begin{center}
|
1391 |
|
|
{\includegraphics[width=0.75\textwidth]{../Fig/Eps/fig_hibi_cfg_mem_wave.eps}}
|
1392 |
|
|
\caption{Example of runtime configuration}
|
1393 |
|
|
\label{fig:cfg_mem_wave}
|
1394 |
|
|
\end{center}
|
1395 |
|
|
\end{figure*}
|
1396 |
|
|
|
1397 |
|
|
In HIBI v.2, three methods are used to improve the configuration
|
1398 |
|
|
procedure. First, by making use of the bus nature, each common
|
1399 |
|
|
parameter can be broadcast to all wrappers. Second, enabling the
|
1400 |
|
|
reading of configuration values simplifies the procedure as the whole
|
1401 |
|
|
configuration does not have to be stored in the configuring agent. In
|
1402 |
|
|
contrast, the configuring agent can read the old parameter values to
|
1403 |
|
|
help determining the new ones. Third, additional storage capacity for
|
1404 |
|
|
multiple parameter pages has been added to enable rapid change of all
|
1405 |
|
|
parameters. When a configuration page changes, all the parameters are
|
1406 |
|
|
updated immediately with one bus operation. It is possible to store a
|
1407 |
|
|
specific configuration for every application (phase) in its own
|
1408 |
|
|
configuration page to enable fast configuration switching.
|
1409 |
|
|
|
1410 |
|
|
% !!! KS. myös, tuohon ei kyllä löydy viitettä kuka julkaissut
|
1411 |
|
|
% ym,joten se ei varmaan käy
|
1412 |
|
|
%
|
1413 |
|
|
% $http://www.eetasia.com/ARTICLES/2005JAN/B/2005JAN17_MPR_TA.pdf?SOURCES=DOWNLOAD$
|
1414 |
|
|
|
1415 |
|
|
|
1416 |
|
|
|
1417 |
|
|
|
1418 |
|
|
Runtime reconfiguration is illustrated in Fig.~\ref{fig:cfg_mem_wave}
|
1419 |
|
|
for 2-page configuration memory. Signals coming from receive
|
1420 |
|
|
controller to configuration memory (\textit{addr\_in, data\_in,
|
1421 |
|
|
we\_in}) are shown on top. % with
|
1422 |
|
|
% post-fix
|
1423 |
|
|
% \emph{\_in}.
|
1424 |
|
|
In the middle are the registers \textit{.prior, .n\_agents, .arb\_type, .max\_send} for both
|
1425 |
|
|
configuration pages (all parameter registers are not shown for clarity). On
|
1426 |
|
|
the bottom, are the signals from memory to transfer controller
|
1427 |
|
|
(\textit{prior\_out, n\_agents\_out, arb\_type\_out, max\_send\_out}).
|
1428 |
|
|
In the example, the first digit of the address defines the page and two
|
1429 |
|
|
last digits define the parameter number.
|
1430 |
|
|
\begin{enumerate}
|
1431 |
|
|
\item The parameter registers for priority ($.prior$), arbitration
|
1432 |
|
|
type ($.arb\_type$), and maximum send amount ($.max\_send$) on
|
1433 |
|
|
current page (page 1) are configured to values 5, 2, and 20,
|
1434 |
|
|
respectively.
|
1435 |
|
|
|
1436 |
|
|
\item Parameters on the inactive page are updated: priority is set to
|
1437 |
|
|
4, arbitration type is changed from round-robin (0) to priority (1),
|
1438 |
|
|
and max\_send is increased to 30.
|
1439 |
|
|
|
1440 |
|
|
\item Page 2 is activated by writing value 2 to address 0x000. When
|
1441 |
|
|
the page is changed, all outputs to transfer controller change
|
1442 |
|
|
immediately. Since the number of agents ($n\_agents$) changes to
|
1443 |
|
|
value 8, the wrapper with priority 9 cannot access the bus anymore.
|
1444 |
|
|
This way arbitration latency can be decreased if some agent is known
|
1445 |
|
|
to be idle.
|
1446 |
|
|
\end{enumerate}
|
1447 |
|
|
|
1448 |
|
|
|
1449 |
|
|
\section{Performance and resource usage}
|
1450 |
|
|
|
1451 |
|
|
\subsection{HIBI wrapper structure}
|
1452 |
|
|
|
1453 |
|
|
The resource usage of the HIBI comes mainly from its wrappers. HIBI
|
1454 |
|
|
version 3 has three types of them which include R1, R3 and
|
1455 |
|
|
R4. Figure~\ref{fig:r3_structure} shows how an R3 wrapper is
|
1456 |
|
|
constructed of multiplexors and a R1 wrapper which has four separate
|
1457 |
|
|
FIFOs itself.
|
1458 |
|
|
|
1459 |
|
|
\begin{figure*}
|
1460 |
|
|
\begin{center}
|
1461 |
|
|
\includegraphics[width=0.9\textwidth]{../Fig/Eps/fig_r3_structure.eps}
|
1462 |
|
|
\caption{HIBI R3 wrapper block diagram}
|
1463 |
|
|
\label{fig:r3_structure}
|
1464 |
|
|
\end{center}
|
1465 |
|
|
\end{figure*}
|
1466 |
|
|
|
1467 |
|
|
\subsection{Resource usage}
|
1468 |
|
|
|
1469 |
|
|
The resource usage for individual HIBI wrappers was acquired from a SoC
|
1470 |
|
|
that was synthesized to an Arria II GX FPGA on an Arria II GX
|
1471 |
|
|
development board. The SoC had two HIBI components with both attached
|
1472 |
|
|
to a R3 HIBI wrapper. The size of the fifos on these wrappers was set
|
1473 |
|
|
to 4 words which means $4 \cdot 32b = 128 b$ on each fifo.
|
1474 |
|
|
|
1475 |
|
|
Table~\ref{table:resource_usage} shows the combinatorial ALUT (adaptive
|
1476 |
|
|
LUT) counts and register counts of a wrapper. Both minimum and maximum
|
1477 |
|
|
values are reported since synthesis does not always produce exactly
|
1478 |
|
|
the same results. Area can be significantly reduced if the FIFOs are
|
1479 |
|
|
implemented as onchip memories (m9k blocks in Arria II GX).
|
1480 |
|
|
|
1481 |
|
|
\begin{table*}
|
1482 |
|
|
\caption {Resource usage of wrapper R3, with 32b data, multiplexed address and 5b command.
|
1483 |
|
|
\label{table:resource_usage}
|
1484 |
|
|
v.2 and v.3 }
|
1485 |
|
|
\begin{center}
|
1486 |
|
|
\begin{tabular}{l | l | r }
|
1487 |
|
|
\hline \hline
|
1488 |
|
|
Wrapper subblock & Unit & Value \\
|
1489 |
|
|
\hline \hline
|
1490 |
|
|
HIBI wrapper r3 & comb. ALUTs & 724-763 \\
|
1491 |
|
|
& registers & 1029-1168 \\
|
1492 |
|
|
\hline
|
1493 |
|
|
HIBI wrapper r1 & comb. ALUTs & 466-533 \\
|
1494 |
|
|
& registers & 825-935 \\
|
1495 |
|
|
\hline
|
1496 |
|
|
4-word FIFO & comb. ALUTs & 76-104 \\
|
1497 |
|
|
& registers & 155-167 \\
|
1498 |
|
|
\hline
|
1499 |
|
|
\end{tabular}
|
1500 |
|
|
\end{center}
|
1501 |
|
|
\end{table*}
|
1502 |
|
|
|
1503 |
|
|
|
1504 |
|
|
Fig.~\ref{fig:chip_plannet} shows the
|
1505 |
|
|
resource usage layout on the FPGA as seen on the Chip Planner in
|
1506 |
|
|
Quartus II. The two wrappers are highlighted in blue.
|
1507 |
|
|
|
1508 |
|
|
|
1509 |
|
|
\begin{figure*}
|
1510 |
|
|
\begin{center}
|
1511 |
|
|
\includegraphics[width=0.4\textwidth]{../Fig/Eps/fig_chip_planner.eps}
|
1512 |
|
|
\caption{HIBI R3 in Quartus' chip planner tool}
|
1513 |
|
|
\label{fig:chip_plannet}
|
1514 |
|
|
\end{center}
|
1515 |
|
|
\end{figure*}
|
1516 |
|
|
|
1517 |
|
|
|
1518 |
|
|
\subsection{Simulated performance}
|
1519 |
|
|
|
1520 |
|
|
The throughput was measured for a 32 bit, 200 MHz HIBI segment with
|
1521 |
|
|
two components, both of which were connected to the segment with a R3
|
1522 |
|
|
wrapper. The sender transmitted a continuous stream of 1024 words to a
|
1523 |
|
|
single address. Maximum throughput is $200 MHz \cdot 32b =$ 800
|
1524 |
|
|
MByte/s. Since the data and address buses are muxed together, the
|
1525 |
|
|
minimum time to send the stream would be 1025 cycles. Measured
|
1526 |
|
|
latency and throughput are shown in Fig.~\ref{fig:performance}. Both
|
1527 |
|
|
approach their theoretical limits as the FIFO depth increases.
|
1528 |
|
|
|
1529 |
|
|
\begin{figure*}
|
1530 |
|
|
\begin{center}
|
1531 |
|
|
\subfigure[Transfer latency in cycles. Theoretical minimum 1025 cycles (one cycle needed for address)]{\includegraphics[width=0.85\textwidth]{../Fig/Eps/gra_latency_1024words.eps}
|
1532 |
|
|
\label{subfig:perf_latency}}
|
1533 |
|
|
\subfigure[Throughput in MB/s. Theoretical max 800 MB/s]{\includegraphics[width=0.85\textwidth]{../Fig/Eps/gra_throughput_1024words.eps}
|
1534 |
|
|
\label{subfig:perf_throughput}}
|
1535 |
|
|
\caption{Performance with 1024-word transfers.}
|
1536 |
|
|
\label{fig:performance}
|
1537 |
|
|
\end{center}
|
1538 |
|
|
\end{figure*}
|
1539 |
|
|
|
1540 |
|
|
|
1541 |
|
|
|
1542 |
|
|
|
1543 |
|
|
|
1544 |
|
|
\section{Usage examples}
|
1545 |
|
|
|
1546 |
|
|
IP can connect directly to HIBI but CPUs should use a DMA. It allows
|
1547 |
|
|
performing transfers on the background while CPU is processing.
|
1548 |
|
|
|
1549 |
|
|
\subsection{Transmission with dual-port memory buffer and DMA controller}
|
1550 |
|
|
|
1551 |
|
|
Fig.~\ref{fig:dma_tx} shows the concept how CPU can send data using
|
1552 |
|
|
DMA.
|
1553 |
|
|
\begin{enumerate}
|
1554 |
|
|
\item CPU reserves buffer space from dual-port memory
|
1555 |
|
|
\item CPU copies/writes data to dual-port memory
|
1556 |
|
|
\item CPU configures DMA transfer: memory address, size of transfer,
|
1557 |
|
|
and destination IP-block's HIBI address (not local CPU address)
|
1558 |
|
|
\item DMA reads data from dual-port memory and sends the data to the
|
1559 |
|
|
configured HIBI address
|
1560 |
|
|
\end{enumerate}
|
1561 |
|
|
|
1562 |
|
|
|
1563 |
|
|
\begin{figure}
|
1564 |
|
|
\begin{center}
|
1565 |
|
|
{\includegraphics[width=0.7\textwidth]{../Fig/Eps/fig_dma_tx.eps}}
|
1566 |
|
|
\caption{Example how CPU sends using DMA.}
|
1567 |
|
|
\label{fig:dma_tx}
|
1568 |
|
|
\end{center}
|
1569 |
|
|
\end{figure}
|
1570 |
|
|
|
1571 |
|
|
|
1572 |
|
|
\subsection{Reception with dual-port memory buffer and DMA controller}
|
1573 |
|
|
|
1574 |
|
|
Fig.~\ref{fig:dma_rx} shows the concept how CPU can use DMA to copy
|
1575 |
|
|
received data into the local dual-port memory.
|
1576 |
|
|
|
1577 |
|
|
\begin{enumerate}
|
1578 |
|
|
\item CPU reserves buffer space from dual-port memory
|
1579 |
|
|
\item CPU configures DMA: Memory address, size of transfer, and the
|
1580 |
|
|
HIBI address in which data is received
|
1581 |
|
|
\item DMA copies the incoming data to DPRAM
|
1582 |
|
|
\item DMA interrupts CPU when a configured number of words have been
|
1583 |
|
|
received
|
1584 |
|
|
\item CPU knows that data is ready in memory and uses it/copies to
|
1585 |
|
|
data memory
|
1586 |
|
|
\end{enumerate}
|
1587 |
|
|
|
1588 |
|
|
\begin{figure}
|
1589 |
|
|
\begin{center}
|
1590 |
|
|
{\includegraphics[width=0.7\textwidth]{../Fig/Eps/fig_dma_rx.eps}}
|
1591 |
|
|
\caption{Example how CPU receives data using DMA.}
|
1592 |
|
|
\label{fig:dma_rx}
|
1593 |
|
|
\end{center}
|
1594 |
|
|
\end{figure}
|
1595 |
|
|
|
1596 |
|
|
Rx buffers are organized as channels. Fig.~\ref{fig:dma_rx_buffers}
|
1597 |
|
|
shows how DMA translates incoming HIBI addresses into addresses in the
|
1598 |
|
|
local memory. Only memory space limits how many buffers (channels)
|
1599 |
|
|
exist at the same time. Channels have implicit meanings that must be
|
1600 |
|
|
agreed:
|
1601 |
|
|
\begin{enumerate}
|
1602 |
|
|
|
1603 |
|
|
\item Who (what IP-block or CPU) sends data to which channel, since
|
1604 |
|
|
otherwise the sender is not known (HIBI does not send sender ID in
|
1605 |
|
|
transfers).
|
1606 |
|
|
\item Possible explicit meaning of channel like ``DCT transform
|
1607 |
|
|
Q-parameter''. Then, it is not that relevant who provides data.
|
1608 |
|
|
\end{enumerate}
|
1609 |
|
|
|
1610 |
|
|
|
1611 |
|
|
\begin{figure}
|
1612 |
|
|
\begin{center}
|
1613 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/fig_dma_rx_buffers.eps}}
|
1614 |
|
|
\caption{Example mapping between incoming address and buffer in dual-port memory.}
|
1615 |
|
|
\label{fig:dma_rx_buffers}
|
1616 |
|
|
\end{center}
|
1617 |
|
|
\end{figure}
|
1618 |
|
|
|
1619 |
|
|
\subsection{Example: use source specific addresses}
|
1620 |
|
|
|
1621 |
|
|
\begin{figure}
|
1622 |
|
|
\begin{center}
|
1623 |
|
|
{\includegraphics[width=0.5\textwidth]{../Fig/Eps/fig_src_specific_addr.eps}}
|
1624 |
|
|
\caption{Example how CPU instructs the IP block where to put result data.}
|
1625 |
|
|
\label{fig:src_specific_addr}
|
1626 |
|
|
\end{center}
|
1627 |
|
|
\end{figure}
|
1628 |
|
|
|
1629 |
|
|
|
1630 |
|
|
Designer wished to implement following high-level sequence ``HW
|
1631 |
|
|
IP-block A should send data to CPU after initialization''. The
|
1632 |
|
|
procedure to achieve this is
|
1633 |
|
|
\begin{enumerate}
|
1634 |
|
|
\item CPU sets rx buffer address to its DMA block N2H2\_0
|
1635 |
|
|
\item CPU sends that same address to A's IP-block specific
|
1636 |
|
|
configuration register
|
1637 |
|
|
\item IP A now knows where to send data
|
1638 |
|
|
\item CPU knows from where data is coming to address
|
1639 |
|
|
\end{enumerate}
|
1640 |
|
|
|
1641 |
|
|
It is assumed that CPU and IP A know the data amount at design
|
1642 |
|
|
time. Otherwise, it must be agreed upon during initialization (that was
|
1643 |
|
|
omitted for clarity).
|
1644 |
|
|
|
1645 |
|
|
\subsection{SW interface to DMA}
|
1646 |
|
|
|
1647 |
|
|
There are low-level SW macros available that access the hardware registers
|
1648 |
|
|
of HIBI PE DMA (abbreviated as HPD). They implement a driver, but
|
1649 |
|
|
can be also used from user programs.
|
1650 |
|
|
|
1651 |
|
|
\begin{table*}
|
1652 |
|
|
\caption {The SW macros for accessing the DMA controller's registers}
|
1653 |
|
|
\label{table:dma_macros}
|
1654 |
|
|
\begin{center}
|
1655 |
|
|
\begin{tabular}{p{0.5\textwidth} | p{0.5\textwidth} }
|
1656 |
|
|
\hline
|
1657 |
|
|
Macro & Meaning \\
|
1658 |
|
|
\hline \hline
|
1659 |
|
|
|
1660 |
|
|
void HPD\_CHAN\_CONF ( int channel, int mem\_addr, int rx\_addr, int
|
1661 |
|
|
amount, int* base ) & Configure HPD channels. After configuration,
|
1662 |
|
|
specific channel is ready to receive amount of data to rx\_addr HIBI
|
1663 |
|
|
address. Received data is stored to mem\_addr in HPD address space.
|
1664 |
|
|
\\
|
1665 |
|
|
\hline
|
1666 |
|
|
|
1667 |
|
|
void HPD\_SEND (int mem\_addr, int amount, int haddr, int* base) &
|
1668 |
|
|
Send amount of data from mem\_addr to haddr HIBI address. mem\_addr is
|
1669 |
|
|
memory address in HPD address space. \\
|
1670 |
|
|
\hline
|
1671 |
|
|
|
1672 |
|
|
void HPD\_READ (int mem\_addr, int amount, int haddr, int* base) &
|
1673 |
|
|
Send command to read amount of data from haddr HIBI address. \\
|
1674 |
|
|
\hline
|
1675 |
|
|
|
1676 |
|
|
void HPD\_SEND\_MSG (int mem\_addr, int amount, int haddr, int* base)
|
1677 |
|
|
& Send amount of data from mem\_addr to haddr HIBI address as HIBI
|
1678 |
|
|
message. mem\_addr is memory address in HPD address space. \\
|
1679 |
|
|
\hline
|
1680 |
|
|
|
1681 |
|
|
int HPD\_TX\_DONE(int* base) & Returns status of transmit
|
1682 |
|
|
operation. \\
|
1683 |
|
|
\hline
|
1684 |
|
|
|
1685 |
|
|
void HPD\_CLEAR\_IRQ(int chan, int* base) & Clears IRQ of specific
|
1686 |
|
|
channel. \\
|
1687 |
|
|
\hline
|
1688 |
|
|
|
1689 |
|
|
int HPD\_GET\_IRQ\_CHAN(int* base) & Return the number of the channel
|
1690 |
|
|
that caused interrupt. If interrupt hasn't occurred, return -1. \\
|
1691 |
|
|
\hline
|
1692 |
|
|
\end{tabular}
|
1693 |
|
|
\end{center}
|
1694 |
|
|
\end{table*}
|
1695 |
|
|
|
1696 |
|
|
Notes: ``HPD'' is HIBI PE DMA (previously called Nios-to-HIBI 2,
|
1697 |
|
|
N2H2). ``Base'' is the base address of HIBI PE DMA in HIBI address
|
1698 |
|
|
space. ``Amount'' is data amount in 32-bit words.
|
1699 |
|
|
|
1700 |
|
|
|
1701 |
|
|
\begin{table*}
|
1702 |
|
|
\caption {The SW functions for using the DMA}
|
1703 |
|
|
\label{table:dma_functions}
|
1704 |
|
|
\begin{center}
|
1705 |
|
|
\begin{tabular}{p{0.5\textwidth} | p{0.5\textwidth} }
|
1706 |
|
|
\hline Function & Meaning \\ \hline \hline
|
1707 |
|
|
|
1708 |
|
|
void HIBI\_TX (uint8* pData, uint32 dataLen, uint32 destAddr,
|
1709 |
|
|
uint8 commType) &
|
1710 |
|
|
|
1711 |
|
|
Send data over HIBI. pData is pointer to data, dataLen is length
|
1712 |
|
|
of the data in bytes, destAddr is destination HIBI address,
|
1713 |
|
|
commType is either HIBI\_TRANSFER\_TYPE\_DATA or
|
1714 |
|
|
HIBI\_TRANSFER\_TYPE\_MESSAGE. Differences to lower level
|
1715 |
|
|
macros are the automatic copying of memory to HIBI PE DMA-buffer
|
1716 |
|
|
and protection against simultaneous sending in different
|
1717 |
|
|
threads. \\
|
1718 |
|
|
|
1719 |
|
|
\hline
|
1720 |
|
|
|
1721 |
|
|
struct sN2H\_ChannelInfo* N2H\_ReserveChannel( int32 bufferSize,
|
1722 |
|
|
void* callbackFunc, bool handleInDsr, bool calledFromDsr, sint32
|
1723 |
|
|
channelNum) &
|
1724 |
|
|
|
1725 |
|
|
Reserve a channel for receiving data. bufferSize Size of the
|
1726 |
|
|
data to be received (bytes). callbackFunc: Function to call
|
1727 |
|
|
when the data arrives. Prototype: function(uint8* pData, uint32
|
1728 |
|
|
dataLen, uint32 receivedAddr) handleInDsr: Set to false
|
1729 |
|
|
calledFromDsr: Set to false channelNum: Channel that is waiting
|
1730 |
|
|
for incoming data. The complete address will be HIBI base
|
1731 |
|
|
address + channelNum. Difference to lower level macros is that
|
1732 |
|
|
interrupt handler provided by HIBI driver, own function can be
|
1733 |
|
|
registered directly to handle data. \\
|
1734 |
|
|
|
1735 |
|
|
\hline
|
1736 |
|
|
\end{tabular}
|
1737 |
|
|
\end{center}
|
1738 |
|
|
\end{table*}
|
1739 |
|
|
|
1740 |
|
|
|
1741 |
|
|
|
1742 |
|
|
HIBI\_TX checks that previous send operation is complete and calls
|
1743 |
|
|
HPD\_send macro. Hence, it also runs macros HPD\_TX\_ADDR, TX\_AMOUNT, HIBI\_ADDR,
|
1744 |
|
|
TX\_COMM, and TX\_START. Finally, it releases the Tx channel.
|
1745 |
|
|
|
1746 |
|
|
The following example shows a data transfer between two CPUs assuming the
|
1747 |
|
|
system in
|
1748 |
|
|
Fig.~\ref{subfig:dma_example}. Fig.~\ref{subfig:dma_seq_diag} shows
|
1749 |
|
|
the sequence diagram.
|
1750 |
|
|
|
1751 |
|
|
|
1752 |
|
|
\begin{figure*}
|
1753 |
|
|
\begin{center}
|
1754 |
|
|
\subfigure[IP sends.]{\includegraphics[width=0.85\textwidth]{../Fig/Eps/fig_dma_example.eps}
|
1755 |
|
|
\label{subfig:dma_example}}
|
1756 |
|
|
\subfigure[IP receives data]{\includegraphics[width=0.85\textwidth]{../Fig/Eps/fig_dma_seq_diag.eps}
|
1757 |
|
|
\label{subfig:dma_seq_diag}}
|
1758 |
|
|
\caption{Examples of timing at IP interface.}
|
1759 |
|
|
\label{fig:dma_example}
|
1760 |
|
|
\end{center}
|
1761 |
|
|
\end{figure*}
|
1762 |
|
|
|
1763 |
|
|
|
1764 |
|
|
|
1765 |
|
|
|
1766 |
|
|
\section {Summary}
|
1767 |
|
|
|
1768 |
|
|
|
1769 |
|
|
|
1770 |
|
|
The most important properties of HIBI are summarized in
|
1771 |
|
|
Table~\ref{table:hibi_versions}. HIBI network allows multiple
|
1772 |
|
|
topologies and utilizes distributed arbitration. The network is
|
1773 |
|
|
constructed by instantiating multiple wrapper components and
|
1774 |
|
|
connecting them together. The wrapper is modular allowing good
|
1775 |
|
|
parameterization at design time and possibility to reconfigure certain
|
1776 |
|
|
parameters of the network at runtime.
|
1777 |
|
|
\begin{table*}
|
1778 |
|
|
\caption{Properties of HIBI v.3}
|
1779 |
|
|
\label{table:hibi_versions}
|
1780 |
|
|
\begin{center}
|
1781 |
|
|
\includegraphics[width=0.9\textwidth]{../Fig/Eps/tab_hibi_v3.eps}
|
1782 |
|
|
\end{center}
|
1783 |
|
|
\end{table*}
|
1784 |
|
|
|
1785 |
|
|
\setcounter{secnumdepth}{-1}
|
1786 |
|
|
\bibliography{IEEEfull,hibi_datasheet_ref}
|
1787 |
|
|
%\bibliography{hibi_datasheet_ref}
|
1788 |
|
|
\bibliographystyle{IEEEtranS}
|
1789 |
|
|
|
1790 |
|
|
|
1791 |
|
|
\end{document}
|
1792 |
|
|
|