\documentclass[a4paper]{article}
|
\documentclass[a4paper]{article}
|
\usepackage{amsmath}
|
\usepackage{amsmath}
|
\usepackage{amssymb}
|
\usepackage{amssymb}
|
\usepackage{amsfonts}
|
\usepackage{amsfonts}
|
\usepackage{graphicx}
|
\usepackage{graphicx}
|
\DeclareGraphicsExtensions{.pdf,.eps,.png,.jpg}
|
\DeclareGraphicsExtensions{.pdf,.eps,.png,.jpg}
|
\usepackage{color}
|
\usepackage{color}
|
\usepackage{psfig}
|
\usepackage{psfig}
|
\usepackage{float}
|
\usepackage{float}
|
\usepackage{subfigure}
|
\usepackage{subfigure}
|
\setlength\topmargin{0 in}
|
\setlength\topmargin{0 in}
|
\setlength\oddsidemargin{0 in}
|
\setlength\oddsidemargin{0 in}
|
\setlength\textwidth{6.5 in}
|
\setlength\textwidth{6.5 in}
|
|
|
\title{Fully Pipelined AES Core}
|
\title{Fully Pipelined AES Core}
|
\author{Subhasis Das}
|
\author{Subhasis Das}
|
\date{}
|
\date{}
|
|
|
\begin{document}
|
\begin{document}
|
\maketitle
|
\maketitle
|
\section*{Basic Architecture}
|
\section*{Basic Architecture}
|
This core meets the NIST FIPS-197 specifications. The basic block diagram is given in Figure~\ref{arch}.
|
This core meets the NIST FIPS-197 specifications. The basic block diagram is given in Figure~\ref{arch}.
|
\begin{figure}[h]
|
\begin{figure}[h]
|
\centering
|
\centering
|
\includegraphics[scale=0.7]{arch}
|
\includegraphics[scale=0.7]{arch}
|
\caption{Basic Architecture}
|
\caption{Basic Architecture}
|
\label{arch}
|
\label{arch}
|
\end{figure}
|
\end{figure}
|
|
|
I have generated each of the roundkeys in two steps. Let us call
|
I have generated each of the roundkeys in two steps. Let us call
|
\[
|
\[
|
\text{RotWord}(\text{Sbox}(C_3)) \;\text{xor}\; \text{RCon} \;=\; f(C_3)
|
\text{RotWord}(\text{Sbox}(C_3)) \;\text{xor}\; \text{RCon} \;=\; f(C_3)
|
\]
|
\]
|
Then, we can see that
|
Then, we can see that
|
\begin{equation*}
|
\begin{equation*}
|
\begin{aligned}
|
\begin{aligned}
|
C_0^\prime &= f(C_3) \;\text{xor}\; C_0 \\
|
C_0^\prime &= f(C_3) \;\text{xor}\; C_0 \\
|
C_1^\prime &= f(C_3) \;\text{xor}\; C_0 \;\text{xor}\; C_1 \\
|
C_1^\prime &= f(C_3) \;\text{xor}\; C_0 \;\text{xor}\; C_1 \\
|
C_2^\prime &= f(C_3) \;\text{xor}\; C_0 \;\text{xor}\; C_1 \;\text{xor}\; C_2 \\
|
C_2^\prime &= f(C_3) \;\text{xor}\; C_0 \;\text{xor}\; C_1 \;\text{xor}\; C_2 \\
|
C_3^\prime &= f(C_3) \;\text{xor}\; C_0 \;\text{xor}\; C_1 \;\text{xor}\; C_2 \;\text{xor}\; C_3
|
C_3^\prime &= f(C_3) \;\text{xor}\; C_0 \;\text{xor}\; C_1 \;\text{xor}\; C_2 \;\text{xor}\; C_3
|
\end{aligned}
|
\end{aligned}
|
\end{equation*}
|
\end{equation*}
|
where $C_i$ is column $i$ of the current roundkey and $C_i^\prime$ is column $i$ of the next roundkey.
|
where $C_i$ is column $i$ of the current roundkey and $C_i^\prime$ is column $i$ of the next roundkey.
|
The first step of generating $f(C_3)$ is done along with the addkey step of the previous cycle, and the second step is done in the combined S-Box and ShiftRows step.
|
The first step of generating $f(C_3)$ is done along with the addkey step of the previous cycle, and the second step is done in the combined S-Box and ShiftRows step.
|
|
|
The inputs to the overall processor are as follows:
|
The inputs to the overall processor are as follows:
|
\begin{itemize}
|
\begin{itemize}
|
\item clk\_i: System Clock, Data I/O at rising edge
|
\item clk\_i: System Clock, Data I/O at rising edge
|
\item rst\_i: Asynchronous Reset, active high, initializes all inputs to all stages and the final output to zero.
|
\item rst\_i: Asynchronous Reset, active high, initializes all inputs to all stages and the final output to zero.
|
\item plaintext\_i: 16$\times$8 bits plaintext input
|
\item plaintext\_i: 16$\times$8 bits plaintext input
|
\item keyblock\_i: 16$\times$8 bits keyblock input
|
\item keyblock\_i: 16$\times$8 bits keyblock input
|
\end{itemize}
|
\end{itemize}
|
The output is
|
The output is
|
\begin{itemize}
|
\begin{itemize}
|
\item ciphertext\_o: 16$\times$8 bits ciphertext output
|
\item ciphertext\_o: 16$\times$8 bits ciphertext output
|
\end{itemize}
|
\end{itemize}
|
|
|
The timing diagram is shown in Figure~\ref{clock}.
|
The timing diagram is shown in Figure~\ref{clock}.
|
\begin{figure}[H]
|
\begin{figure}[H]
|
\centering
|
\centering
|
\includegraphics[scale=0.7]{clock}
|
\includegraphics[scale=0.7]{clock}
|
\caption{Timing Diagram}
|
\caption{Timing Diagram}
|
\label{clock}
|
\label{clock}
|
\end{figure}
|
\end{figure}
|
|
|
The \texttt{trunk/rtl/vhdl} directory contains the whole source code.
|
The \texttt{trunk/rtl/vhdl} directory contains the whole source code.
|
|
|
The sample testbench is in \texttt{trunk/bench/vhdl}.
|
The sample testbench is in \texttt{trunk/bench/vhdl}.
|
|
|
For compiling and running the testbench, the script \texttt{sim.sh} in the \texttt{trunk/sim/rtl\_sim/run} directory can be used. It uses the free Xilinx ISim simulator. The testbench takes in plaintext and key data from \texttt{vectors.dat} in the \texttt{trunk/sim/rtl\_sim/src} directory. The expected ciphertext data should be present in \texttt{cipher.dat} in the \texttt{trunk/sim/rtl\_sim/src} directory. The results are written to \texttt{output.log} in the \texttt{trunk/sim/rtl\_sim/log} directory.
|
For compiling and running the testbench, the script \texttt{sim\_isim.sh} in the \texttt{trunk/sim/rtl\_sim/run} directory can be used for the Xilinx ISim simulator and \texttt{sim\_ghdl.sh} for GHDL. The testbench takes in plaintext and key data from \texttt{vectors.dat} in the \texttt{trunk/sim/rtl\_sim/src} directory. The expected ciphertext data should be present in \texttt{cipher.dat} in the \texttt{trunk/sim/rtl\_sim/src} directory. The results are written to \texttt{output.log} in the \texttt{trunk/sim/rtl\_sim/log} directory. The final line is `OK' if all tests pass; otherwise it is `FAIL'. This can be used to automate checking over large test datasets.
|
|
|
The \texttt{trunk/syn/Xilinx/run} directory contains the \texttt{synth.sh} shell script, which will synthesize the design when run using Xilinx ISE WebPack tools.
|
The \texttt{trunk/syn/Xilinx/run} directory contains the \texttt{synth.sh} shell script, which will synthesize the design when run using Xilinx ISE WebPack tools.
|
|
|
The speed-optimized synthesis results with timing-driven map on a Xilinx 5VLX50T device are shown in Table~\ref{stats}.
|
The speed-optimized synthesis results with timing-driven map on a Xilinx 5VLX50T device are shown in Table~\ref{stats}.
|
\begin{table}[h]
|
\begin{table}[h]
|
\centering
|
\centering
|
\begin{tabular}{|l|l|}
|
\begin{tabular}{|l|l|}
|
\hline
|
\hline
|
$f_{max}$ & $\approx$ 330 MHz \\
|
$f_{max}$ & $\approx$ 330 MHz \\
|
\hline
|
\hline
|
Max throughput & $\approx$ 42 Gbps \\
|
Max throughput & $\approx$ 42 Gbps \\
|
\hline
|
\hline
|
Slice Registers & 7873 (27\%) \\
|
Slice Registers & 7873 (27\%) \\
|
\hline
|
\hline
|
Slice LUT's & 14724 (51\%) \\
|
Slice LUT's & 14724 (51\%) \\
|
\hline
|
\hline
|
Bonded IOB's & 386 (80\%) \\
|
Bonded IOB's & 386 (80\%) \\
|
\hline
|
\hline
|
\end{tabular}
|
\end{tabular}
|
\caption{Design Statistics}
|
\caption{Design Statistics}
|
\label{stats}
|
\label{stats}
|
\end{table}
|
\end{table}
|
|
|
All the synthesis, map, and place-and-route logs are available in the \texttt{trunk/syn/Xilinx/log} directory.
|
All the synthesis, map, and place-and-route logs are available in the \texttt{trunk/syn/Xilinx/log} directory.
|
\end{document}
|
\end{document}
|
|
|