URL https://opencores.org/ocsvn/dblclockfft/dblclockfft/trunk
Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [sw/] [butterfly.cpp] - Rev 40

Go to most recent revision | Compare with Previous | Blame | View Log
////////////////////////////////////////////////////////////////////////////////
//
// Filename: 	butterfly.cpp
//
// Project:	A General Purpose Pipelined FFT Implementation
//
// Purpose:	Builds one of two butterflies: either a butterfly implementation
//		using hardware optimized multiplies, or one that uses a logic
//	soft-multiply.
//
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2018, Gisselquist Technology, LLC
//
// This file is part of the general purpose pipelined FFT project.
//
// The pipelined FFT project is free software (firmware): you can redistribute
// it and/or modify it under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// The pipelined FFT project is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTIBILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
// General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this program.  (It's in the $(ROOT)/doc directory.  Run make
// with no target there if the PDF file isn't present.)  If not, see
// <http://www.gnu.org/licenses/> for a copy.
//
// License:	LGPL, v3, as defined and found on www.gnu.org,
//		http://www.gnu.org/licenses/lgpl.html
//
//
////////////////////////////////////////////////////////////////////////////////
//
//
#define _CRT_SECURE_NO_WARNINGS   //  ms vs 2012 doesn't like fopen
#include <stdio.h>
#include <stdlib.h>
 
#ifdef _MSC_VER //  added for ms vs compatibility
 
#include <io.h>
#include <direct.h>
#define _USE_MATH_DEFINES
#define	R_OK    4       /* Test for read permission.  */
#define	W_OK    2       /* Test for write permission.  */
#define	X_OK    0       /* !!!!!! execute permission - unsupported in windows*/
#define	F_OK    0       /* Test for existence.  */
 
#if _MSC_VER <= 1700
 
int lstat(const char *filename, struct stat *buf) { return 1; };
#define	S_ISDIR(A)	0
 
#else
 
#define	lstat	_stat
#define S_ISDIR	_S_IFDIR
 
#endif
 
#define	mkdir(A,B)	_mkdir(A)
 
#define access _access
 
#else
// And for G++/Linux environment
 
#include <unistd.h>	// Defines the R_OK/W_OK/etc. macros
#include <sys/stat.h>
#endif
 
#include <string.h>
#include <string>
#include <math.h>
#include <ctype.h>
#include <assert.h>
 
#include "defaults.h"
#include "legal.h"
#include "rounding.h"
#include "fftlib.h"
#include "bldstage.h"
#include "bitreverse.h"
#include "softmpy.h"
#include "butterfly.h"
 
void	build_butterfly(const char *fname, int xtracbits, ROUND_T rounding,
			int	ckpce, const bool async_reset) {
	FILE	*fp = fopen(fname, "w");
	if (NULL == fp) {
		fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
		perror("O/S Err was:");
		return;
	}
	const	char	*rnd_string;
	if (rounding == RND_TRUNCATE)
		rnd_string = "truncate";
	else if (rounding == RND_FROMZERO)
		rnd_string = "roundfromzero";
	else if (rounding == RND_HALFUP)
		rnd_string = "roundhalfup";
	else
		rnd_string = "convround";
 
	//if (ckpce >= 3)
		//ckpce = 3;
	if (ckpce <= 1)
		ckpce = 1;
 
	std::string	resetw("i_reset");
	if (async_reset)
		resetw = std::string("i_areset_n");
 
 
	fprintf(fp,
SLASHLINE
"//\n"
"// Filename:\tbutterfly.v\n"
"//\n"
"// Project:\t%s\n"
"//\n"
"// Purpose:\tThis routine caculates a butterfly for a decimation\n"
"//		in frequency version of an FFT.  Specifically, given\n"
"//	complex Left and Right values together with a coefficient, the output\n"
"//	of this routine is given by:\n"
"//\n"
"//		L' = L + R\n"
"//		R' = (L - R)*C\n"
"//\n"
"//	The rest of the junk below handles timing (mostly), to make certain\n"
"//	that L' and R' reach the output at the same clock.  Further, just to\n"
"//	make certain that is the case, an 'aux' input exists.  This aux value\n"
"//	will come out of this routine synchronized to the values it came in\n"
"//	with.  (i.e., both L', R', and aux all have the same delay.)  Hence,\n"
"//	a caller of this routine may set aux on the first input with valid\n"
"//	data, and then wait to see aux set on the output to know when to find\n"
"//	the first output with valid data.\n"
"//\n"
"//	All bits are preserved until the very last clock, where any more bits\n"
"//	than OWIDTH will be quietly discarded.\n"
"//\n"
"//	This design features no overflow checking.\n"
"//\n"
"// Notes:\n"
"//	CORDIC:\n"
"//		Much as we might like, we can't use a cordic here.\n"
"//		The goal is to accomplish an FFT, as defined, and a\n"
"//		CORDIC places a scale factor onto the data.  Removing\n"
"//		the scale factor would cost two multiplies, which\n"
"//		is precisely what we are trying to avoid.\n"
"//\n"
"//\n"
"//	3-MULTIPLIES:\n"
"//		It should also be possible to do this with three multiplies\n"
"//		and an extra two addition cycles.\n"
"//\n"
"//		We want\n"
"//			R+I = (a + jb) * (c + jd)\n"
"//			R+I = (ac-bd) + j(ad+bc)\n"
"//		We multiply\n"
"//			P1 = ac\n"
"//			P2 = bd\n"
"//			P3 = (a+b)(c+d)\n"
"//		Then\n"
"//			R+I=(P1-P2)+j(P3-P2-P1)\n"
"//\n"
"//		WIDTHS:\n"
"//		On multiplying an X width number by an\n"
"//		Y width number, X>Y, the result should be (X+Y)\n"
"//		bits, right?\n"
"//		-2^(X-1) <= a <= 2^(X-1) - 1\n"
"//		-2^(Y-1) <= b <= 2^(Y-1) - 1\n"
"//		(2^(Y-1)-1)*(-2^(X-1)) <= ab <= 2^(X-1)2^(Y-1)\n"
"//		-2^(X+Y-2)+2^(X-1) <= ab <= 2^(X+Y-2) <= 2^(X+Y-1) - 1\n"
"//		-2^(X+Y-1) <= ab <= 2^(X+Y-1)-1\n"
"//		YUP!  But just barely.  Do this and you'll really want\n"
"//		to drop a bit, although you will risk overflow in so\n"
"//		doing.\n"
"//\n"
"//	20150602 -- The sync logic lines have been completely redone.  The\n"
"//		synchronization lines no longer go through the FIFO with the\n"
"//		left hand sum, but are kept out of memory.  This allows the\n"
"//		butterfly to use more optimal memory resources, while also\n"
"//		guaranteeing that the sync lines can be properly reset upon\n"
"//		any reset signal.\n"
"//\n"
"//\n%s"
"//\n", prjname, creator);
	fprintf(fp, "%s", cpyleft);
	fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
 
	fprintf(fp,
"module\tbutterfly(i_clk, %s, i_ce, i_coef, i_left, i_right, i_aux,\n"
		"\t\to_left, o_right, o_aux);\n"
	"\t// Public changeable parameters ...\n", resetw.c_str());
 
	fprintf(fp,
	"\tparameter IWIDTH=%d,", TST_BUTTERFLY_IWIDTH);
#ifdef	TST_BUTTERFLY_CWIDTH
	fprintf(fp, "CWIDTH=%d,", TST_BUTTERFLY_CWIDTH);
#else
	fprintf(fp, "CWIDTH=IWIDTH+%d,", xtracbits);
#endif
#ifdef	TST_BUTTERFLY_OWIDTH
	fprintf(fp, "OWIDTH=%d;\n", TST_BUTTERFLY_OWIDTH);
	// OWIDTH = TST_BUTTERFLY_OWIDTH;
#else
	fprintf(fp, "OWIDTH=IWIDTH+1;\n");
#endif
	fprintf(fp, "\tparameter\tSHIFT=0;\n");
 
	fprintf(fp,
	"\t// The number of clocks per each i_ce.  The actual number can be\n"
	"\t// more, but the algorithm depends upon at least this many for\n"
	"\t// extra internal processing.\n"
	"\tparameter	CKPCE=%d;\n", ckpce);
 
	fprintf(fp,
	"\t//\n"
	"\t// Local/derived parameters that are calculated from the above\n"
	"\t// params.  Apart from algorithmic changes below, these should not\n"
	"\t// be adjusted\n"
	"\t//\n"
	"\t// The first step is to calculate how many clocks it takes our\n"
	"\t// multiply to come back with an answer within.  The time in the\n"
	"\t// multiply depends upon the input value with the fewest number of\n"
	"\t// bits--to keep the pipeline depth short.  So, let's find the\n"
	"\t// fewest number of bits here.\n"
	"\tlocalparam MXMPYBITS = \n"
		"\t\t((IWIDTH+2)>(CWIDTH+1)) ? (CWIDTH+1) : (IWIDTH + 2);\n"
	"\t//\n"
	"\t// Given this \"fewest\" number of bits, we can calculate the\n"
	"\t// number of clocks the multiply itself will take.\n"
	"\tlocalparam	MPYDELAY=((MXMPYBITS+1)/2)+2;\n"
	"\t//\n"
	"\t// In an environment when CKPCE > 1, the multiply delay isn\'t\n"
	"\t// necessarily the delay felt by this algorithm--measured in\n"
	"\t// i_ce\'s.  In particular, if the multiply can operate with more\n"
	"\t// operations per clock, it can appear to finish \"faster\".\n"
	"\t// Since most of the logic in this core operates on the slower\n"
	"\t// clock, we'll need to map that speed into the number of slower\n"
	"\t// clock ticks that it takes.\n"
	"\tlocalparam	LCLDELAY = (CKPCE == 1) ? MPYDELAY\n"
		"\t\t: (CKPCE == 2) ? (MPYDELAY/2+2)\n"
		"\t\t: (MPYDELAY/3 + 2);\n"
	"\tlocalparam	LGDELAY = (MPYDELAY>64) ? 7\n"
			"\t\t\t: (MPYDELAY > 32) ? 6\n"
			"\t\t\t: (MPYDELAY > 16) ? 5\n"
			"\t\t\t: (MPYDELAY >  8) ? 4\n"
			"\t\t\t: (MPYDELAY >  4) ? 3\n"
			"\t\t\t: 2;\n"
	"\tlocalparam	AUXLEN=(LCLDELAY+3);\n"
	"\tlocalparam	MPYREMAINDER = MPYDELAY - CKPCE*(MPYDELAY/CKPCE);\n"
"\n\n");
 
 
	fprintf(fp,
	"\tinput\twire\ti_clk, %s, i_ce;\n"
	"\tinput\twire\t[(2*CWIDTH-1):0] i_coef;\n"
	"\tinput\twire\t[(2*IWIDTH-1):0] i_left, i_right;\n"
	"\tinput\twire\ti_aux;\n"
	"\toutput\twire	[(2*OWIDTH-1):0] o_left, o_right;\n"
	"\toutput\treg\to_aux;\n\n", resetw.c_str());
 
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\tlocalparam	F_LGDEPTH = (AUXLEN > 64) ? 7\n"
			"\t\t\t: (AUXLEN > 32) ? 6\n"
			"\t\t\t: (AUXLEN > 16) ? 5\n"
			"\t\t\t: (AUXLEN >  8) ? 4\n"
			"\t\t\t: (AUXLEN >  4) ? 3 : 2;\n"
"\n"
	"\tlocalparam	F_DEPTH = AUXLEN;\n"
	"\tlocalparam	[F_LGDEPTH-1:0]	F_D = F_DEPTH[F_LGDEPTH-1:0]-1;\n"
"\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyleft_r  [0:F_DEPTH-1];\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyleft_i  [0:F_DEPTH-1];\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyright_r [0:F_DEPTH-1];\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyright_i [0:F_DEPTH-1];\n"
	"\treg	signed	[CWIDTH-1:0]	f_dlycoeff_r [0:F_DEPTH-1];\n"
	"\treg	signed	[CWIDTH-1:0]	f_dlycoeff_i [0:F_DEPTH-1];\n"
	"\treg	signed	[F_DEPTH-1:0]	f_dlyaux;\n"
"\n"
	"\twire	signed	[IWIDTH:0]		f_predifr, f_predifi;\n"
	"\twire	signed	[IWIDTH+CWIDTH+3-1:0]	f_predifrx, f_predifix;\n"
	"\twire	signed	[CWIDTH:0]		f_sumcoef;\n"
	"\twire	signed	[IWIDTH+1:0]		f_sumdiff;\n"
	"\twire	signed	[IWIDTH:0]		f_sumr, f_sumi;\n"
	"\twire	signed	[IWIDTH+CWIDTH+3-1:0]	f_sumrx, f_sumix;\n"
	"\twire	signed	[IWIDTH:0]		f_difr, f_difi;\n"
	"\twire	signed	[IWIDTH+CWIDTH+3-1:0]	f_difrx, f_difix;\n"
	"\twire	signed	[IWIDTH+CWIDTH+3-1:0]	f_widecoeff_r, f_widecoeff_i;\n"
"\n"
	"\twire	[(CWIDTH):0]	fp_one_ic, fp_two_ic, fp_three_ic, f_p3c_in;\n"
	"\twire	[(IWIDTH+1):0]	fp_one_id, fp_two_id, fp_three_id, f_p3d_in;\n"
"`endif\n\n");
 
	fprintf(fp,
	"\treg\t[(2*IWIDTH-1):0]\tr_left, r_right;\n"
	"\treg\t[(2*CWIDTH-1):0]\tr_coef, r_coef_2;\n"
	"\twire\tsigned\t[(IWIDTH-1):0]\tr_left_r, r_left_i, r_right_r, r_right_i;\n"
	"\tassign\tr_left_r  = r_left[ (2*IWIDTH-1):(IWIDTH)];\n"
	"\tassign\tr_left_i  = r_left[ (IWIDTH-1):0];\n"
	"\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n"
	"\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n"
"\n"
	"\treg\tsigned\t[(IWIDTH):0]\tr_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"
"\n"
	"\treg	[(LGDELAY-1):0]	fifo_addr;\n"
	"\twire	[(LGDELAY-1):0]	fifo_read_addr;\n"
	"\tassign\tfifo_read_addr = fifo_addr - LCLDELAY[(LGDELAY-1):0];\n"
	"\treg	[(2*IWIDTH+1):0]	fifo_left [ 0:((1<<LGDELAY)-1)];\n"
"\n");
	fprintf(fp,
	"\t// Set up the input to the multiply\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_ce)\n"
	"\tbegin\n"
		"\t\t// One clock just latches the inputs\n"
		"\t\tr_left <= i_left;	// No change in # of bits\n"
		"\t\tr_right <= i_right;\n"
		"\t\tr_coef  <= i_coef;\n"
		"\t\t// Next clock adds/subtracts\n"
		"\t\tr_sum_r <= r_left_r + r_right_r; // Now IWIDTH+1 bits\n"
		"\t\tr_sum_i <= r_left_i + r_right_i;\n"
		"\t\tr_dif_r <= r_left_r - r_right_r;\n"
		"\t\tr_dif_i <= r_left_i - r_right_i;\n"
		"\t\t// Other inputs are simply delayed on second clock\n"
		"\t\tr_coef_2<= r_coef;\n"
	"\tend\n"
"\n");
	fprintf(fp,
	"\t// Don\'t forget to record the even side, since it doesn\'t need\n"
	"\t// to be multiplied, but yet we still need the results in sync\n"
	"\t// with the answer when it is ready.\n"
	"\tinitial fifo_addr = 0;\n");
	if (async_reset)
		fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\tif (!i_areset_n)\n");
	else
		fprintf(fp, "\talways @(posedge i_clk)\n\tif (i_reset)\n");
	fprintf(fp,
			"\t\tfifo_addr <= 0;\n"
		"\telse if (i_ce)\n"
			"\t\t// Need to delay the sum side--nothing else happens\n"
			"\t\t// to it, but it needs to stay synchronized with the\n"
			"\t\t// right side.\n"
			"\t\tfifo_addr <= fifo_addr + 1;\n"
"\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_ce)\n"
		"\t\tfifo_left[fifo_addr] <= { r_sum_r, r_sum_i };\n"
"\n"
	"\twire\tsigned\t[(CWIDTH-1):0]	ir_coef_r, ir_coef_i;\n"
	"\tassign\tir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n"
	"\tassign\tir_coef_i = r_coef_2[(CWIDTH-1):0];\n"
	"\twire\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\tp_one, p_two, p_three;\n"
"\n"
"\n");
	fprintf(fp,
	"\t// Multiply output is always a width of the sum of the widths of\n"
	"\t// the two inputs.  ALWAYS.  This is independent of the number of\n"
	"\t// bits in p_one, p_two, or p_three.  These values needed to\n"
	"\t// accumulate a bit (or two) each.  However, this approach to a\n"
	"\t// three multiply complex multiply cannot increase the total\n"
	"\t// number of bits in our final output.  We\'ll take care of\n"
	"\t// dropping back down to the proper width, OWIDTH, in our routine\n"
	"\t// below.\n"
"\n"
"\n");
	fprintf(fp,
	"\t// We accomplish here \"Karatsuba\" multiplication.  That is,\n"
	"\t// by doing three multiplies we accomplish the work of four.\n"
	"\t// Let\'s prove to ourselves that this works ... We wish to\n"
	"\t// multiply: (a+jb) * (c+jd), where a+jb is given by\n"
	"\t//\ta + jb = r_dif_r + j r_dif_i, and\n"
	"\t//\tc + jd = ir_coef_r + j ir_coef_i.\n"
	"\t// We do this by calculating the intermediate products P1, P2,\n"
	"\t// and P3 as\n"
	"\t//\tP1 = ac\n"
	"\t//\tP2 = bd\n"
	"\t//\tP3 = (a + b) * (c + d)\n"
	"\t// and then complete our final answer with\n"
	"\t//\tac - bd = P1 - P2 (this checks)\n"
	"\t//\tad + bc = P3 - P2 - P1\n"
	"\t//\t        = (ac + bc + ad + bd) - bd - ac\n"
	"\t//\t        = bc + ad (this checks)\n"
"\n"
"\n");
	fprintf(fp,
	"\t// This should really be based upon an IF, such as in\n"
	"\t// if (IWIDTH < CWIDTH) then ...\n"
	"\t// However, this is the only (other) way I know to do it.\n"
	"\tgenerate if (CKPCE <= 1)\n"
	"\tbegin\n"
"\n"
		"\t\twire\t[(CWIDTH):0]\tp3c_in;\n"
		"\t\twire\t[(IWIDTH+1):0]\tp3d_in;\n"
		"\t\tassign\tp3c_in = ir_coef_i + ir_coef_r;\n"
		"\t\tassign\tp3d_in = r_dif_r + r_dif_i;\n"
		"\n"
		"\t\t// We need to pad these first two multiplies by an extra\n"
		"\t\t// bit just to keep them aligned with the third,\n"
		"\t\t// simpler, multiply.\n"
		"\t\tlongbimpy #(CWIDTH+1,IWIDTH+2) p1(i_clk, i_ce,\n"
				"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"
				"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r}, p_one");
		if (formal_property_flag) fprintf(fp,
"\n`ifdef\tFORMAL\n"
				"\t\t\t\t, fp_one_ic, fp_one_id\n"
"`endif\n"
			"\t\t\t");
		fprintf(fp, ");\n"
		"\t\tlongbimpy #(CWIDTH+1,IWIDTH+2) p2(i_clk, i_ce,\n"
				"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i},\n"
				"\t\t\t\t{r_dif_i[IWIDTH],r_dif_i}, p_two");
		if (formal_property_flag) fprintf(fp,
"\n`ifdef\tFORMAL\n"
				"\t\t\t\t, fp_two_ic, fp_two_id\n"
"`endif\n"
			"\t\t\t");
		fprintf(fp, ");\n"
		"\t\tlongbimpy #(CWIDTH+1,IWIDTH+2) p3(i_clk, i_ce,\n"
			"\t\t\t\tp3c_in, p3d_in, p_three");
		if (formal_property_flag) fprintf(fp,
"\n`ifdef\tFORMAL\n"
				"\t\t\t\t, fp_three_ic, fp_three_id\n"
"`endif\n"
			"\t\t\t");
		fprintf(fp, ");\n"
"\n");
 
	///////////////////////////////////////////
	///
	///	Two clocks per CE, so CE, no-ce, CE, no-ce, etc
	///
	fprintf(fp,
	"\tend else if (CKPCE == 2)\n"
	"\tbegin : CKPCE_TWO\n"
		"\t\t// Coefficient multiply inputs\n"
		"\t\treg		[2*(CWIDTH)-1:0]	mpy_pipe_c;\n"
		"\t\t// Data multiply inputs\n"
		"\t\treg		[2*(IWIDTH+1)-1:0]	mpy_pipe_d;\n"
		"\t\twire	signed	[(CWIDTH-1):0]	mpy_pipe_vc;\n"
		"\t\twire	signed	[(IWIDTH):0]	mpy_pipe_vd;\n"
		"\t\t//\n"
		"\t\treg	signed	[(CWIDTH+1)-1:0]	mpy_cof_sum;\n"
		"\t\treg	signed	[(IWIDTH+2)-1:0]	mpy_dif_sum;\n"
"\n"
		"\t\tassign	mpy_pipe_vc =  mpy_pipe_c[2*(CWIDTH)-1:CWIDTH];\n"
		"\t\tassign	mpy_pipe_vd =  mpy_pipe_d[2*(IWIDTH+1)-1:IWIDTH+1];\n"
"\n"
		"\t\treg			mpy_pipe_v;\n"
		"\t\treg			ce_phase;\n"
"\n"
		"\t\treg	signed	[(CWIDTH+IWIDTH+3)-1:0]	mpy_pipe_out;\n"
		"\t\treg	signed [IWIDTH+CWIDTH+3-1:0]	longmpy;\n"
"\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
		"\t\twire	[CWIDTH:0]	f_past_ic;\n"
		"\t\twire	[IWIDTH+1:0]	f_past_id;\n"
		"\t\twire	[CWIDTH:0]	f_past_mux_ic;\n"
		"\t\twire	[IWIDTH+1:0]	f_past_mux_id;\n"
"\n"
		"\t\treg	[CWIDTH:0]	f_rpone_ic, f_rptwo_ic, f_rpthree_ic,\n"
					"\t\t\t\t\tf_rp2one_ic, f_rp2two_ic, f_rp2three_ic;\n"
		"\t\treg	[IWIDTH+1:0]	f_rpone_id, f_rptwo_id, f_rpthree_id,\n"
					"\t\t\t\t\tf_rp2one_id, f_rp2two_id, f_rp2three_id;\n"
"`endif\n\n");
 
		fprintf(fp,
"\n"
		"\t\tinitial	ce_phase = 1'b0;\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_reset)\n"
			"\t\t\tce_phase <= 1'b0;\n"
		"\t\telse if (i_ce)\n"
			"\t\t\tce_phase <= 1'b1;\n"
		"\t\telse\n"
			"\t\t\tce_phase <= 1'b0;\n"
"\n"
		"\t\talways @(*)\n"
			"\t\t\tmpy_pipe_v = (i_ce)||(ce_phase);\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (ce_phase)\n"
		"\t\tbegin\n"
			"\t\t\tmpy_pipe_c[2*CWIDTH-1:0] <=\n"
				"\t\t\t\t\t{ ir_coef_r, ir_coef_i };\n"
			"\t\t\tmpy_pipe_d[2*(IWIDTH+1)-1:0] <=\n"
				"\t\t\t\t\t{ r_dif_r, r_dif_i };\n"
"\n"
			"\t\t\tmpy_cof_sum  <= ir_coef_i + ir_coef_r;\n"
			"\t\t\tmpy_dif_sum <= r_dif_r + r_dif_i;\n"
"\n"
		"\t\tend else if (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\tmpy_pipe_c[2*(CWIDTH)-1:0] <= {\n"
				"\t\t\t\tmpy_pipe_c[(CWIDTH)-1:0], {(CWIDTH){1'b0}} };\n"
			"\t\t\tmpy_pipe_d[2*(IWIDTH+1)-1:0] <= {\n"
				"\t\t\t\tmpy_pipe_d[(IWIDTH+1)-1:0], {(IWIDTH+1){1'b0}} };\n"
		"\t\tend\n"
"\n");
	fprintf(fp,
		"\t\tlongbimpy #(CWIDTH+1,IWIDTH+2) mpy0(i_clk, mpy_pipe_v,\n"
			"\t\t\t\tmpy_cof_sum, mpy_dif_sum, longmpy\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\t\t, f_past_ic, f_past_id\n"
"`endif\n");
	fprintf(fp,"\t\t\t);\n"
"\n");
 
	fprintf(fp,
		"\t\tlongbimpy #(CWIDTH+1,IWIDTH+2) mpy1(i_clk, mpy_pipe_v,\n"
			"\t\t\t\t{ mpy_pipe_vc[CWIDTH-1], mpy_pipe_vc },\n"
			"\t\t\t\t{ mpy_pipe_vd[IWIDTH  ], mpy_pipe_vd },\n"
			"\t\t\t\tmpy_pipe_out\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\t\t, f_past_mux_ic, f_past_mux_id\n"
"`endif\n");
	fprintf(fp,"\t\t\t);\n"
"\n");
 
	fprintf(fp,
		"\t\treg\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\n"
			"\t\t\t\t\trp_one, rp_two, rp_three,\n"
			"\t\t\t\t\trp2_one, rp2_two, rp2_three;\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (((i_ce)&&(!MPYDELAY[0]))\n"
		"\t\t\t||((ce_phase)&&(MPYDELAY[0])))\n"
		"\t\tbegin\n"
			"\t\t\trp_one <= mpy_pipe_out;\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\tf_rpone_ic <= f_past_mux_ic;\n"
			"\t\t\tf_rpone_id <= f_past_mux_id;\n"
"`endif\n");
		fprintf(fp,
		"\t\tend\n\n");
		fprintf(fp,
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (((i_ce)&&(MPYDELAY[0]))\n"
		"\t\t\t||((ce_phase)&&(!MPYDELAY[0])))\n"
		"\t\tbegin\n"
			"\t\t\trp_two <= mpy_pipe_out;\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\tf_rptwo_ic <= f_past_mux_ic;\n"
			"\t\t\tf_rptwo_id <= f_past_mux_id;\n"
"`endif\n");
		fprintf(fp,
		"\t\tend\n\n");
		fprintf(fp,
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\trp_three <= longmpy;\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\tf_rpthree_ic <= f_past_ic;\n"
			"\t\t\tf_rpthree_id <= f_past_id;\n"
"`endif\n");
		fprintf(fp,
		"\t\tend\n"
"\n\n");
 
 
		fprintf(fp,
		"\t\t// Our outputs *MUST* be set on a clock where i_ce is\n"
		"\t\t// true for the following logic to work.  Make that\n"
		"\t\t// happen here.\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\trp2_one<= rp_one;\n"
			"\t\t\trp2_two <= rp_two;\n"
			"\t\t\trp2_three<= rp_three;\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\tf_rp2one_ic <= f_rpone_ic;\n"
			"\t\t\tf_rp2one_id <= f_rpone_id;\n"
"\n"
 
			"\t\t\tf_rp2two_ic <= f_rptwo_ic;\n"
			"\t\t\tf_rp2two_id <= f_rptwo_id;\n"
"\n"
 
			"\t\t\tf_rp2three_ic <= f_rpthree_ic;\n"
			"\t\t\tf_rp2three_id <= f_rpthree_id;\n"
"`endif\n");
		fprintf(fp,
		"\t\tend\n"
"\n"
		"\t\tassign	p_one	= rp2_one;\n"
		"\t\tassign	p_two	= (!MPYDELAY[0])? rp2_two  : rp_two;\n"
		"\t\tassign	p_three	= ( MPYDELAY[0])? rp_three : rp2_three;\n"
"\n"
		"\t\t// verilator lint_off UNUSED\n"
		"\t\twire\t[2*(IWIDTH+CWIDTH+3)-1:0]\tunused;\n"
		"\t\tassign\tunused = { rp2_two, rp2_three };\n"
		"\t\t// verilator lint_on  UNUSED\n"
"\n");
		if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
		"\t\tassign fp_one_ic = f_rp2one_ic;\n"
		"\t\tassign fp_one_id = f_rp2one_id;\n"
"\n"
		"\t\tassign fp_two_ic = (!MPYDELAY[0])? f_rp2two_ic : f_rptwo_ic;\n"
		"\t\tassign fp_two_id = (!MPYDELAY[0])? f_rp2two_id : f_rptwo_id;\n"
"\n"
		"\t\tassign fp_three_ic= (MPYDELAY[0])? f_rpthree_ic : f_rp2three_ic;\n"
		"\t\tassign fp_three_id= (MPYDELAY[0])? f_rpthree_id : f_rp2three_id;\n"
"`endif\n\n");
 
 
	/////////////////////////
	///
	///	Three clock per CE, so CE, no-ce, no-ce*, CE
	///
	fprintf(fp,
"\tend else if (CKPCE <= 3)\n\tbegin : CKPCE_THREE\n");
 
	fprintf(fp,
	"\t\t// Coefficient multiply inputs\n"
	"\t\treg\t\t[3*(CWIDTH+1)-1:0]\tmpy_pipe_c;\n"
	"\t\t// Data multiply inputs\n"
	"\t\treg\t\t[3*(IWIDTH+2)-1:0]\tmpy_pipe_d;\n"
	"\t\twire\tsigned	[(CWIDTH):0]	mpy_pipe_vc;\n"
	"\t\twire\tsigned	[(IWIDTH+1):0]	mpy_pipe_vd;\n"
	"\n"
	"\t\tassign\tmpy_pipe_vc =  mpy_pipe_c[3*(CWIDTH+1)-1:2*(CWIDTH+1)];\n"
	"\t\tassign\tmpy_pipe_vd =  mpy_pipe_d[3*(IWIDTH+2)-1:2*(IWIDTH+2)];\n"
	"\n"
	"\t\treg\t\t\tmpy_pipe_v;\n"
	"\t\treg\t\t[2:0]\tce_phase;\n"
	"\n"
	"\t\treg\tsigned	[  (CWIDTH+IWIDTH+3)-1:0]	mpy_pipe_out;\n"
"\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
		"\t\twire\t[CWIDTH:0]	f_past_ic;\n"
		"\t\twire\t[IWIDTH+1:0]	f_past_id;\n"
"\n"
		"\t\treg\t[CWIDTH:0]	f_rpone_ic, f_rptwo_ic, f_rpthree_ic,\n"
					"\t\t\t\t\tf_rp2one_ic, f_rp2two_ic, f_rp2three_ic,\n"
					"\t\t\t\t\tf_rp3one_ic;\n"
		"\t\treg\t[IWIDTH+1:0]	f_rpone_id, f_rptwo_id, f_rpthree_id,\n"
					"\t\t\t\t\tf_rp2one_id, f_rp2two_id, f_rp2three_id,\n"
					"\t\t\t\t\tf_rp3one_id;\n"
"`endif\n"
"\n");
 
	fprintf(fp,
	"\t\tinitial\tce_phase = 3'b011;\n"
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (i_reset)\n"
		"\t\t\tce_phase <= 3'b011;\n"
	"\t\telse if (i_ce)\n"
		"\t\t\tce_phase <= 3'b000;\n"
	"\t\telse if (ce_phase != 3'b011)\n"
		"\t\t\tce_phase <= ce_phase + 1'b1;\n"
"\n"
	"\t\talways @(*)\n"
		"\t\t\tmpy_pipe_v = (i_ce)||(ce_phase < 3'b010);\n"
"\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (ce_phase == 3\'b000)\n"
	"\t\tbegin\n"
		"\t\t\t// Second clock\n"
		"\t\t\tmpy_pipe_c[3*(CWIDTH+1)-1:(CWIDTH+1)] <= {\n"
		"\t\t\t\tir_coef_r[CWIDTH-1], ir_coef_r,\n"
		"\t\t\t\tir_coef_i[CWIDTH-1], ir_coef_i };\n"
		"\t\t\tmpy_pipe_c[CWIDTH:0] <= ir_coef_i + ir_coef_r;\n"
		"\t\t\tmpy_pipe_d[3*(IWIDTH+2)-1:(IWIDTH+2)] <= {\n"
		"\t\t\t\tr_dif_r[IWIDTH], r_dif_r,\n"
		"\t\t\t\tr_dif_i[IWIDTH], r_dif_i };\n"
		"\t\t\tmpy_pipe_d[(IWIDTH+2)-1:0] <= r_dif_r + r_dif_i;\n"
"\n"
	"\t\tend else if (mpy_pipe_v)\n"
	"\t\tbegin\n"
		"\t\t\tmpy_pipe_c[3*(CWIDTH+1)-1:0] <= {\n"
		"\t\t\t\tmpy_pipe_c[2*(CWIDTH+1)-1:0], {(CWIDTH+1){1\'b0}} };\n"
		"\t\t\tmpy_pipe_d[3*(IWIDTH+2)-1:0] <= {\n"
		"\t\t\t\tmpy_pipe_d[2*(IWIDTH+2)-1:0], {(IWIDTH+2){1\'b0}} };\n"
	"\t\tend\n"
"\n");
	fprintf(fp,
		"\t\tlongbimpy #(CWIDTH+1,IWIDTH+2) mpy(i_clk, mpy_pipe_v,\n"
			"\t\t\t\tmpy_pipe_vc, mpy_pipe_vd, mpy_pipe_out\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\t\t, f_past_ic, f_past_id\n"
"`endif\n");
	fprintf(fp,
		"\t\t\t);\n"
"\n");
 
	fprintf(fp,
	"\t\treg\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\n"
				"\t\t\t\trp_one,  rp_two,  rp_three,\n"
				"\t\t\t\trp2_one, rp2_two, rp2_three,\n"
				"\t\t\t\trp3_one;\n"
"\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (MPYREMAINDER == 0)\n"
	"\t\tbegin\n\n"
	"\t\t	if (i_ce)\n"
	"\t\t	begin\n"
	"\t\t		rp_two   <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rptwo_ic <= f_past_ic;\n"
	"\t\t		f_rptwo_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end else if (ce_phase == 3'b000)\n"
	"\t\t	begin\n"
	"\t\t		rp_three <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rpthree_ic <= f_past_ic;\n"
	"\t\t		f_rpthree_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end else if (ce_phase == 3'b001)\n"
	"\t\t	begin\n"
	"\t\t		rp_one   <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rpone_ic <= f_past_ic;\n"
	"\t\t		f_rpone_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end\n"
	"\t\tend else if (MPYREMAINDER == 1)\n"
	"\t\tbegin\n\n"
	"\t\t	if (i_ce)\n"
	"\t\t	begin\n"
	"\t\t		rp_one   <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rpone_ic <= f_past_ic;\n"
	"\t\t		f_rpone_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end else if (ce_phase == 3'b000)\n"
	"\t\t	begin\n"
	"\t\t		rp_two   <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rptwo_ic <= f_past_ic;\n"
	"\t\t		f_rptwo_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end else if (ce_phase == 3'b001)\n"
	"\t\t	begin\n"
	"\t\t		rp_three <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rpthree_ic <= f_past_ic;\n"
	"\t\t		f_rpthree_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end\n"
	"\t\tend else // if (MPYREMAINDER == 2)\n"
	"\t\tbegin\n\n"
	"\t\t	if (i_ce)\n"
	"\t\t	begin\n"
	"\t\t		rp_three <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rpthree_ic <= f_past_ic;\n"
	"\t\t		f_rpthree_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end else if (ce_phase == 3'b000)\n"
	"\t\t	begin\n"
	"\t\t		rp_one   <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rpone_ic <= f_past_ic;\n"
	"\t\t		f_rpone_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end else if (ce_phase == 3'b001)\n"
	"\t\t	begin\n"
	"\t\t		rp_two   <= mpy_pipe_out;\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
	"\t\t		f_rptwo_ic <= f_past_ic;\n"
	"\t\t		f_rptwo_id <= f_past_id;\n"
"`endif\n");
	fprintf(fp,
	"\t\t	end\n"
	"\t\tend\n\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (i_ce)\n"
	"\t\tbegin\n"
		"\t\t\trp2_one   <= rp_one;\n"
		"\t\t\trp2_two   <= rp_two;\n"
		"\t\t\trp2_three <= (MPYREMAINDER == 2) ? mpy_pipe_out : rp_three;\n"
		"\t\t\trp3_one   <= (MPYREMAINDER == 0) ? rp2_one : rp_one;\n");
 
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
			"\t\t\tf_rp2one_ic <= f_rpone_ic;\n"
			"\t\t\tf_rp2one_id <= f_rpone_id;\n"
"\n"
			"\t\t\tf_rp2two_ic <= f_rptwo_ic;\n"
			"\t\t\tf_rp2two_id <= f_rptwo_id;\n"
"\n"
			"\t\t\tf_rp2three_ic <= (MPYREMAINDER==2) ? f_past_ic : f_rpthree_ic;\n"
			"\t\t\tf_rp2three_id <= (MPYREMAINDER==2) ? f_past_id : f_rpthree_id;\n"
			"\t\t\tf_rp3one_ic <= (MPYREMAINDER==0) ? f_rp2one_ic : f_rpone_ic;\n"
			"\t\t\tf_rp3one_id <= (MPYREMAINDER==0) ? f_rp2one_id : f_rpone_id;\n"
"`endif\n");
 
 
	fprintf(fp,
		"\t\tend\n"
"\n"
	"\t\tassign\tp_one   = rp3_one;\n"
	"\t\tassign\tp_two   = rp2_two;\n"
	"\t\tassign\tp_three = rp2_three;\n"
"\n");
	if (formal_property_flag) fprintf(fp,
"`ifdef	FORMAL\n"
		"\t\tassign	fp_one_ic = f_rp3one_ic;\n"
		"\t\tassign	fp_one_id = f_rp3one_id;\n"
"\n"
		"\t\tassign	fp_two_ic = f_rp2two_ic;\n"
		"\t\tassign	fp_two_id = f_rp2two_id;\n"
"\n"
		"\t\tassign	fp_three_ic = f_rp2three_ic;\n"
		"\t\tassign	fp_three_id = f_rp2three_id;\n"
"`endif\n"
"\n");
 
	fprintf(fp,
"\tend endgenerate\n");
 
	fprintf(fp,
	"\t// These values are held in memory and delayed during the\n"
	"\t// multiply.  Here, we recover them.  During the multiply,\n"
	"\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n"
	"\t// therefore, the left_x values need to be right shifted by\n"
	"\t// CWIDTH-2 as well.  The additional bits come from a sign\n"
	"\t// extension.\n"
	"\twire\tsigned\t[(IWIDTH+CWIDTH):0]	fifo_i, fifo_r;\n"
	"\treg\t\t[(2*IWIDTH+1):0]	fifo_read;\n"
	"\tassign\tfifo_r = { {2{fifo_read[2*(IWIDTH+1)-1]}},\n"
		"\t\tfifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1\'b0}} };\n"
	"\tassign\tfifo_i = { {2{fifo_read[(IWIDTH+1)-1]}},\n"
		"\t\tfifo_read[((IWIDTH+1)-1):0], {(CWIDTH-2){1\'b0}} };\n"
"\n"
"\n"
	"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0]	mpy_r, mpy_i;\n"
"\n");
	fprintf(fp,
	"\t// Let's do some rounding and remove unnecessary bits.\n"
	"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to\n"
	"\t// OWIDTH, and SHIFT by SHIFT bits in the process.  The trick is\n"
	"\t// that we don\'t need (IWIDTH+CWIDTH+3) bits.  We\'ve accumulated\n"
	"\t// them, but the actual values will never fill all these bits.\n"
	"\t// In particular, we only need:\n"
	"\t//\t IWIDTH bits for the input\n"
	"\t//\t     +1 bit for the add/subtract\n"
	"\t//\t+CWIDTH bits for the coefficient multiply\n"
	"\t//\t     +1 bit for the add/subtract in the complex multiply\n"
	"\t//\t ------\n"
	"\t//\t (IWIDTH+CWIDTH+2) bits at full precision.\n"
	"\t//\n"
	"\t// However, the coefficient multiply multiplied by a maximum value\n"
	"\t// of 2^(CWIDTH-2).  Thus, we only have\n"
	"\t//\t   IWIDTH bits for the input\n"
	"\t//\t       +1 bit for the add/subtract\n"
	"\t//\t+CWIDTH-2 bits for the coefficient multiply\n"
	"\t//\t       +1 (optional) bit for the add/subtract in the cpx mpy.\n"
	"\t//\t -------- ... multiply.  (This last bit may be shifted out.)\n"
	"\t//\t (IWIDTH+CWIDTH) valid output bits.\n"
	"\t// Now, if the user wants to keep any extras of these (via OWIDTH),\n"
	"\t// or if he wishes to arbitrarily shift some of these off (via\n"
	"\t// SHIFT) we accomplish that here.\n"
"\n");
	fprintf(fp,
	"\twire\tsigned\t[(OWIDTH-1):0]\trnd_left_r, rnd_left_i, rnd_right_r, rnd_right_i;\n\n");
 
	fprintf(fp,
	"\twire\tsigned\t[(CWIDTH+IWIDTH+3-1):0]\tleft_sr, left_si;\n"
	"\tassign	left_sr = { {(2){fifo_r[(IWIDTH+CWIDTH)]}}, fifo_r };\n"
	"\tassign	left_si = { {(2){fifo_i[(IWIDTH+CWIDTH)]}}, fifo_i };\n\n");
 
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_left_r(i_clk, i_ce,\n"
	"\t\t\t\tleft_sr, rnd_left_r);\n\n",
		rnd_string);
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_left_i(i_clk, i_ce,\n"
	"\t\t\t\tleft_si, rnd_left_i);\n\n",
		rnd_string);
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_r(i_clk, i_ce,\n"
	"\t\t\t\tmpy_r, rnd_right_r);\n\n", rnd_string);
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_i(i_clk, i_ce,\n"
	"\t\t\t\tmpy_i, rnd_right_i);\n\n", rnd_string);
	fprintf(fp,
	"\talways @(posedge i_clk)\n"
	"\tif (i_ce)\n"
	"\tbegin\n"
		"\t\t// First clock, recover all values\n"
		"\t\tfifo_read <= fifo_left[fifo_read_addr];\n"
		"\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"
		"\t\t// although they only need to be (IWIDTH+1)\n"
		"\t\t// + (CWIDTH) bits wide.  (We\'ve got two\n"
		"\t\t// extra bits we need to get rid of.)\n"
		"\t\tmpy_r <= p_one - p_two;\n"
		"\t\tmpy_i <= p_three - p_one - p_two;\n"
	"\tend\n"
"\n");
 
	fprintf(fp,
	"\treg\t[(AUXLEN-1):0]\taux_pipeline;\n"
	"\tinitial\taux_pipeline = 0;\n");
	if (async_reset)
		fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\tif (!i_areset_n)\n");
	else
		fprintf(fp, "\talways @(posedge i_clk)\n\tif (i_reset)\n");
	fprintf(fp,
	"\t\taux_pipeline <= 0;\n"
	"\telse if (i_ce)\n"
	"\t\taux_pipeline <= { aux_pipeline[(AUXLEN-2):0], i_aux };\n"
"\n");
	fprintf(fp,
	"\tinitial o_aux = 1\'b0;\n");
	if (async_reset)
		fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\tif (!i_areset_n)\n");
	else
		fprintf(fp, "\talways @(posedge i_clk)\n\tif (i_reset)\n");
	fprintf(fp,
		"\t\to_aux <= 1\'b0;\n"
		"\telse if (i_ce)\n"
		"\tbegin\n"
			"\t\t// Second clock, latch for final clock\n"
			"\t\to_aux <= aux_pipeline[AUXLEN-1];\n"
		"\tend\n"
"\n");
 
	fprintf(fp,
	"\t// As a final step, we pack our outputs into two packed two\'s\n"
	"\t// complement numbers per output word, so that each output word\n"
	"\t// has (2*OWIDTH) bits in it, with the top half being the real\n"
	"\t// portion and the bottom half being the imaginary portion.\n"
	"\tassign	o_left = { rnd_left_r, rnd_left_i };\n"
	"\tassign	o_right= { rnd_right_r,rnd_right_i};\n"
"\n");
 
	fprintf(fp,
"`ifdef	FORMAL\n");
	if (formal_property_flag) {
		fprintf(fp,
	"\tinitial\tf_dlyaux[0] = 0;\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_reset)\n"
		"\t\tf_dlyaux\t<= 0;\n"
	"\telse if (i_ce)\n"
		"\t\tf_dlyaux\t<= { f_dlyaux[F_DEPTH-2:0], i_aux };\n"
"\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_ce)\n"
	"\tbegin\n"
	"\t	f_dlyleft_r[0]   <= i_left[ (2*IWIDTH-1):IWIDTH];\n"
	"\t	f_dlyleft_i[0]   <= i_left[ (  IWIDTH-1):0];\n"
	"\t	f_dlyright_r[0]  <= i_right[(2*IWIDTH-1):IWIDTH];\n"
	"\t	f_dlyright_i[0]  <= i_right[(  IWIDTH-1):0];\n"
	"\t	f_dlycoeff_r[0]  <= i_coef[ (2*CWIDTH-1):CWIDTH];\n"
	"\t	f_dlycoeff_i[0]  <= i_coef[ (  CWIDTH-1):0];\n"
	"\tend\n"
"\n"
	"\tgenvar	k;\n"
	"\tgenerate for(k=1; k<F_DEPTH; k=k+1)\n"
	"\tbegin : F_PROPAGATE_DELAY_LINES\n"
"\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\tbegin\n"
		"\t\t	f_dlyleft_r[k]  <= f_dlyleft_r[ k-1];\n"
		"\t\t	f_dlyleft_i[k]  <= f_dlyleft_i[ k-1];\n"
		"\t\t	f_dlyright_r[k] <= f_dlyright_r[k-1];\n"
		"\t\t	f_dlyright_i[k] <= f_dlyright_i[k-1];\n"
		"\t\t	f_dlycoeff_r[k] <= f_dlycoeff_r[k-1];\n"
		"\t\t	f_dlycoeff_i[k] <= f_dlycoeff_i[k-1];\n"
		"\t\tend\n"
"\n"
	"\tend endgenerate\n"
"\n"
"`ifndef VERILATOR\n"
	"\t//\n"
	"\t// Make some i_ce restraining assumptions.  These are necessary\n"
	"\t// to get the design to pass induction.\n"
	"\t//\n"
	"\tgenerate if (CKPCE <= 1)\n"
	"\tbegin\n"
"\n"
		"\t\t// No primary i_ce assumption.  i_ce can be anything\n"
		"\t\t//\n"
		"\t\t// First induction i_ce assumption: No more than one\n"
		"\t\t// empty cycle between used cycles.  Without this\n"
		"\t\t// assumption, or one like it, induction would never\n"
		"\t\t// complete.\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif ((!$past(i_ce)))\n"
			"\t\t\tassume(i_ce);\n"
"\n"
		"\t\t// Second induction i_ce assumption: avoid skipping an\n"
		"\t\t// i_ce and thus stretching out the i_ce cycle two i_ce\n"
		"\t\t// cycles in a row.  Without this assumption, induction\n"
		"\t\t// would still complete, it would just take longer\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (($past(i_ce))&&(!$past(i_ce,2)))\n"
			"\t\t\tassume(i_ce);\n"
"\n"
	"\tend else if (CKPCE == 2)\n"
	"\tbegin : F_CKPCE_TWO\n"
"\n"
		"\t\t// Primary i_ce assumption: Every i_ce cycle is followed\n"
		"\t\t// by a non-i_ce cycle, so the multiplies can be\n"
		"\t\t// multiplexed\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif ($past(i_ce))\n"
			"\t\t\tassume(!i_ce);\n"
 
		"\t\t// First induction assumption: Don't let this stretch\n"
		"\t\t// out too far.  This is necessary to pass induction\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif ((!$past(i_ce))&&(!$past(i_ce,2)))\n"
			"\t\t\tassume(i_ce);\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif ((!$past(i_ce))&&($past(i_ce,2))\n"
			"\t\t\t\t&&(!$past(i_ce,3))&&(!$past(i_ce,4)))\n"
			"\t\t\tassume(i_ce);\n"
"\n"
	"\tend else if (CKPCE == 3)\n"
	"\tbegin : F_CKPCE_THREE\n"
"\n"
		"\t\t// Primary i_ce assumption: Following any i_ce cycle,\n"
		"\t\t// there must be two clock cycles with i_ce de-asserted\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (($past(i_ce))||($past(i_ce,2)))\n"
			"\t\t\tassume(!i_ce);\n"
"\n"
		"\t\t// Induction assumption: Allow i_ce's every third or\n"
		"\t\t// fourth clock, but don't allow them to be separated\n"
		"\t\t// further than that\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif ((!$past(i_ce))&&(!$past(i_ce,2))&&(!$past(i_ce,3)))\n"
			"\t\t\tassume(i_ce);\n"
"\n"
		"\t\t// Second induction assumption, to speed up the proof:\n"
		"\t\t// If it's the earliest possible opportunity for an\n"
		"\t\t// i_ce, and the last i_ce was late, don't let this one\n"
		"\t\t// be late as well.\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif ((!$past(i_ce))&&(!$past(i_ce,2))\n"
			"\t\t\t&&($past(i_ce,3))&&(!$past(i_ce,4))\n"
			"\t\t\t&&(!$past(i_ce,5))&&(!$past(i_ce,6)))\n"
			"\t\t\tassume(i_ce);\n"
"\n"
	"\tend endgenerate\n"
"`endif\n"
"\n"
	"\treg	[F_LGDEPTH:0]	f_startup_counter;\n"
	"\tinitial	f_startup_counter = 0;\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_reset)\n"
	"\t	f_startup_counter <= 0;\n"
	"\telse if ((i_ce)&&(!(&f_startup_counter)))\n"
	"\t	f_startup_counter <= f_startup_counter + 1;\n"
"\n"
	"\talways @(*)\n"
	"\tbegin\n"
	"\t	f_sumr = f_dlyleft_r[F_D] + f_dlyright_r[F_D];\n"
	"\t	f_sumi = f_dlyleft_i[F_D] + f_dlyright_i[F_D];\n"
	"\tend\n"
"\n"
	"\tassign\tf_sumrx = { {(4){f_sumr[IWIDTH]}}, f_sumr, {(CWIDTH-2){1'b0}} };\n"
	"\tassign\tf_sumix = { {(4){f_sumi[IWIDTH]}}, f_sumi, {(CWIDTH-2){1'b0}} };\n"
"\n"
	"\talways @(*)\n"
	"\tbegin\n"
	"\t	f_difr = f_dlyleft_r[F_D] - f_dlyright_r[F_D];\n"
	"\t	f_difi = f_dlyleft_i[F_D] - f_dlyright_i[F_D];\n"
	"\tend\n"
"\n"
	"\tassign\tf_difrx = { {(CWIDTH+2){f_difr[IWIDTH]}}, f_difr };\n"
	"\tassign\tf_difix = { {(CWIDTH+2){f_difi[IWIDTH]}}, f_difi };\n"
"\n"
	"\tassign\tf_widecoeff_r ={ {(IWIDTH+3){f_dlycoeff_r[F_D][CWIDTH-1]}},\n"
					"\t\t\t\t\t\tf_dlycoeff_r[F_D] };\n"
	"\tassign\tf_widecoeff_i ={ {(IWIDTH+3){f_dlycoeff_i[F_D][CWIDTH-1]}},\n"
					"\t\t\t\t\t\tf_dlycoeff_i[F_D] };\n"
"\n"
	"\talways @(posedge i_clk)\n"
	"\tif (f_startup_counter > {1'b0, F_D})\n"
	"\tbegin\n"
	"\t	assert(aux_pipeline == f_dlyaux);\n"
	"\t	assert(left_sr == f_sumrx);\n"
	"\t	assert(left_si == f_sumix);\n"
	"\t	assert(aux_pipeline[AUXLEN-1] == f_dlyaux[F_D]);\n"
"\n"
	"\t	if ((f_difr == 0)&&(f_difi == 0))\n"
	"\t	begin\n"
	"\t		assert(mpy_r == 0);\n"
	"\t		assert(mpy_i == 0);\n"
	"\t	end else if ((f_dlycoeff_r[F_D] == 0)\n"
	"\t			&&(f_dlycoeff_i[F_D] == 0))\n"
	"\t	begin\n"
	"\t		assert(mpy_r == 0);\n"
	"\t		assert(mpy_i == 0);\n"
	"\t	end\n"
"\n"
	"\t	if ((f_dlycoeff_r[F_D] == 1)&&(f_dlycoeff_i[F_D] == 0))\n"
	"\t	begin\n"
	"\t		assert(mpy_r == f_difrx);\n"
	"\t		assert(mpy_i == f_difix);\n"
	"\t	end\n"
"\n"
	"\t	if ((f_dlycoeff_r[F_D] == 0)&&(f_dlycoeff_i[F_D] == 1))\n"
	"\t	begin\n"
	"\t		assert(mpy_r == -f_difix);\n"
	"\t		assert(mpy_i ==  f_difrx);\n"
	"\t	end\n"
"\n"
	"\t	if ((f_difr == 1)&&(f_difi == 0))\n"
	"\t	begin\n"
	"\t		assert(mpy_r == f_widecoeff_r);\n"
	"\t		assert(mpy_i == f_widecoeff_i);\n"
	"\t	end\n"
"\n"
	"\t	if ((f_difr == 0)&&(f_difi == 1))\n"
	"\t	begin\n"
	"\t		assert(mpy_r == -f_widecoeff_i);\n"
	"\t		assert(mpy_i ==  f_widecoeff_r);\n"
	"\t	end\n"
	"\tend\n"
"\n");
 
		fprintf(fp,
	"\t// Let's see if we can improve our performance at all by\n"
	"\t// moving our test one clock earlier.  If nothing else, it should\n"
	"\t// help induction finish one (or more) clocks ealier than\n"
	"\t// otherwise\n"
"\n\n"
	"\talways @(*)\n"
	"\tbegin\n"
		"\t\tf_predifr = f_dlyleft_r[F_D-1] - f_dlyright_r[F_D-1];\n"
		"\t\tf_predifi = f_dlyleft_i[F_D-1] - f_dlyright_i[F_D-1];\n"
	"\tend\n"
"\n"
	"\tassign	f_predifrx = { {(CWIDTH+2){f_predifr[IWIDTH]}}, f_predifr };\n"
	"\tassign	f_predifix = { {(CWIDTH+2){f_predifi[IWIDTH]}}, f_predifi };\n"
"\n"
	"\talways @(*)\n"
	"\tbegin\n"
		"\t\tf_sumcoef = f_dlycoeff_r[F_D-1] + f_dlycoeff_i[F_D-1];\n"
		"\t\tf_sumdiff = f_predifr + f_predifi;\n"
	"\tend\n"
"\n"
	"\t// Induction helpers\n"
	"\talways @(posedge i_clk)\n"
	"\tif (f_startup_counter >= { 1'b0, F_D })\n"
	"\tbegin\n"
		"\t\tif (f_dlycoeff_r[F_D-1] == 0)\n"
			"\t\t\tassert(p_one == 0);\n"
		"\t\tif (f_dlycoeff_i[F_D-1] == 0)\n"
			"\t\t\tassert(p_two == 0);\n"
"\n"
		"\t\tif (f_dlycoeff_r[F_D-1] == 1)\n"
			"\t\t\tassert(p_one == f_predifrx);\n"
		"\t\tif (f_dlycoeff_i[F_D-1] == 1)\n"
			"\t\t\tassert(p_two == f_predifix);\n"
"\n"
		"\t\tif (f_predifr == 0)\n"
			"\t\t\tassert(p_one == 0);\n"
		"\t\tif (f_predifi == 0)\n"
			"\t\t\tassert(p_two == 0);\n"
"\n"
		"\t\t// verilator lint_off WIDTH\n"
		"\t\tif (f_predifr == 1)\n"
			"\t\t\tassert(p_one == f_dlycoeff_r[F_D-1]);\n"
		"\t\tif (f_predifi == 1)\n"
			"\t\t\tassert(p_two == f_dlycoeff_i[F_D-1]);\n"
		"\t\t// verilator lint_on  WIDTH\n"
"\n"
		"\t\tif (f_sumcoef == 0)\n"
			"\t\t\tassert(p_three == 0);\n"
		"\t\tif (f_sumdiff == 0)\n"
			"\t\t\tassert(p_three == 0);\n"
		"\t\t// verilator lint_off WIDTH\n"
		"\t\tif (f_sumcoef == 1)\n"
			"\t\t\tassert(p_three == f_sumdiff);\n"
		"\t\tif (f_sumdiff == 1)\n"
			"\t\t\tassert(p_three == f_sumcoef);\n"
		"\t\t// verilator lint_on  WIDTH\n"
"`ifdef	VERILATOR\n"
		"\t\t// Check that the multiplies match--but *ONLY* if using\n"
		"\t\t// Verilator, and not if using formal proper\n"
		"\t\tassert(p_one   == f_predifr * f_dlycoeff_r[F_D-1]);\n"
		"\t\tassert(p_two   == f_predifi * f_dlycoeff_i[F_D-1]);\n"
		"\t\tassert(p_three == f_sumdiff * f_sumcoef);\n"
"`endif	// VERILATOR\n"
	"\tend\n\n");
 
		fprintf(fp,
	"\t// The following logic formally insists that our version of the\n"
	"\t// inputs to the multiply matches what the (multiclock) multiply\n"
	"\t// thinks its inputs were.  While this may seem redundant, the\n"
	"\t// proof will not complete in any reasonable amount of time\n"
	"\t// without these assertions.\n"
"\n"
	"\tassign\tf_p3c_in = f_dlycoeff_i[F_D-1] + f_dlycoeff_r[F_D-1];\n"
	"\tassign\tf_p3d_in = f_predifi + f_predifr;\n"
"\n"
	"\talways @(*)\n"
	"\tif (f_startup_counter >= { 1'b0, F_D })\n"
	"\tbegin\n"
		"\t\tassert(fp_one_ic == { f_dlycoeff_r[F_D-1][CWIDTH-1],\n"
				"\t\t\t\tf_dlycoeff_r[F_D-1][CWIDTH-1:0] });\n"
		"\t\tassert(fp_two_ic == { f_dlycoeff_i[F_D-1][CWIDTH-1],\n"
				"\t\t\t\tf_dlycoeff_i[F_D-1][CWIDTH-1:0] });\n"
		"\t\tassert(fp_one_id == { f_predifr[IWIDTH], f_predifr });\n"
		"\t\tassert(fp_two_id == { f_predifi[IWIDTH], f_predifi });\n"
		"\t\tassert(fp_three_ic == f_p3c_in);\n"
		"\t\tassert(fp_three_id == f_p3d_in);\n"
	"\tend\n"
"\n");
 
 
		fprintf(fp,
	"\t// F_CHECK will be set externally by the solver, so that we can\n"
	"\t// double check that the solver is actually testing what we think\n"
	"\t// it is testing.  We'll set it here to MPYREMAINDER, which will\n"
	"\t// essentially eliminate the check--unless overridden by the\n"
	"\t// solver.\n"
	"\tparameter	F_CHECK = MPYREMAINDER;\n"
	"\tinitial	assert(MPYREMAINDER == F_CHECK);\n\n");
 
	} else {
		fprintf(fp, "// Set the formal_property_flag to enable formal\n"
			"// property generation\n");
	}
		fprintf(fp,
"`endif // FORMAL\n");
 
	fprintf(fp,
"endmodule\n");
	fclose(fp);
}
 
void	build_hwbfly(const char *fname, int xtracbits, ROUND_T rounding,
		int ckpce, const bool async_reset) {
	FILE	*fp = fopen(fname, "w");
	if (NULL == fp) {
		fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
		perror("O/S Err was:");
		return;
	}
 
	const	char	*rnd_string;
	if (rounding == RND_TRUNCATE)
		rnd_string = "truncate";
	else if (rounding == RND_FROMZERO)
		rnd_string = "roundfromzero";
	else if (rounding == RND_HALFUP)
		rnd_string = "roundhalfup";
	else
		rnd_string = "convround";
 
	std::string	resetw("i_reset");
	if (async_reset)
		resetw = std::string("i_areset_n");
 
 
	fprintf(fp,
SLASHLINE
"//\n"
"// Filename:\thwbfly.v\n"
"//\n"
"// Project:\t%s\n"
"//\n"
"// Purpose:\tThis routine is identical to the butterfly.v routine found\n"
"//		in 'butterfly.v', save only that it uses the verilog\n"
"//	operator '*' in hopes that the synthesizer would be able to optimize\n"
"//	it with hardware resources.\n"
"//\n"
"//	It is understood that a hardware multiply can complete its operation in\n"
"//	a single clock.\n"
"//\n"
"// Operation:\n"
"//\n"
"//	Given two inputs, A (i_left) and B (i_right), and a complex\n"
"//	coefficient C (i_coeff), return two outputs, O1 and O2, where:\n"
"//\n"
"//		O1 = A + B, and\n"
"//		O2 = (A - B)*C\n"
"//\n"
"//	This operation is commonly known as a Decimation in Frequency (DIF)\n"
"//	Radix-2 Butterfly.\n"
"//	O1 and O2 are rounded before being returned in (o_left) and o_right\n"
"//	to OWIDTH bits.  If SHIFT is one, an extra bit is dropped from these\n"
"//	values during the rounding process.\n"
"//\n"
"//	Further, since these outputs will take some number of clocks to\n"
"//	calculate, we'll pipe a value (i_aux) through the system and return\n"
"//	it with the results (o_aux), so you can synchronize to the outgoing\n"
"//	output stream.\n"
"//\n"
"//\n%s"
"//\n", prjname, creator);
	fprintf(fp, "%s", cpyleft);
	fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
	fprintf(fp,
"module	hwbfly(i_clk, %s, i_ce, i_coef, i_left, i_right, i_aux,\n"
		"\t\to_left, o_right, o_aux);\n"
	"\t// Public changeable parameters ...\n"
	"\t//	- IWIDTH, number of bits in each component of the input\n"
	"\t//	- CWIDTH, number of bits in each component of the twiddle factor\n"
	"\t//	- OWIDTH, number of bits in each component of the output\n"
	"\tparameter IWIDTH=16,CWIDTH=IWIDTH+%d,OWIDTH=IWIDTH+1;\n"
	"\t// Drop an additional bit on the output?\n"
	"\tparameter\t\tSHIFT=0;\n"
	"\t// The number of clocks per clock enable, 1, 2, or 3.\n"
	"\tparameter\t[1:0]\tCKPCE=%d;\n\t//\n", resetw.c_str(), xtracbits,
		ckpce);
 
	fprintf(fp,
	"\tinput\twire\ti_clk, %s, i_ce;\n"
	"\tinput\twire\t[(2*CWIDTH-1):0]\ti_coef;\n"
	"\tinput\twire\t[(2*IWIDTH-1):0]\ti_left, i_right;\n"
	"\tinput\twire\ti_aux;\n"
	"\toutput\twire\t[(2*OWIDTH-1):0]\to_left, o_right;\n"
	"\toutput\treg\to_aux;\n\n"
"\n", resetw.c_str());
 
	fprintf(fp,
	"\treg\t[(2*IWIDTH-1):0]	r_left, r_right;\n"
	"\treg\t			r_aux, r_aux_2;\n"
	"\treg\t[(2*CWIDTH-1):0]	r_coef;\n"
	"\twire	signed	[(IWIDTH-1):0]	r_left_r, r_left_i, r_right_r, r_right_i;\n"
	"\tassign\tr_left_r  = r_left[ (2*IWIDTH-1):(IWIDTH)];\n"
	"\tassign\tr_left_i  = r_left[ (IWIDTH-1):0];\n"
	"\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n"
	"\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n"
	"\treg	signed	[(CWIDTH-1):0]	ir_coef_r, ir_coef_i;\n"
"\n"
	"\treg	signed	[(IWIDTH):0]	r_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"
"\n"
	"\treg	[(2*IWIDTH+2):0]	leftv, leftvv;\n"
"\n"
	"\t// Set up the input to the multiply\n"
	"\tinitial r_aux   = 1\'b0;\n"
	"\tinitial r_aux_2 = 1\'b0;\n");
	if (async_reset)
		fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\t\tif (!i_areset_n)\n");
	else
		fprintf(fp, "\talways @(posedge i_clk)\n\t\tif (i_reset)\n");
	fprintf(fp,
		"\t\tbegin\n"
			"\t\t\tr_aux <= 1\'b0;\n"
			"\t\t\tr_aux_2 <= 1\'b0;\n"
		"\t\tend else if (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\t// One clock just latches the inputs\n"
			"\t\t\tr_aux <= i_aux;\n"
			"\t\t\t// Next clock adds/subtracts\n"
			"\t\t\t// Other inputs are simply delayed on second clock\n"
			"\t\t\tr_aux_2 <= r_aux;\n"
		"\t\tend\n"
	"\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\t// One clock just latches the inputs\n"
			"\t\t\tr_left <= i_left;	// No change in # of bits\n"
			"\t\t\tr_right <= i_right;\n"
			"\t\t\tr_coef  <= i_coef;\n"
			"\t\t\t// Next clock adds/subtracts\n"
			"\t\t\tr_sum_r <= r_left_r + r_right_r; // Now IWIDTH+1 bits\n"
			"\t\t\tr_sum_i <= r_left_i + r_right_i;\n"
			"\t\t\tr_dif_r <= r_left_r - r_right_r;\n"
			"\t\t\tr_dif_i <= r_left_i - r_right_i;\n"
			"\t\t\t// Other inputs are simply delayed on second clock\n"
			"\t\t\tir_coef_r <= r_coef[(2*CWIDTH-1):CWIDTH];\n"
			"\t\t\tir_coef_i <= r_coef[(CWIDTH-1):0];\n"
		"\t\tend\n"
	"\n\n");
	fprintf(fp,
"\t// See comments in the butterfly.v source file for a discussion of\n"
"\t// these operations and the appropriate bit widths.\n\n");
	fprintf(fp,
	"\twire\tsigned	[((IWIDTH+1)+(CWIDTH)-1):0]	p_one, p_two;\n"
	"\twire\tsigned	[((IWIDTH+2)+(CWIDTH+1)-1):0]	p_three;\n"
"\n"
	"\tinitial leftv    = 0;\n"
	"\tinitial leftvv   = 0;\n");
	if (async_reset)
		fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\t\tif (!i_areset_n)\n");
	else
		fprintf(fp, "\talways @(posedge i_clk)\n\t\tif (i_reset)\n");
	fprintf(fp,
		"\t\tbegin\n"
			"\t\t\tleftv <= 0;\n"
			"\t\t\tleftvv <= 0;\n"
		"\t\tend else if (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\t// Second clock, pipeline = 1\n"
			"\t\t\tleftv <= { r_aux_2, r_sum_r, r_sum_i };\n"
"\n"
			"\t\t\t// Third clock, pipeline = 3\n"
			"\t\t\t//   As desired, each of these lines infers a DSP48\n"
			"\t\t\tleftvv <= leftv;\n"
		"\t\tend\n"
"\n");
 
	// Nominally, we should handle code for 1, 2, or 3 clocks per CE, with
	// one clock per CE meaning CE could be constant.  The code below
	// instead handles 1 or 3 clocks per CE, leaving the two clocks per
	// CE optimization(s) unfulfilled.
 
//	fprintf(fp,
//"\tend else if (CKPCI == 2'b01)\n\tbegin\n");
 
	///////////////////////////////////////////
	///
	///	One clock per CE, so CE, CE, CE, CE, CE is possible
	///
	fprintf(fp,
"\tgenerate if (CKPCE <= 1)\n\tbegin : CKPCE_ONE\n");
 
	fprintf(fp,
	"\t\t// Coefficient multiply inputs\n"
	"\t\treg\tsigned	[(CWIDTH-1):0]	p1c_in, p2c_in;\n"
	"\t\t// Data multiply inputs\n"
	"\t\treg\tsigned	[(IWIDTH):0]	p1d_in, p2d_in;\n"
	"\t\t// Product 3, coefficient input\n"
	"\t\treg\tsigned	[(CWIDTH):0]	p3c_in;\n"
	"\t\t// Product 3, data input\n"
	"\t\treg\tsigned	[(IWIDTH+1):0]	p3d_in;\n"
"\n");
	fprintf(fp,
	"\t\treg\tsigned	[((IWIDTH+1)+(CWIDTH)-1):0]	rp_one, rp_two;\n"
	"\t\treg\tsigned	[((IWIDTH+2)+(CWIDTH+1)-1):0]	rp_three;\n"
"\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (i_ce)\n"
	"\t\tbegin\n"
		"\t\t\t// Second clock, pipeline = 1\n"
		"\t\t\tp1c_in <= ir_coef_r;\n"
		"\t\t\tp2c_in <= ir_coef_i;\n"
		"\t\t\tp1d_in <= r_dif_r;\n"
		"\t\t\tp2d_in <= r_dif_i;\n"
		"\t\t\tp3c_in <= ir_coef_i + ir_coef_r;\n"
		"\t\t\tp3d_in <= r_dif_r + r_dif_i;\n"
	"\t\tend\n\n");
 
	if (formal_property_flag)
		fprintf(fp,
"`ifndef	FORMAL\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (i_ce)\n"
	"\t\tbegin\n"
		"\t\t\t// Third clock, pipeline = 3\n"
		"\t\t\t//   As desired, each of these lines infers a DSP48\n"
		"\t\t\trp_one   <= p1c_in * p1d_in;\n"
		"\t\t\trp_two   <= p2c_in * p2d_in;\n"
		"\t\t\trp_three <= p3c_in * p3d_in;\n"
	"\t\tend\n");
 
	if (formal_property_flag)
		fprintf(fp,
"`else\n"
		"\t\twire	signed	[((IWIDTH+1)+(CWIDTH)-1):0]	pre_rp_one, pre_rp_two;\n"
		"\t\twire	signed	[((IWIDTH+2)+(CWIDTH+1)-1):0]	pre_rp_three;\n"
"\n"
		"\t\tabs_mpy #(CWIDTH,IWIDTH+1,1'b1)\n"
		"\t\t	onei(p1c_in, p1d_in, pre_rp_one);\n"
		"\t\tabs_mpy #(CWIDTH,IWIDTH+1,1'b1)\n"
		"\t\t	twoi(p2c_in, p2d_in, pre_rp_two);\n"
		"\t\tabs_mpy #(CWIDTH+1,IWIDTH+2,1'b1)\n"
		"\t\t	threei(p3c_in, p3d_in, pre_rp_three);\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\tbegin\n"
		"\t\t	rp_one   = pre_rp_one;\n"
		"\t\t	rp_two   = pre_rp_two;\n"
		"\t\t	rp_three = pre_rp_three;\n"
		"\t\tend\n"
"`endif // FORMAL\n");
 
	fprintf(fp,"\n"
	"\t\tassign\tp_one   = rp_one;\n"
	"\t\tassign\tp_two   = rp_two;\n"
	"\t\tassign\tp_three = rp_three;\n"
"\n");
 
	///////////////////////////////////////////
	///
	///	Two clocks per CE, so CE, no-ce, CE, no-ce, etc
	///
	fprintf(fp,
	"\tend else if (CKPCE <= 2)\n"
	"\tbegin : CKPCE_TWO\n"
		"\t\t// Coefficient multiply inputs\n"
		"\t\treg		[2*(CWIDTH)-1:0]	mpy_pipe_c;\n"
		"\t\t// Data multiply inputs\n"
		"\t\treg		[2*(IWIDTH+1)-1:0]	mpy_pipe_d;\n"
		"\t\twire	signed	[(CWIDTH-1):0]	mpy_pipe_vc;\n"
		"\t\twire	signed	[(IWIDTH):0]	mpy_pipe_vd;\n"
		"\t\t//\n"
		"\t\treg	signed	[(CWIDTH+1)-1:0]	mpy_cof_sum;\n"
		"\t\treg	signed	[(IWIDTH+2)-1:0]	mpy_dif_sum;\n"
"\n"
		"\t\tassign	mpy_pipe_vc =  mpy_pipe_c[2*(CWIDTH)-1:CWIDTH];\n"
		"\t\tassign	mpy_pipe_vd =  mpy_pipe_d[2*(IWIDTH+1)-1:IWIDTH+1];\n"
"\n"
		"\t\treg			mpy_pipe_v;\n"
		"\t\treg			ce_phase;\n"
"\n"
		"\t\treg	signed	[(CWIDTH+IWIDTH+1)-1:0]	mpy_pipe_out;\n"
		"\t\treg	signed [IWIDTH+CWIDTH+3-1:0]	longmpy;\n"
"\n"
"\n"
		"\t\tinitial	ce_phase = 1'b1;\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_reset)\n"
			"\t\t\tce_phase <= 1'b1;\n"
		"\t\telse if (i_ce)\n"
			"\t\t\tce_phase <= 1'b0;\n"
		"\t\telse\n"
			"\t\t\tce_phase <= 1'b1;\n"
"\n"
		"\t\talways @(*)\n"
			"\t\t\tmpy_pipe_v = (i_ce)||(!ce_phase);\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (!ce_phase)\n"
		"\t\tbegin\n"
			"\t\t\t// Pre-clock\n"
			"\t\t\tmpy_pipe_c[2*CWIDTH-1:0] <=\n"
				"\t\t\t\t\t{ ir_coef_r, ir_coef_i };\n"
			"\t\t\tmpy_pipe_d[2*(IWIDTH+1)-1:0] <=\n"
				"\t\t\t\t\t{ r_dif_r, r_dif_i };\n"
"\n"
			"\t\t\tmpy_cof_sum  <= ir_coef_i + ir_coef_r;\n"
			"\t\t\tmpy_dif_sum <= r_dif_r + r_dif_i;\n"
"\n"
		"\t\tend else if (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\t// First clock\n"
			"\t\t\tmpy_pipe_c[2*(CWIDTH)-1:0] <= {\n"
				"\t\t\t\tmpy_pipe_c[(CWIDTH)-1:0], {(CWIDTH){1'b0}} };\n"
			"\t\t\tmpy_pipe_d[2*(IWIDTH+1)-1:0] <= {\n"
				"\t\t\t\tmpy_pipe_d[(IWIDTH+1)-1:0], {(IWIDTH+1){1'b0}} };\n"
		"\t\tend\n\n");
 
	if (formal_property_flag)
		fprintf(fp, "`ifndef	FORMAL\n");
 
	fprintf(fp,
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce) // First clock\n"
			"\t\t\tlongmpy <= mpy_cof_sum * mpy_dif_sum;\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (mpy_pipe_v)\n"
			"\t\t\tmpy_pipe_out <= mpy_pipe_vc * mpy_pipe_vd;\n");
 
	if (formal_property_flag)
		fprintf(fp, "`else\n"
		"\t\twire	signed [IWIDTH+CWIDTH+3-1:0]	pre_longmpy;\n"
		"\t\twire	signed	[(CWIDTH+IWIDTH+1)-1:0]	pre_mpy_pipe_out;\n"
"\n"
		"\t\tabs_mpy	#(CWIDTH+1,IWIDTH+2,1)\n"
		"\t\t	longmpyi(mpy_cof_sum, mpy_dif_sum, pre_longmpy);\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\t	longmpy <= pre_longmpy;\n"
"\n"
"\n"
		"\t\tabs_mpy #(CWIDTH,IWIDTH+1,1)\n"
		"\t\t	mpy_pipe_outi(mpy_pipe_vc, mpy_pipe_vd, pre_mpy_pipe_out);\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (mpy_pipe_v)\n"
		"\t\t	mpy_pipe_out <= pre_mpy_pipe_out;\n"
"`endif\n");
 
	fprintf(fp,"\n"
		"\t\treg\tsigned\t[((IWIDTH+1)+(CWIDTH)-1):0]	rp_one,\n"
				"\t\t\t\t\t\t\trp2_one, rp_two;\n"
		"\t\treg\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]	rp_three;\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (!ce_phase) // 1.5 clock\n"
			"\t\t\trp_one <= mpy_pipe_out;\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce) // two clocks\n"
			"\t\t\trp_two <= mpy_pipe_out;\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce) // Second clock\n"
			"\t\t\trp_three<= longmpy;\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
			"\t\t\trp2_one<= rp_one;\n"
"\n"
		"\t\tassign	p_one  = rp2_one;\n"
		"\t\tassign	p_two  = rp_two;\n"
		"\t\tassign	p_three= rp_three;\n"
"\n");
 
	/////////////////////////
	///
	///	Three clock per CE, so CE, no-ce, no-ce*, CE
	///
	fprintf(fp,
"\tend else if (CKPCE <= 2'b11)\n\tbegin : CKPCE_THREE\n");
 
	fprintf(fp,
	"\t\t// Coefficient multiply inputs\n"
	"\t\treg\t\t[3*(CWIDTH+1)-1:0]\tmpy_pipe_c;\n"
	"\t\t// Data multiply inputs\n"
	"\t\treg\t\t[3*(IWIDTH+2)-1:0]\tmpy_pipe_d;\n"
	"\t\twire\tsigned	[(CWIDTH):0]	mpy_pipe_vc;\n"
	"\t\twire\tsigned	[(IWIDTH+1):0]	mpy_pipe_vd;\n"
	"\n"
	"\t\tassign\tmpy_pipe_vc =  mpy_pipe_c[3*(CWIDTH+1)-1:2*(CWIDTH+1)];\n"
	"\t\tassign\tmpy_pipe_vd =  mpy_pipe_d[3*(IWIDTH+2)-1:2*(IWIDTH+2)];\n"
	"\n"
	"\t\treg\t\t\tmpy_pipe_v;\n"
	"\t\treg\t\t[2:0]\tce_phase;\n"
	"\n"
	"\t\treg\tsigned	[  (CWIDTH+IWIDTH+3)-1:0]	mpy_pipe_out;\n"
"\n");
	fprintf(fp,
	"\t\tinitial\tce_phase = 3'b011;\n"
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (i_reset)\n"
		"\t\t\tce_phase <= 3'b011;\n"
	"\t\telse if (i_ce)\n"
		"\t\t\tce_phase <= 3'b000;\n"
	"\t\telse if (ce_phase != 3'b011)\n"
		"\t\t\tce_phase <= ce_phase + 1'b1;\n"
"\n"
	"\t\talways @(*)\n"
		"\t\t\tmpy_pipe_v = (i_ce)||(ce_phase < 3'b010);\n"
"\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
		"\t\t\tif (ce_phase == 3\'b000)\n"
		"\t\t\tbegin\n"
			"\t\t\t\t// Second clock\n"
			"\t\t\t\tmpy_pipe_c[3*(CWIDTH+1)-1:(CWIDTH+1)] <= {\n"
			"\t\t\t\t\tir_coef_r[CWIDTH-1], ir_coef_r,\n"
			"\t\t\t\t\tir_coef_i[CWIDTH-1], ir_coef_i };\n"
			"\t\t\t\tmpy_pipe_c[CWIDTH:0] <= ir_coef_i + ir_coef_r;\n"
			"\t\t\t\tmpy_pipe_d[3*(IWIDTH+2)-1:(IWIDTH+2)] <= {\n"
			"\t\t\t\t\tr_dif_r[IWIDTH], r_dif_r,\n"
			"\t\t\t\t\tr_dif_i[IWIDTH], r_dif_i };\n"
			"\t\t\t\tmpy_pipe_d[(IWIDTH+2)-1:0] <= r_dif_r + r_dif_i;\n"
"\n"
		"\t\t\tend else if (mpy_pipe_v)\n"
		"\t\t\tbegin\n"
			"\t\t\t\tmpy_pipe_c[3*(CWIDTH+1)-1:0] <= {\n"
			"\t\t\t\t\tmpy_pipe_c[2*(CWIDTH+1)-1:0], {(CWIDTH+1){1\'b0}} };\n"
			"\t\t\t\tmpy_pipe_d[3*(IWIDTH+2)-1:0] <= {\n"
			"\t\t\t\t\tmpy_pipe_d[2*(IWIDTH+2)-1:0], {(IWIDTH+2){1\'b0}} };\n"
		"\t\t\tend\n\n");
 
	if (formal_property_flag)
		fprintf(fp, "`ifndef\tFORMAL\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\t\tif (mpy_pipe_v)\n"
			"\t\t\t\tmpy_pipe_out <= mpy_pipe_vc * mpy_pipe_vd;\n"
"\n");
 
	if (formal_property_flag)
		fprintf(fp,
"`else\t// FORMAL\n"
		"\t\twire	signed	[  (CWIDTH+IWIDTH+3)-1:0] pre_mpy_pipe_out;\n"
"\n"
		"\t\tabs_mpy #(CWIDTH+1,IWIDTH+2,1)\n"
		"\t\t	mpy_pipe_outi(mpy_pipe_vc, mpy_pipe_vd, pre_mpy_pipe_out);\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\t	if (mpy_pipe_v)\n"
		"\t\t		mpy_pipe_out <= pre_mpy_pipe_out;\n"
"`endif\t// FORMAL\n\n");
 
 
	fprintf(fp,
	"\t\treg\tsigned\t[((IWIDTH+1)+(CWIDTH)-1):0]\trp_one, rp_two,\n"
					"\t\t\t\t\t\trp2_one, rp2_two;\n"
	"\t\treg\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\trp_three, rp2_three;\n"
 
"\n");
 
	fprintf(fp,
	"\t\talways @(posedge i_clk)\n"
	"\t\tif(i_ce)\n"
		"\t\t\trp_one <= mpy_pipe_out[(CWIDTH+IWIDTH):0];\n"
	"\t\talways @(posedge i_clk)\n"
	"\t\tif(ce_phase == 3'b000)\n"
		"\t\t\trp_two <= mpy_pipe_out[(CWIDTH+IWIDTH):0];\n"
	"\t\talways @(posedge i_clk)\n"
	"\t\tif(ce_phase == 3'b001)\n"
		"\t\t\trp_three <= mpy_pipe_out;\n"
	"\t\talways @(posedge i_clk)\n"
	"\t\tif (i_ce)\n"
	"\t\tbegin\n"
		"\t\t\trp2_one<= rp_one;\n"
		"\t\t\trp2_two<= rp_two;\n"
		"\t\t\trp2_three<= rp_three;\n"
	"\t\tend\n");
	fprintf(fp,
	"\t\tassign	p_one\t= rp2_one;\n"
	"\t\tassign	p_two\t= rp2_two;\n"
	"\t\tassign\tp_three\t= rp2_three;\n"
"\n");
 
	fprintf(fp,
"\tend endgenerate\n");
 
	fprintf(fp,
	"\twire\tsigned	[((IWIDTH+2)+(CWIDTH+1)-1):0]	w_one, w_two;\n"
	"\tassign\tw_one = { {(2){p_one[((IWIDTH+1)+(CWIDTH)-1)]}}, p_one };\n"
	"\tassign\tw_two = { {(2){p_two[((IWIDTH+1)+(CWIDTH)-1)]}}, p_two };\n"
"\n");
 
	fprintf(fp,
	"\t// These values are held in memory and delayed during the\n"
	"\t// multiply.  Here, we recover them.  During the multiply,\n"
	"\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n"
	"\t// therefore, the left_x values need to be right shifted by\n"
	"\t// CWIDTH-2 as well.  The additional bits come from a sign\n"
	"\t// extension.\n"
	"\twire\taux_s;\n"
	"\twire\tsigned\t[(IWIDTH+CWIDTH):0]	left_si, left_sr;\n"
	"\treg\t\t[(2*IWIDTH+2):0]	left_saved;\n"
	"\tassign\tleft_sr = { {2{left_saved[2*(IWIDTH+1)-1]}}, left_saved[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1\'b0}} };\n"
	"\tassign\tleft_si = { {2{left_saved[(IWIDTH+1)-1]}}, left_saved[((IWIDTH+1)-1):0], {(CWIDTH-2){1\'b0}} };\n"
	"\tassign\taux_s = left_saved[2*IWIDTH+2];\n"
"\n"
	"\t(* use_dsp48=\"no\" *)\n"
	"\treg	signed	[(CWIDTH+IWIDTH+3-1):0]	mpy_r, mpy_i;\n"
"\n");
 
	fprintf(fp,
	"\tinitial left_saved = 0;\n"
	"\tinitial o_aux      = 1\'b0;\n");
	if (async_reset)
		fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\t\tif (!i_areset_n)\n");
	else
		fprintf(fp, "\talways @(posedge i_clk)\n\t\tif (i_reset)\n");
	fprintf(fp,
	"\t\tbegin\n"
		"\t\t\tleft_saved <= 0;\n"
		"\t\t\to_aux <= 1\'b0;\n"
	"\t\tend else if (i_ce)\n"
	"\t\tbegin\n"
		"\t\t\t// First clock, recover all values\n"
		"\t\t\tleft_saved <= leftvv;\n"
"\n"
		"\t\t\t// Second clock, round and latch for final clock\n"
		"\t\t\to_aux <= aux_s;\n"
	"\t\tend\n"
	"\talways @(posedge i_clk)\n"
	"\t\tif (i_ce)\n"
	"\t\tbegin\n"
		"\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"
		"\t\t\t// although they only need to be (IWIDTH+1)\n"
		"\t\t\t// + (CWIDTH) bits wide.  (We've got two\n"
		"\t\t\t// extra bits we need to get rid of.)\n"
		"\n"
		"\t\t\t// These two lines also infer DSP48\'s.\n"
		"\t\t\t// To keep from using extra DSP48 resources,\n"
		"\t\t\t// they are prevented from using DSP48\'s\n"
		"\t\t\t// by the (* use_dsp48 ... *) comment above.\n"
		"\t\t\tmpy_r <= w_one - w_two;\n"
		"\t\t\tmpy_i <= p_three - w_one - w_two;\n"
	"\t\tend\n"
	"\n");
 
	fprintf(fp,
	"\t// Round the results\n"
	"\twire\tsigned\t[(OWIDTH-1):0]\trnd_left_r, rnd_left_i, rnd_right_r, rnd_right_i;\n\n");
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+1,OWIDTH,SHIFT+2) do_rnd_left_r(i_clk, i_ce,\n"
	"\t\t\t\tleft_sr, rnd_left_r);\n\n",
		rnd_string);
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+1,OWIDTH,SHIFT+2) do_rnd_left_i(i_clk, i_ce,\n"
	"\t\t\t\tleft_si, rnd_left_i);\n\n",
		rnd_string);
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_r(i_clk, i_ce,\n"
	"\t\t\t\tmpy_r, rnd_right_r);\n\n", rnd_string);
	fprintf(fp,
	"\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_i(i_clk, i_ce,\n"
	"\t\t\t\tmpy_i, rnd_right_i);\n\n", rnd_string);
 
 
	fprintf(fp,
	"\t// As a final step, we pack our outputs into two packed two's\n"
	"\t// complement numbers per output word, so that each output word\n"
	"\t// has (2*OWIDTH) bits in it, with the top half being the real\n"
	"\t// portion and the bottom half being the imaginary portion.\n"
	"\tassign\to_left = { rnd_left_r, rnd_left_i };\n"
	"\tassign\to_right= { rnd_right_r,rnd_right_i};\n"
"\n");
 
	if (formal_property_flag) {
		fprintf(fp,
"`ifdef	FORMAL\n"
	"\tlocalparam	F_LGDEPTH = 3;\n"
	"\tlocalparam	F_DEPTH = 5;\n"
	"\tlocalparam	[F_LGDEPTH-1:0]	F_D = F_DEPTH-1;\n"
"\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyleft_r  [0:F_DEPTH-1];\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyleft_i  [0:F_DEPTH-1];\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyright_r [0:F_DEPTH-1];\n"
	"\treg	signed	[IWIDTH-1:0]	f_dlyright_i [0:F_DEPTH-1];\n"
	"\treg	signed	[CWIDTH-1:0]	f_dlycoeff_r [0:F_DEPTH-1];\n"
	"\treg	signed	[CWIDTH-1:0]	f_dlycoeff_i [0:F_DEPTH-1];\n"
	"\treg	signed	[F_DEPTH-1:0]	f_dlyaux;\n"
"\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_reset)\n"
		"\t\tf_dlyaux <= 0;\n"
	"\telse if (i_ce)\n"
		"\t\tf_dlyaux <= { f_dlyaux[F_DEPTH-2:0], i_aux };\n"
"\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_ce)\n"
	"\tbegin\n"
		"\t\tf_dlyleft_r[0]   <= i_left[ (2*IWIDTH-1):IWIDTH];\n"
		"\t\tf_dlyleft_i[0]   <= i_left[ (  IWIDTH-1):0];\n"
		"\t\tf_dlyright_r[0]  <= i_right[(2*IWIDTH-1):IWIDTH];\n"
		"\t\tf_dlyright_i[0]  <= i_right[(  IWIDTH-1):0];\n"
		"\t\tf_dlycoeff_r[0]  <= i_coef[ (2*CWIDTH-1):CWIDTH];\n"
		"\t\tf_dlycoeff_i[0]  <= i_coef[ (  CWIDTH-1):0];\n"
	"\tend\n"
"\n"
	"\tgenvar	k;\n"
	"\tgenerate for(k=1; k<F_DEPTH; k=k+1)\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\tif (i_ce)\n"
		"\t\tbegin\n"
			"\t\t\tf_dlyleft_r[k]  <= f_dlyleft_r[ k-1];\n"
			"\t\t\tf_dlyleft_i[k]  <= f_dlyleft_i[ k-1];\n"
			"\t\t\tf_dlyright_r[k] <= f_dlyright_r[k-1];\n"
			"\t\t\tf_dlyright_i[k] <= f_dlyright_i[k-1];\n"
			"\t\t\tf_dlycoeff_r[k] <= f_dlycoeff_r[k-1];\n"
			"\t\t\tf_dlycoeff_i[k] <= f_dlycoeff_i[k-1];\n"
		"\t\tend\n"
"\n"
	"\tendgenerate\n"
"\n"
"`ifdef	VERILATOR"
/*
	"\tgenerate if (CKPCE <= 1)\n"
	"\tbegin\n"
"\n"
	"\t\t// i_ce is allowed to be anything in this mode\n"
"\n"
	"\tend else if (CKPCE == 2)\n"
	"\tbegin : F_CKPCE_TWO\n"
"\n"
		"\t\tassert property (@(posedge i_clk)\n"
		"\t\t	i_ce |=> !i_ce);\n"
	"\n"
	"\tend else if (CKPCE == 3)\n"
	"\tbegin : F_CKPCE_THREE\n"
"\n"
		"\t\tassert property (@(posedge i_clk)\n"
		"\t\t	i_ce |=> !i_ce ##1 !i_ce);\n"
"\n"
	"\tend endgenerate\n"
*/
"\n"
"`else\n"
	"\talways @(posedge i_clk)\n"
	"\tif ((!$past(i_ce))&&(!$past(i_ce,2))&&(!$past(i_ce,3))\n"
			"\t\t\t&&(!$past(i_ce,4)))\n"
		"\t\tassume(i_ce);\n"
"\n"
	"\tgenerate if (CKPCE <= 1)\n"
	"\tbegin\n"
"\n"
	"\t\t// i_ce is allowed to be anything in this mode\n"
"\n"
	"\tend else if (CKPCE == 2)\n"
	"\tbegin : F_CKPCE_TWO\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\t	if ($past(i_ce))\n"
		"\t\t		assume(!i_ce);\n"
	"\n"
	"\tend else if (CKPCE == 3)\n"
	"\tbegin : F_CKPCE_THREE\n"
"\n"
		"\t\talways @(posedge i_clk)\n"
		"\t\t	if (($past(i_ce))||($past(i_ce,2)))\n"
		"\t\t		assume(!i_ce);\n"
"\n"
	"\tend endgenerate\n"
"`endif"
"\n"
	"\treg	[F_LGDEPTH-1:0]	f_startup_counter;\n"
	"\tinitial	f_startup_counter = 0;\n"
	"\talways @(posedge i_clk)\n"
	"\tif (i_reset)\n"
		"\t\tf_startup_counter <= 0;\n"
	"\telse if ((i_ce)&&(!(&f_startup_counter)))\n"
		"\t\tf_startup_counter <= f_startup_counter + 1;\n"
"\n"
	"\twire	signed	[IWIDTH:0]	f_sumr, f_sumi;\n"
	"\talways @(*)\n"
	"\tbegin\n"
		"\t\tf_sumr = f_dlyleft_r[F_D] + f_dlyright_r[F_D];\n"
		"\t\tf_sumi = f_dlyleft_i[F_D] + f_dlyright_i[F_D];\n"
	"\tend\n"
"\n"
	"\twire	signed	[IWIDTH+CWIDTH:0]	f_sumrx, f_sumix;\n"
	"\tassign	f_sumrx = { {(2){f_sumr[IWIDTH]}}, f_sumr, {(CWIDTH-2){1'b0}} };\n"
	"\tassign	f_sumix = { {(2){f_sumi[IWIDTH]}}, f_sumi, {(CWIDTH-2){1'b0}} };\n"
	"\n"
	"\twire	signed	[IWIDTH:0]	f_difr, f_difi;\n"
	"\talways @(*)\n"
	"\tbegin\n"
		"\t\tf_difr = f_dlyleft_r[F_D] - f_dlyright_r[F_D];\n"
		"\t\tf_difi = f_dlyleft_i[F_D] - f_dlyright_i[F_D];\n"
	"\tend\n"
"\n"
	"\twire	signed	[IWIDTH+CWIDTH+3-1:0]	f_difrx, f_difix;\n"
	"\tassign	f_difrx = { {(CWIDTH+2){f_difr[IWIDTH]}}, f_difr };\n"
	"\tassign	f_difix = { {(CWIDTH+2){f_difi[IWIDTH]}}, f_difi };\n"
"\n"
	"\twire	signed	[IWIDTH+CWIDTH+3-1:0]	f_widecoeff_r, f_widecoeff_i;\n"
	"\tassign	f_widecoeff_r = {{(IWIDTH+3){f_dlycoeff_r[F_D][CWIDTH-1]}},\n"
	"\t		f_dlycoeff_r[F_D] };\n"
	"\tassign	f_widecoeff_i = {{(IWIDTH+3){f_dlycoeff_i[F_D][CWIDTH-1]}},\n"
	"\t		f_dlycoeff_i[F_D] };\n"
"\n"
	"\talways @(posedge i_clk)\n"
	"\tif (f_startup_counter > F_D)\n"
	"\tbegin\n"
		"\t\tassert(left_sr == f_sumrx);\n"
		"\t\tassert(left_si == f_sumix);\n"
		"\t\tassert(aux_s == f_dlyaux[F_D]);\n"
"\n"
		"\t\tif ((f_difr == 0)&&(f_difi == 0))\n"
		"\t\tbegin\n"
		"\t\t	assert(mpy_r == 0);\n"
		"\t\t	assert(mpy_i == 0);\n"
		"\t\tend else if ((f_dlycoeff_r[F_D] == 0)\n"
		"\t\t		&&(f_dlycoeff_i[F_D] == 0))\n"
		"\t\tbegin\n"
		"\t		assert(mpy_r == 0);\n"
		"\t\t	assert(mpy_i == 0);\n"
		"\t\tend\n"
"\n"
		"\t\tif ((f_dlycoeff_r[F_D] == 1)&&(f_dlycoeff_i[F_D] == 0))\n"
		"\t\tbegin\n"
		"\t\t	assert(mpy_r == f_difrx);\n"
		"\t\t	assert(mpy_i == f_difix);\n"
		"\t\tend\n"
"\n"
		"\t\tif ((f_dlycoeff_r[F_D] == 0)&&(f_dlycoeff_i[F_D] == 1))\n"
		"\t\tbegin\n"
		"\t\t	assert(mpy_r == -f_difix);\n"
		"\t\t	assert(mpy_i ==  f_difrx);\n"
		"\t\tend\n"
"\n"
		"\t\tif ((f_difr == 1)&&(f_difi == 0))\n"
		"\t\tbegin\n"
		"\t\t	assert(mpy_r == f_widecoeff_r);\n"
		"\t\t	assert(mpy_i == f_widecoeff_i);\n"
		"\t\tend\n"
"\n"
		"\t\tif ((f_difr == 0)&&(f_difi == 1))\n"
		"\t\tbegin\n"
		"\t\t	assert(mpy_r == -f_widecoeff_i);\n"
		"\t\t	assert(mpy_i ==  f_widecoeff_r);\n"
		"\t\tend\n"
	"\tend\n"
"\n");
 
		fprintf(fp,
	"\t// Let's see if we can improve our performance at all by\n"
	"\t// moving our test one clock earlier.  If nothing else, it should\n"
	"\t// help induction finish one (or more) clocks ealier than\n"
	"\t// otherwise\n"
"\n\n"
	"\twire	signed	[IWIDTH:0]	f_predifr, f_predifi;\n"
	"\talways @(*)\n"
	"\tbegin\n"
		"\t\tf_predifr = f_dlyleft_r[F_D-1] - f_dlyright_r[F_D-1];\n"
		"\t\tf_predifi = f_dlyleft_i[F_D-1] - f_dlyright_i[F_D-1];\n"
	"\tend\n"
"\n"
	"\twire	signed	[IWIDTH+CWIDTH+1-1:0]	f_predifrx, f_predifix;\n"
	"\tassign	f_predifrx = { {(CWIDTH){f_predifr[IWIDTH]}}, f_predifr };\n"
	"\tassign	f_predifix = { {(CWIDTH){f_predifi[IWIDTH]}}, f_predifi };\n"
"\n"
	"\twire	signed	[CWIDTH:0]	f_sumcoef;\n"
	"\twire	signed	[IWIDTH+1:0]	f_sumdiff;\n"
	"\talways @(*)\n"
	"\tbegin\n"
		"\t\tf_sumcoef = f_dlycoeff_r[F_D-1] + f_dlycoeff_i[F_D-1];\n"
		"\t\tf_sumdiff = f_predifr + f_predifi;\n"
	"\tend\n"
"\n"
	"\t// Induction helpers\n"
	"\talways @(posedge i_clk)\n"
	"\tif (f_startup_counter >= F_D)\n"
	"\tbegin\n"
		"\t\tif (f_dlycoeff_r[F_D-1] == 0)\n"
			"\t\t\tassert(p_one == 0);\n"
		"\t\tif (f_dlycoeff_i[F_D-1] == 0)\n"
			"\t\t\tassert(p_two == 0);\n"
"\n"
		"\t\tif (f_dlycoeff_r[F_D-1] == 1)\n"
			"\t\t\tassert(p_one == f_predifrx);\n"
		"\t\tif (f_dlycoeff_i[F_D-1] == 1)\n"
			"\t\t\tassert(p_two == f_predifix);\n"
"\n"
		"\t\tif (f_predifr == 0)\n"
			"\t\t\tassert(p_one == 0);\n"
		"\t\tif (f_predifi == 0)\n"
			"\t\t\tassert(p_two == 0);\n"
"\n"
		"\t\t// verilator lint_off WIDTH\n"
		"\t\tif (f_predifr == 1)\n"
			"\t\t\tassert(p_one == f_dlycoeff_r[F_D-1]);\n"
		"\t\tif (f_predifi == 1)\n"
			"\t\t\tassert(p_two == f_dlycoeff_i[F_D-1]);\n"
		"\t\t// verilator lint_on  WIDTH\n"
"\n"
		"\t\tif (f_sumcoef == 0)\n"
			"\t\t\tassert(p_three == 0);\n"
		"\t\tif (f_sumdiff == 0)\n"
			"\t\t\tassert(p_three == 0);\n"
		"\t\t// verilator lint_off WIDTH\n"
		"\t\tif (f_sumcoef == 1)\n"
			"\t\t\tassert(p_three == f_sumdiff);\n"
		"\t\tif (f_sumdiff == 1)\n"
			"\t\t\tassert(p_three == f_sumcoef);\n"
		"\t\t// verilator lint_on  WIDTH\n"
"`ifdef	VERILATOR\n"
		"\t\tassert(p_one   == f_predifr * f_dlycoeff_r[F_D-1]);\n"
		"\t\tassert(p_two   == f_predifi * f_dlycoeff_i[F_D-1]);\n"
		"\t\tassert(p_three == f_sumdiff * f_sumcoef);\n"
"`endif	// VERILATOR\n"
	"\tend\n\n"
"`endif // FORMAL\n");
	}
 
	fprintf(fp,
"endmodule\n");
 
	fclose(fp);
}
Go to most recent revision | Compare with Previous | Blame | View Log
Browse

Tools

Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [sw/] [butterfly.cpp] - Rev 40