----------------------------------------------------------------------------------
-- Company: CEI
-- Engineer: David Aledo
--
-- Create Date: 12:41:19 06/10/2013
-- Design Name: Configurable ANN
-- Module Name: layerSP_top - Behavioral
-- Project Name:
-- Target Devices:
-- Tool versions:
-- Description: Neuron layer top for artificial neural networks. Serial input and
-- parallel output.
--
-- Dependencies:
--
-- Revision:
-- Revision 0.01 - File Created
-- Additional Comments:
--
----------------------------------------------------------------------------------
|
| 21 |
|
|
library IEEE;
|
| 22 |
|
|
use IEEE.STD_LOGIC_1164.ALL;
|
| 23 |
|
|
use ieee.numeric_std.all;
|
| 24 |
|
|
|
| 25 |
8 |
jstefanowi |
library work;
|
| 26 |
|
|
use work.wb_init.all; -- initialization package, comment out when not used
|
| 27 |
|
|
|
| 28 |
3 |
ojosynariz |
-- Deprecated XPS library:
|
| 29 |
|
|
--library proc_common_v3_00_a;
|
| 30 |
|
|
--use proc_common_v3_00_a.proc_common_pkg.all; -- Only for simulation ( pad_power2() )
|
| 31 |
|
|
|
| 32 |
|
|
-- Top level of one neuron layer: serial data input, parallel data output, plus
-- an external memory interface for reading/writing the weights and biases.
entity layerSP_top is

   generic
   (
      NumN    : natural := 8;     -- Number of neurons in the layer
      NumIn   : natural := 64;    -- Number of inputs of each neuron
      NbitIn  : natural := 8;     -- Bit width of the input data
      NbitW   : natural := 8;     -- Bit width of weights and biases
      NbitOut : natural := 12;    -- Bit width of the output data
      -- Address widths of the external memory interface.
      -- NOTE(review): with the defaults, log2(NumN)+log2(NumIn) = 9, not 10 -- confirm.
      lra_l   : natural := 10;    -- Layer RAM address length; should equal log2(NumN)+log2(NumIn)
      wra_l   : natural := 6;     -- Weight RAM address length; should equal log2(NumIn)
      bra_l   : natural := 3;     -- Bias RAM address length; should equal log2(NumN)
      LSbit   : natural := 4;     -- Least significant bit of the outputs
      WBinit  : boolean := false; -- When true, weights/biases are initialised from w_init/b_init
      LNum    : natural := 0      -- Layer number (needed for initialization)
   );

   port
   (
      -- Input ports
      reset   : in  std_logic;
      clk     : in  std_logic;
      run_in  : in  std_logic; -- Start and input data validation
      m_en    : in  std_logic; -- Memory enable (external interface)
      b_sel   : in  std_logic; -- Bias memory select
      m_we    : in  std_logic_vector(((NbitW+7)/8)-1 downto 0); -- Memory write enable, one bit per byte of wdata (external interface)
      inputs  : in  std_logic_vector(NbitIn-1 downto 0); -- Input data (serial)
      wdata   : in  std_logic_vector(NbitW-1 downto 0);  -- Write data of weight and bias memories
      addr    : in  std_logic_vector(lra_l-1 downto 0);  -- Address of weight and bias memories

      -- Output ports
      run_out : out std_logic; -- Output data validation, run_in for the next layer
      rdata   : out std_logic_vector(NbitW-1 downto 0);  -- Read data of weight and bias memories
      outputs : out std_logic_vector((NbitOut*NumN)-1 downto 0) -- Output data (parallel)
   );

end layerSP_top;
|
| 70 |
|
|
|
| 71 |
|
|
architecture Behavioral of layerSP_top is
|
| 72 |
|
|
|
| 73 |
|
|
-- One stored weight or bias word.
subtype weight_word is std_logic_vector(NbitW-1 downto 0);

-- Weight memory of a single neuron: one word per input.
type ramd_type is array (NumIn-1 downto 0) of weight_word; -- Optimal: 32 or 64 spaces
-- Weight memories of the whole layer: one ramd_type per neuron.
type layer_ram is array (NumN-1 downto 0) of ramd_type;
-- One word per neuron (used for the bias registers and the per-RAM read data).
type outm_type is array (NumN-1 downto 0) of weight_word;
|
| 76 |
8 |
jstefanowi |
|
| 77 |
3 |
ojosynariz |
|
| 78 |
8 |
jstefanowi |
-- Builds the initial contents of the layer weight RAMs.
--   LNum   : layer number, used as the first index into w_init.
--   return : all-zero memories when WBinit is false; otherwise the weights taken
--            from w_init, which is indexed (layer)(input)(neuron) here while the
--            returned array is indexed (neuron)(input).
function fw_init(LNum : natural) return layer_ram is
   variable weights : layer_ram := (others => (others => (others => '0')));
begin
   if not WBinit then
      return weights; -- No initialization requested: memories start cleared
   end if;

   for neuron in 0 to NumN-1 loop
      for inp in 0 to NumIn-1 loop
         weights(neuron)(inp) := w_init(LNum)(inp)(neuron);
      end loop;
   end loop;

   return weights;
end fw_init;
|
| 90 |
|
|
|
| 91 |
|
|
-- Builds the initial contents of the bias registers.
--   LNum   : layer number, used as the first index into b_init.
--   return : all-zero biases when WBinit is false; otherwise element i is
--            b_init(LNum)(i) for every neuron i of the layer.
function fb_init(LNum : natural) return outm_type is
   variable biases : outm_type := (others => (others => '0'));
begin
   if not WBinit then
      return biases; -- No initialization requested: biases start cleared
   end if;

   for neuron in biases'range loop
      biases(neuron) := b_init(LNum)(neuron);
   end loop;

   return biases;
end fb_init;
|
| 101 |
|
|
|
| 102 |
|
|
|
| 103 |
|
|
|
| 104 |
|
|
-- Storage
signal lram  : layer_ram := fw_init(LNum); -- Layer RAM. One RAM per neuron. It stores the weights
signal breg  : outm_type := fb_init(LNum); -- Bias registers. Not a RAM because all of them are read simultaneously

-- External memory interface
signal uaddr : unsigned(lra_l-1 downto 0);          -- addr port converted to unsigned
signal m_sel : std_logic_vector(NumN-1 downto 0);   -- One-hot select of the addressed weight RAM
signal outm  : outm_type;                           -- Per-RAM read data, multiplexed into rdata

-- Data path towards the neurons
signal inreg : std_logic_vector(NbitIn-1 downto 0);          -- Input data register (inputs delayed one cycle, as en1 is)
signal Wyb   : std_logic_vector((NbitW*NumN)-1 downto 0);    -- Weight bus: one weight per neuron
signal bias  : std_logic_vector((NbitW*NumN)-1 downto 0);    -- Bias bus: one bias per neuron
signal Nouts : std_logic_vector((NbitOut*NumN)-1 downto 0);  -- Outputs from neurons

-- Control signals
signal cont     : integer range 0 to NumIn-1; -- Input counter
signal en1      : std_logic; -- First stage enable (multiplication of MAC)
signal en2      : std_logic; -- Second stage enable (accumulation of MAC)
signal en3      : std_logic; -- Shift register enable
signal a0       : std_logic; -- Loads the accumulators with the multiplication result
signal aux_a0   : std_logic; -- a0 one cycle early
signal aux2_en3 : std_logic; -- en3 two cycles early (set when the last input is accepted)
signal aux_en3  : std_logic; -- en3 one cycle early
|
| 124 |
|
|
|
| 125 |
|
|
begin
|
| 126 |
|
|
|
| 127 |
|
|
-- Neuron array of the layer. This wrapper feeds it the registered input, the
-- weight and bias buses and the enable/load strobes produced by the control process.
layerSP_inst: entity work.layerSP
   generic map
   (
      NumN    => NumN,
      NumIn   => NumIn,
      NbitIn  => NbitIn,
      NbitW   => NbitW,
      NbitOut => NbitOut,
      LSbit   => LSbit
   )
   port map
   (
      -- Clock and reset
      clk     => clk,
      reset   => reset,
      -- Control strobes
      en      => en1,
      en2     => en2,
      en_r    => en3,
      a0      => a0,
      -- Data inputs
      inputs  => inreg,
      Wyb     => Wyb,
      bias    => bias,
      -- Data outputs
      outputs => Nouts
   );

-- Unsigned view of the external address, used for all memory indexing below
uaddr <= unsigned(addr);
|
| 155 |
|
|
|
| 156 |
|
|
-- Decodes the top part of the memory address into a one-hot weight-RAM select.
-- No RAM is selected while the bias memory is selected (b_sel = '1') or when the
-- decoded index does not correspond to an existing neuron.
ram_selector:
process (uaddr(lra_l-1 downto wra_l), b_sel) -- Top part of memory address and b_sel
   variable ram_idx : natural;
begin
   ram_idx := to_integer(uaddr(lra_l-1 downto wra_l));

   m_sel <= (others => '0'); -- Default: nothing selected
   if (b_sel = '0') and (ram_idx < NumN) then
      m_sel(ram_idx) <= '1'; -- Enables the selected RAM
   end if;
end process;
|
| 167 |
|
|
|
| 168 |
|
|
rams: -- One weight memory per neuron of the layer
for i in (NumN-1) downto 0 generate

   -- Write port (external interface): read-modify-write with byte write enables.
   -- m_we(k) covers bits 8*k+7 downto 8*k of the word. The lanes are handled bit by
   -- bit so that the top lane is simply shorter when NbitW is not a multiple of 8
   -- (a fixed 8-bit slice would run past NbitW-1 in that case).
   process (clk)
      variable waddr : natural;
      variable d     : std_logic_vector(NbitW-1 downto 0);
   begin
      if (clk'event and clk = '1') then
         if (m_en = '1' and m_sel(i) = '1') then
            -- Bottom part of layer memory address selects the weight inside the selected RAM
            waddr := to_integer(uaddr(wra_l-1 downto 0));
            -- Ignore writes beyond the last input (possible when NumIn is not a
            -- power of two), mirroring the guard used on the read side below.
            if (waddr < NumIn) then
               for b in NbitW-1 downto 0 loop
                  if (m_we(b/8) = '1') then
                     d(b) := wdata(b);           -- Lane enabled: take the new data
                  else
                     d(b) := lram(i)(waddr)(b);  -- Lane disabled: keep the stored data
                  end if;
               end loop;
               lram(i)(waddr) <= d;
            end if;
         end if;
      end if;
   end process;

   -- Weights of all neurons are read in parallel, resulting in a bus of weights.
   -- An asynchronous read (Wyb(...) <= lram(i)(cont);) would force distributed RAM,
   -- so the read is synchronous. While reset is high the register is not cleared,
   -- it just keeps its value.
   process (clk) -- Synchronous read
   begin
      if clk'event and clk = '1' then
         if reset /= '1' then
            Wyb((NbitW*(i+1))-1 downto NbitW*i) <= lram(i)(cont);
         end if;
      end if;
   end process;

   -- Read port for the external interface. Every RAM is read at the low address
   -- bits regardless of which RAM is selected, so when NumIn is not a power of two
   -- the low address can exceed NumIn-1. Such reads return zeros instead of
   -- indexing outside the RAM (which fails the bounds check in simulation).
   outm(i) <= lram(i)(to_integer(uaddr(wra_l-1 downto 0)))
                 when (uaddr(wra_l-1 downto 0) <= NumIn-1) else
              (others => '0');

end generate;
|
| 209 |
|
|
|
| 210 |
|
|
-- Synchronous read of the external memory interface, including breg.
-- rdata is updated only while m_en = '1'. The address fields can encode indices
-- that do not correspond to any neuron (NumN not a power of two, or address
-- fields wider than log2(NumN)); such reads return zeros instead of indexing
-- outside breg/outm, which would fail the bounds check in simulation.
process (clk)
   variable bias_idx : natural;
   variable ram_idx  : natural;
begin
   if (clk'event and clk = '1') then
      if (m_en = '1') then
         if (b_sel = '1') then
            -- Bias registers selected: bottom (reduced) part of the address
            bias_idx := to_integer(uaddr(bra_l-1 downto 0));
            if (bias_idx < NumN) then
               rdata <= breg(bias_idx);
            else
               rdata <= (others => '0');
            end if;
         else
            -- Weight RAM selected: top part of the address multiplexes the RAM outputs
            ram_idx := to_integer(uaddr(lra_l-1 downto wra_l));
            if (ram_idx < NumN) then
               rdata <= outm(ram_idx);
            else
               rdata <= (others => '0');
            end if;
         end if;
      end if;
   end if;
end process;
|
| 224 |
|
|
|
| 225 |
|
|
-- Write port of the bias registers (external interface, b_sel = '1'):
-- read-modify-write with byte write enables. m_we(k) covers bits 8*k+7 downto 8*k
-- of the word; lanes are handled bit by bit so that the top lane is simply
-- shorter when NbitW is not a multiple of 8 (a fixed 8-bit slice would run past
-- NbitW-1 in that case).
bias_reg:
process (clk)
   variable bias_idx : natural;
   variable d        : std_logic_vector(NbitW-1 downto 0);
begin
   if (clk'event and clk = '1') then
      if ( (m_en = '1') and (b_sel = '1') ) then
         -- The bottom part (reduced) of layer RAM address selects the bias
         bias_idx := to_integer(uaddr(bra_l-1 downto 0));
         -- Ignore writes to indices without a neuron (possible when NumN is not
         -- a power of two) instead of indexing outside breg.
         if (bias_idx < NumN) then
            for b in NbitW-1 downto 0 loop
               if (m_we(b/8) = '1') then
                  d(b) := wdata(b);           -- Lane enabled: take the new data
               else
                  d(b) := breg(bias_idx)(b);  -- Lane disabled: keep the stored data
               end if;
            end loop;
            breg(bias_idx) <= d;
         end if;
      end if;
   end if;
end process;
|
| 243 |
|
|
-- Synchronous read of all biases in parallel: neuron n's bias register is copied
-- into slice n of the bias bus on every clock edge on which reset is not '1'.
-- While reset is '1' the bus is not cleared, it just keeps its value.
-- (Asynchronous alternative: bias((NbitW*(n+1))-1 downto NbitW*n) <= breg(n);)
bias_read:
process (clk)
begin
   if clk'event and clk = '1' then
      if reset /= '1' then
         for n in (NumN-1) downto 0 loop
            bias((NbitW*(n+1))-1 downto NbitW*n) <= breg(n);
         end loop;
      end if;
   end if;
end process;

-- Neuron outputs are passed straight to the output port
outputs <= Nouts;
|
| 259 |
|
|
|
| 260 |
|
|
-- Control of the layer: counts the accepted inputs and generates the pipeline
-- strobes, all synchronous to clk with a synchronous reset.
--   en1     = run_in delayed 1 cycle (inreg delays the data by the same amount)
--   en2     = run_in delayed 2 cycles
--   a0      = (run_in with cont = 0) delayed 2 cycles
--   en3     = (run_in with cont = NumIn-1) delayed 3 cycles
--   run_out = en3 delayed 1 more cycle
control:
process (clk)
begin
   if (clk'event and clk = '1') then
      if (reset = '1') then
         -- Everything idle
         inreg    <= (others => '0');
         cont     <= 0;
         en1      <= '0';
         en2      <= '0';
         aux_a0   <= '0';
         a0       <= '0';
         aux2_en3 <= '0';
         aux_en3  <= '0';
         en3      <= '0';
         run_out  <= '0';
      else
         -- Input stage: en1 is delayed 1 cycle in order to insert a register for Wyb
         inreg <= inputs;
         en1   <= run_in;
         en2   <= en1;

         -- Defaults for the single-cycle markers, overridden below
         aux_a0   <= '0';
         aux2_en3 <= '0';

         if (run_in = '1') then
            if (cont = 0) then
               aux_a0 <= '1'; -- At the count beginning
            end if;

            if (cont = NumIn-1) then
               cont     <= 0; -- Restarts input counter
               aux2_en3 <= '1';
            else
               cont <= cont + 1;
            end if;
         -- Disabled alternative: also restart the counter (cont <= 0; aux2_en3 <= '1';)
         -- when cont = NumIn-1 while run_in is low. Reported to solve a problem with
         -- cont resetting for layers with more than 1 neuron.
         end if;

         -- Delay lines
         a0      <= aux_a0;
         aux_en3 <= aux2_en3;
         en3     <= aux_en3;
         run_out <= en3; -- It lasts for 1 cycle, just after the output enable of the layer (when all outputs have just updated)
      end if;
   end if;
end process;
|
| 304 |
|
|
|
| 305 |
|
|
end Behavioral;
|