# Architecture

## Symbols & Naming Conventions
\begin{align}
n &= \text{number of nodes}\\
l &= \text{layer number}\\
w,W &= \text{weights matrix}\\
b &= \text{bias matrix}\\
z,Z &= \text{hypothesis result (result before applying activation function)}\\
g(z) &= \text{activation function}\\
a,A &= \text{activation matrix (result after applying activation function)}\\
x,X &= \text{input to network}\\
\hat{y} &= \text{output of network}\\
\end{align}
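Reading the definitions together for a single example and a single unit: the hypothesis result is the weighted input plus the bias, and the activation applies \(g\) to it:

\begin{align}
z &= wx + b\\
a &= g(z)
\end{align}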
Values for forward propagation:
\begin{align}
n^{[l]} &= \text{number of nodes in the layer}\\
z^{[l]} &= \text{hypothesis result of the layer}\\
w^{[l]} &= \text{weights of the layer}\\
b^{[l]} &= \text{biases of the layer}\\
a^{[l]} &= \text{activation result of the layer}\\
\end{align}
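As a concrete illustration, here is a minimal NumPy sketch of one layer's forward step in these symbols; `sigmoid` standing in for \(g\) is an assumption (any activation works), and the shapes follow the table in the Shapes section below:

```python
import numpy as np

def sigmoid(Z):
    # g(z): sigmoid is assumed here for illustration; any activation works
    return 1.0 / (1.0 + np.exp(-Z))

def forward_layer(A_prev, W, b):
    # A_prev = A^[l-1], shape (n^[l-1], m); W = W^[l], shape (n^[l], n^[l-1]);
    # b = b^[l], shape (n^[l], 1)
    Z = W @ A_prev + b     # Z^[l] = W^[l] A^[l-1] + b^[l], shape (n^[l], m)
    A = sigmoid(Z)         # A^[l] = g(Z^[l])
    return Z, A            # Z is kept because backward propagation needs it
```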
Derivatives for backward propagation:
\begin{align}
dw^{[l]} &= \frac{\partial L}{\partial w^{[l]}} \rightarrow \text{derivative of the loss with respect to the weights}\\
db^{[l]} &= \frac{\partial L}{\partial b^{[l]}} \rightarrow \text{derivative of the loss with respect to the biases}\\
dz^{[l]} &= \frac{\partial L}{\partial z^{[l]}} \rightarrow \text{derivative of the loss with respect to the hypothesis result}\\
da^{[l]} &= \frac{\partial L}{\partial a^{[l]}} \rightarrow \text{derivative of the loss with respect to the activation result}\\
\end{align}
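A matching backward-step sketch, assuming \(dZ^{[l]}\) has already been formed from \(dA^{[l]}\) and the activation's derivative; \(m\) is the number of training examples, a standard symbol used in the Shapes section below:

```python
import numpy as np

def backward_layer(dZ, A_prev, W):
    # dZ = dZ^[l], shape (n^[l], m); A_prev = A^[l-1]; W = W^[l]
    m = A_prev.shape[1]                          # number of training examples
    dW = (dZ @ A_prev.T) / m                     # dW^[l], same shape as W^[l]
    db = np.sum(dZ, axis=1, keepdims=True) / m   # db^[l], same shape as b^[l]
    dA_prev = W.T @ dZ                           # dA^[l-1], fed to the previous layer
    return dW, db, dA_prev
```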
## Flow
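In this notation, training flows through the network in two passes (written here with \(m\) examples stacked as columns, \(g^{[l]}\) for the layer's activation, \(g^{[l]\prime}\) for its derivative, \(\odot\) for the element-wise product, and \(L\) for the index of the last layer; these are common conventions rather than symbols defined above). Forward propagation runs from \(A^{[0]} = X\) up to \(\hat{y} = A^{[L]}\):

\begin{align}
Z^{[l]} &= W^{[l]} A^{[l-1]} + b^{[l]}\\
A^{[l]} &= g^{[l]}(Z^{[l]})
\end{align}

Backward propagation then runs in the opposite direction, starting from \(dA^{[L]}\):

\begin{align}
dZ^{[l]} &= dA^{[l]} \odot g^{[l]\prime}(Z^{[l]})\\
dW^{[l]} &= \frac{1}{m}\, dZ^{[l]} A^{[l-1]T}\\
db^{[l]} &= \frac{1}{m} \sum_{i=1}^{m} dZ^{[l](i)}\\
dA^{[l-1]} &= W^{[l]T} dZ^{[l]}
\end{align}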
## Shapes
\begin{array}{ | c | c | c | }
\hline
\text{Matrix} & \text{Shape} & \text{Gradient (same shape)} \\
\hline
W^{[l]} & ( n^{[l]} , n^{[l-1]} ) & dW^{[l]} \\
\hline
b^{[l]} & ( n^{[l]} , 1 ) & db^{[l]} \\
\hline
Z^{[l]} & ( n^{[l]} , m ) & dZ^{[l]} \\
\hline
A^{[l]} & ( n^{[l]} , m ) & dA^{[l]} \\
\hline
\end{array}
where \(l\) = layer number with \(l \geq 1\), \(m\) = number of training examples, and \(A^{[0]} = X\).
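As a quick sanity check of the table, one can build random matrices for an arbitrarily sized layer and assert the expected shapes; the sizes picked here are assumptions for illustration only:

```python
import numpy as np

n_prev, n_l, m = 4, 3, 5                 # n^[l-1], n^[l], batch size (arbitrary picks)
A_prev = np.random.randn(n_prev, m)      # A^[l-1]; for l = 1 this is A^[0] = X
W = np.random.randn(n_l, n_prev)         # W^[l] : (n^[l], n^[l-1])
b = np.zeros((n_l, 1))                   # b^[l] : (n^[l], 1)

Z = W @ A_prev + b                       # Z^[l] : (n^[l], m)
A = 1.0 / (1.0 + np.exp(-Z))             # A^[l] : (n^[l], m), sigmoid assumed for g

assert W.shape == (n_l, n_prev)
assert b.shape == (n_l, 1)
assert Z.shape == (n_l, m) and A.shape == (n_l, m)
```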