Architecture#

(figure: network architecture diagram)

Symbols & Naming Conventions#

\begin{align}
n &= \text{number of nodes}\\
l &= \text{layer number}\\
w,W &= \text{weights matrix}\\
b &= \text{bias matrix}\\
z,Z &= \text{hypothesis result (result before applying the activation function)}\\
g(z) &= \text{activation function}\\
a,A &= \text{activation matrix (result after applying the activation function)}\\
x,X &= \text{input to the network}\\
\hat{y} &= \text{output of the network}\\
\end{align}

values for forward propagation

\begin{align}
\huge{n^{[l]}} &= \text{number of nodes in the layer}\\
\huge{z^{[l]}} &= \text{hypothesis result of the layer}\\
\huge{w^{[l]}} &= \text{weights matrix of the layer}\\
\huge{b^{[l]}} &= \text{bias of the layer}\\
\huge{a^{[l]}} &= \text{activation result of the layer}\\
\end{align}
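A minimal NumPy sketch of how these per-layer values are computed during the forward pass. The sigmoid activation and the function names here are illustrative assumptions, not fixed by the notes above:

```python
import numpy as np

def sigmoid(z):
    """Example activation function g(z); any element-wise activation works."""
    return 1 / (1 + np.exp(-z))

def forward_layer(a_prev, w, b):
    """Compute z^[l] = w^[l] a^[l-1] + b^[l] and a^[l] = g(z^[l])."""
    z = w @ a_prev + b  # hypothesis result of the layer
    a = sigmoid(z)      # activation result of the layer
    return z, a
```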

derivatives for backward propagation

\begin{align}
\huge{dw^{[l]}} &= \frac{\partial L}{\partial w^{[l]}} \rightarrow \text{derivative of the loss with respect to the weights}\\
\huge{db^{[l]}} &= \frac{\partial L}{\partial b^{[l]}} \rightarrow \text{derivative of the loss with respect to the biases}\\
\huge{dz^{[l]}} &= \frac{\partial L}{\partial z^{[l]}} \rightarrow \text{derivative of the loss with respect to the hypothesis result}\\
\huge{da^{[l]}} &= \frac{\partial L}{\partial a^{[l]}} \rightarrow \text{derivative of the loss with respect to the activation result}\\
\end{align}
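A hedged NumPy sketch of how these derivatives chain together for one layer, reusing the `sigmoid` from the forward sketch above. Dividing by `m` assumes `L` is the mean loss over the batch; both that and the function names are illustrative choices:

```python
def backward_layer(da, z, a_prev, w):
    """Given dA^[l], recover dZ^[l], dW^[l], db^[l], and dA^[l-1]."""
    m = a_prev.shape[1]                          # number of examples in the batch
    g_prime = sigmoid(z) * (1 - sigmoid(z))      # g'(z) for the sigmoid activation
    dz = da * g_prime                            # dZ^[l] = dA^[l] * g'(Z^[l])
    dw = (dz @ a_prev.T) / m                     # dW^[l], same shape as W^[l]
    db = np.sum(dz, axis=1, keepdims=True) / m   # db^[l], same shape as b^[l]
    da_prev = w.T @ dz                           # dA^[l-1], passed back to layer l-1
    return dz, dw, db, da_prev
```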

Flow#

(figure: forward/backward flow diagram)

Shapes#

\begin{array}{ | c | c | c | }
\hline
\text{value} & \text{shape} & \text{gradient (same shape)} \\
\hline
W^{[l]} & ( n^{[l]} , n^{[l-1]} ) & dW^{[l]} \\
\hline
b^{[l]} & ( n^{[l]} , 1 ) & db^{[l]} \\
\hline
Z^{[l]} & ( n^{[l]} , m ) & dZ^{[l]} \\
\hline
A^{[l]} & ( n^{[l]} , m ) & dA^{[l]} \\
\hline
\end{array}
where
\(l\) = layer number, \(l \geq 1\)
\(m\) = number of training examples
\(A^{[0]} = X\)
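As a sanity check, here is a small NumPy sketch that initializes parameters with these shapes and asserts them through a forward pass. The layer sizes, batch size, and sigmoid activation are arbitrary illustrative choices:

```python
import numpy as np

layer_sizes = [4, 5, 3, 1]  # n^[0] = 4 input features, then n^[1..3]
m = 32                      # number of training examples

# W^[l] : (n^[l], n^[l-1]),  b^[l] : (n^[l], 1)
params = {}
for l in range(1, len(layer_sizes)):
    params[f"W{l}"] = np.random.randn(layer_sizes[l], layer_sizes[l - 1]) * 0.01
    params[f"b{l}"] = np.zeros((layer_sizes[l], 1))

A = np.random.randn(layer_sizes[0], m)  # A^[0] = X, shape (n^[0], m)
for l in range(1, len(layer_sizes)):
    Z = params[f"W{l}"] @ A + params[f"b{l}"]
    A = 1 / (1 + np.exp(-Z))            # sigmoid activation (illustrative)
    assert Z.shape == A.shape == (layer_sizes[l], m)
```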