# Forward Pass

## Example

In [1]:
import numpy as np
# Input matrix: two rows with two features each.
X = np.array([
    [1, 2],
    [3, -1],
])
# One weight per feature, stored as a column vector.
w = np.array([
    [2],
    [-1],
])
# Bias column, one entry per row of X.
b = np.array([
    [1],
    [1],
])
# Affine forward pass: weighted sum of each row's features plus the bias.
print(np.matmul(X, w) + b)

[[1]
 [8]]


# Activation Function

## Example

In [2]:
def softmax(z):
  """Row-wise softmax of a 2-D array of scores.

  Each row is exponentiated and divided by the sum of its exponentials, so
  every row of the result sums to 1. Normalisation runs along axis=1, which
  means a column vector of shape (n, 1) yields all ones.

  Parameters:
  z: array-like with at least two dimensions - raw scores, one row per observation

  Returns:
  numpy array with the same shape as z holding the softmax probabilities
  """
  z = np.asarray(z)

  # Subtracting each row's maximum does not change the result mathematically
  # (the factor cancels in the ratio) but keeps np.exp from overflowing to inf,
  # which would otherwise produce inf / inf = nan for large scores.
  shifted = z - np.max(z, axis=1, keepdims=True)
  exp_scores = np.exp(shifted)
  sum_exp_scores = np.sum(exp_scores, axis=1, keepdims=True)

  # Compute the softmax probabilities by dividing each element by the row sum
  return exp_scores / sum_exp_scores

def sigmoid(z):
  """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise."""
  denominator = 1 + np.exp(-z)
  return 1 / denominator

def tanh(z):
  """Hyperbolic tangent (e^{2z} - 1) / (e^{2z} + 1), applied element-wise.

  Delegates to np.tanh rather than evaluating the exponential formula
  directly: np.exp(2*z) overflows to inf for large positive z, which turns
  the ratio into inf / inf = nan instead of the correct limit 1.
  """
  return np.tanh(z)

def relu(z):
  """Rectified linear unit: element-wise maximum of 0 and z."""
  rectified = np.maximum(0, z)
  return rectified


In [3]:
# Same values as the forward-pass example output, as a 2x1 column vector.
z = np.array([
    [1],
    [8],
])
# softmax normalises along axis=1; every row of z holds a single entry,
# so it prints 1 for both rows. The other three act element-wise.
for activation_fn in (softmax, sigmoid, tanh, relu):
  print(activation_fn(z))

[[1.]
 [1.]]
[[0.73105858]
 [0.99966465]]
[[0.76159416]
 [0.99999977]]
[[1]
 [8]]


# Final Outputs

## Example

In [4]:
# A single input row with three features.
X = np.array([[0.29, 0.11, 0.80]])

# First layer parameters: 3 inputs -> 2 units.
w1 = np.array([
    [0.3, 0.6],
    [0.8, 0.4],
    [0.1, 0.5],
])
b1 = np.array([[1.0, 1.0]])

# Second layer parameters: 2 inputs -> 2 outputs.
w2 = np.array([
    [0.1, 0.5],
    [0.7, 0.2],
])
b2 = np.array([[1.0, 1.0]])

# First layer: affine map followed by the sigmoid activation.
z1 = sigmoid(X @ w1 + b1)
print(z1)

# Second layer: affine map followed by softmax, so the row sums to 1.
z2 = softmax(z1 @ w2 + b2)
print(z2)

[[0.77816418 0.83451912]]
[[0.52647369 0.47352631]]


In [5]:
# Round to four decimals for display. .tolist() converts the NumPy scalars to
# plain Python floats, so the list prints as [0.5265, 0.4735] regardless of how
# the installed NumPy version formats its scalar types.
print(np.round(z2[0], 4).tolist())
#The neural network would predict Yes! (with probability 52.65%)

[0.5265, 0.4735]


# Optimisation

## Example

In [6]:
# Make up (arbitrarily) 12 observations with two features.
X = np.array([
    [1, 2],
    [3, 1],
    [1, 1],
    [0, 1],
    [2, 2],
    [-2, 3],
    [1, 2],
    [-1, -0.5],
    [0.5, 1.2],
    [2, 1],
    [-2, 3],
    [-1, 1],
])

# Ground-truth parameters used to generate the (noise-free) targets.
w_true = np.full((2, 1), 1.5)
b_true = 0.1

# Targets follow the exact linear model y = X w_true + b_true.
y = X @ w_true + b_true
print(X)
print(y)

[[ 1.   2. ]
 [ 3.   1. ]
 [ 1.   1. ]
 [ 0.   1. ]
 [ 2.   2. ]
 [-2.   3. ]
 [ 1.   2. ]
 [-1.  -0.5]
 [ 0.5  1.2]
 [ 2.   1. ]
 [-2.   3. ]
 [-1.   1. ]]
[[ 4.6 ]
 [ 6.1 ]
 [ 3.1 ]
 [ 1.6 ]
 [ 6.1 ]
 [ 1.6 ]
 [ 4.6 ]
 [-2.15]
 [ 2.65]
 [ 4.6 ]
 [ 1.6 ]
 [ 0.1 ]]


In [7]:
# Use the first three observations as the mini-batch.
X_batch, y_batch = X[:3], y[:3]
# Number of rows == number of observations in the batch.
N = len(X_batch)

# Starting parameters for the illustration.
w = np.array([[1],
              [1]])
b = 0

# Gradients of the batch mean squared error with respect to w and b.
y_hat = X_batch @ w + b
dw = 2/N * X_batch.T @ (y_hat - y_batch)
db = 2/N * np.sum(y_hat - y_batch)
print(dw)
print(db)

[[-6.        ]
 [-4.26666667]]
-3.1999999999999993


In [8]:
# Learning rate (step size) used for the update.
eta = 0.1
# One gradient-descent step: move both parameters against their gradients.
# Note that w and b are overwritten, so re-running this cell applies the
# same step again on top of the already-updated values.
w, b = w - eta * dw, b - eta * db
print(w)
print(b)

[[1.6       ]
 [1.42666667]]
0.31999999999999995


In [16]:
# Loss function
def mse(y_pred, y_true):
  """Mean squared error: the average of (y_pred - y_true)**2 over all elements."""
  squared_error = (y_pred - y_true) ** 2
  return np.mean(squared_error)

def lr_gradient_descent(X, y, batch_size=32, eta=0.1, w=None, b=None, max_iter=100, tol=1e-08, seed=None):
    """
    Mini-batch gradient descent for linear regression with a mean-squared-error loss.

    Every iteration reshuffles the observations and sweeps over
    N // batch_size consecutive mini-batches; observations left over after
    the last full batch are skipped for that iteration. The routine stops
    early as soon as the loss of the current mini-batch differs from the loss
    of the previous mini-batch by less than tol.

    Parameters:
    X: array-like of shape (N, p) - one observation per row
    y: array-like with N elements - targets; shape (N,) or (N, 1)
    batch_size: int - observations per mini-batch, capped at N (default=32)
    eta: float - learning rate (default=0.1)
    w: array-like with p elements - initial weights (default=ones); not modified
    b: float - initial bias (default=zero)
    max_iter: int - maximum number of iterations (default=100)
    tol: float - tolerance for stopping criteria (default=1e-08)
    seed: int or None - if given, shuffling uses a private generator seeded
        with this value; if None, NumPy's global random state is used (default=None)

    Returns:
    w, b - optimized weights of shape (p, 1) and bias

    Raises:
    ValueError - if X has no rows, batch_size is smaller than 1, or y / w
        cannot be reshaped to (N, 1) / (p, 1)
    """
    X = np.asarray(X, dtype=float)
    N, p = X.shape

    if N == 0:
        raise ValueError("X must contain at least one observation")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Force the targets into a column: a 1-D y would broadcast against the
    # (batch, 1) predictions into a (batch, batch) matrix and corrupt the gradients.
    y = np.asarray(y, dtype=float).reshape(N, 1)

    # Work on private float copies so the in-place updates below neither modify
    # the caller's initial weights nor fail on integer-typed initial values.
    if w is None:
        w = np.ones((p, 1))
    else:
        w = np.array(w, dtype=float).reshape(p, 1)
    b = 0.0 if b is None else float(b)

    # Without a seed, keep using NumPy's global random state so that callers
    # relying on np.random.seed(...) get the same behaviour as before.
    shuffle = np.random.shuffle if seed is None else np.random.default_rng(seed).shuffle

    prev_error = np.inf
    batch_size = min(N, batch_size)
    num_batches = N // batch_size

    for iteration in range(max_iter):
        indices = np.arange(N)
        shuffle(indices)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size
            X_batch = X_shuffled[start:end]
            y_batch = y_shuffled[start:end]

            y_hat = X_batch @ w + b
            residual = y_hat - y_batch
            error = mse(y_hat, y_batch)

            # Stopping rule compares losses of successive mini-batches and
            # returns before applying an update for the current batch.
            if np.abs(error - prev_error) < tol:
                return w, b

            prev_error = error

            # Gradients of the batch MSE with respect to w and b.
            dw = 2 / batch_size * X_batch.T @ residual
            db = 2 / batch_size * np.sum(residual)

            w -= eta * dw
            b -= eta * db

    return w, b

#Default initialisation
# lr_gradient_descent shuffles with NumPy's global random state; seeding it
# here makes the printed parameters reproducible when the cell is re-run.
np.random.seed(0)
w_updated, b_updated = lr_gradient_descent(X, y, batch_size = 3, max_iter = 1000)
print(w_updated)
print(b_updated)

[[1.49988073]
 [1.49929975]]
0.10118102139430363


In [10]:
#Higher learning rate
# NOTE: in the recorded run this step size made the updates diverge and the
# parameters ended up as nan (NumPy emitted runtime warnings along the way).
# Seed the global random state used for shuffling so the run is reproducible.
np.random.seed(0)
w_updated, b_updated = lr_gradient_descent(X, y, batch_size = 3, eta = 0.3, max_iter = 1000)
print(w_updated)
print(b_updated)

[[nan]
 [nan]]
nan


  return(np.mean((y_pred-y_true)**2))
  if np.abs(error - prev_error) < tol:
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  dw = 2 / batch_size * X_batch.T @ (y_hat - y_batch)
  dw = 2 / batch_size * X_batch.T @ (y_hat - y_batch)
  w -= eta * dw


In [23]:
#'Good' initialisation and 'good' learning rate
# Start close to the generating parameters (w_true = 1.5, b_true = 0.1).
w0 = np.array([[1.4], [1.6]])
b0 = 0.15
# Seed the global random state used for shuffling so the run is reproducible.
np.random.seed(0)
w_updated, b_updated = lr_gradient_descent(X, y,
                                           batch_size = 3, eta = 0.05,
                                           w = w0, b = b0,
                                           max_iter = 1000)
print(w_updated)
print(b_updated)

[[1.49987491]
 [1.49949888]]
0.10105665852631913


In [20]:
#'Bad' initialisation
# Start far away from the generating parameters (w_true = 1.5, b_true = 0.1).
w0 = np.array([[-10.], [-10.]])
b0 = 5.0
# Seed the global random state used for shuffling so the run is reproducible.
np.random.seed(0)
w_updated, b_updated = lr_gradient_descent(X, y,
                                           batch_size = 3, eta = 0.05,
                                           w = w0, b = b0,
                                           max_iter = 1000)
print(w_updated)
print(b_updated)

[[1.49954388]
 [1.49850416]]
0.10330006532281537
