9.3. Implementation

Complete Python code is available at: GRU_from_scratch.py.
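The code excerpted in this section assumes that NumPy is imported as np and that the activation helpers sigmoid, tanh, deriv_sigmoid, and deriv_tanh are defined. A minimal sketch of these helpers (assumed to match the definitions used in GRU_from_scratch.py; note that the derivatives take the pre-activation values, as the back_prop() code below expects) is:

import numpy as np

def sigmoid(x):
    # Logistic sigmoid.
    return 1.0 / (1.0 + np.exp(-x))

def deriv_sigmoid(x):
    # Derivative of the sigmoid, evaluated at the pre-activation x.
    s = sigmoid(x)
    return s * (1.0 - s)

def tanh(x):
    return np.tanh(x)

def deriv_tanh(x):
    # Derivative of tanh, evaluated at the pre-activation x.
    return 1.0 - np.tanh(x) ** 2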

The GRU class is shown below:

class GRU:
    def __init__(self, input_units, hidden_units, return_sequences=False):
        self.return_sequences = return_sequences

        self.input_units = input_units
        self.hidden_units = hidden_units

        """
        Initialize random weights and bias using Glorot
        and Orthogonal Weight Initializations.

        Glorot Weight Initialization: Glorot & Bengio, AISTATS 2010
        http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

        Orthogonal Weight Initialization: Saxe et al.,
        https://arxiv.org/pdf/1312.6120.pdf
        """
        self.Wz = np.random.randn(hidden_units, input_units) * np.sqrt(2.0 / (hidden_units + input_units))
        self.Wr = np.random.randn(hidden_units, input_units) * np.sqrt(2.0 / (hidden_units + input_units))
        self.W = np.random.randn(hidden_units, input_units) * np.sqrt(2.0 / (hidden_units + input_units))

        self.Uz = np.random.randn(hidden_units, hidden_units) * np.sqrt(2.0 / (2 * hidden_units))
        self.Uz, _, _ = np.linalg.svd(self.Uz) # Orthogonal Weight Initialization
        self.Ur = np.random.randn(hidden_units, hidden_units) * np.sqrt(2.0 / (2 * hidden_units))
        self.Ur, _, _ = np.linalg.svd(self.Ur) # Orthogonal Weight Initialization
        self.U = np.random.randn(hidden_units, hidden_units) * np.sqrt(2.0 / (2 * hidden_units))
        self.U, _, _ = np.linalg.svd(self.U) # Orthogonal Weight Initialization

        self.bz = np.random.randn(hidden_units, 1) * np.sqrt(2.0 / (1 + hidden_units))
        self.br = np.random.randn(hidden_units, 1) * np.sqrt(2.0 / (1 + hidden_units))
        self.b = np.random.randn(hidden_units, 1) * np.sqrt(2.0 / (1 + hidden_units))


    def get_grads(self):
        return [self.dWz, self.dWr, self.dW,
                self.dUz, self.dUr, self.dU,
                self.dbz, self.dbr, self.db]

    def get_params(self):
        return [self.Wz, self.Wr, self.W,
                self.Uz, self.Ur, self.U,
                self.bz, self.br, self.b]

    def forward_prop(self, x, n_sequence):
        self.x = x
        self.n_sequence = n_sequence

        self.z = np.zeros([self.n_sequence, self.hidden_units, 1])
        self.z_p = np.zeros([self.n_sequence, self.hidden_units, 1])
        self.r = np.zeros([self.n_sequence, self.hidden_units, 1])
        self.r_p = np.zeros([self.n_sequence, self.hidden_units, 1])
        self.h_h = np.zeros([self.n_sequence, self.hidden_units, 1])
        self.h_h_p = np.zeros([self.n_sequence, self.hidden_units, 1])
        self.h = np.zeros([self.n_sequence, self.hidden_units, 1])

        for t in range(self.n_sequence):
            if t == 0:
                self.z_p[t] = np.dot(self.Wz, self.x[t]) + self.bz
                self.z[t] = sigmoid(self.z_p[t])
                self.r_p[t] = np.dot(self.Wr, self.x[t]) + self.br
                self.r[t] = sigmoid(self.r_p[t])
                self.h_h_p[t] = np.dot(self.W, self.x[t]) + self.b
                self.h_h[t] = tanh(self.h_h_p[t])
                self.h[t] = self.z[t] * self.h_h[t]
            else:
                self.z_p[t] = (np.dot(self.Wz, self.x[t]) + np.dot(self.Uz, self.h[t - 1]) + self.bz)
                self.z[t] = sigmoid(self.z_p[t])
                self.r_p[t] = (np.dot(self.Wr, self.x[t]) + np.dot(self.Ur, self.h[t - 1]) + self.br)
                self.r[t] = sigmoid(self.r_p[t])
                self.h_h_p[t] = (np.dot(self.W, self.x[t]) + np.dot(self.U, self.r[t] * self.h[t - 1]) + self.b)
                self.h_h[t] = tanh(self.h_h_p[t])
                self.h[t] = (1 - self.z[t]) * self.h[t - 1] + self.z[t] * self.h_h[t]

        if not self.return_sequences:
            return self.h[-1]
        else:
            return self.h


    def back_prop(self, grads):

        # Reset the gradient accumulators; they are summed over all time steps below.
        self.dWz, self.dWr, self.dW = np.zeros_like(self.Wz), np.zeros_like(self.Wr), np.zeros_like(self.W)
        self.dUz, self.dUr, self.dU = np.zeros_like(self.Uz), np.zeros_like(self.Ur), np.zeros_like(self.U)
        self.dbz, self.dbr, self.db = np.zeros_like(self.bz), np.zeros_like(self.br), np.zeros_like(self.b)

        dh = np.zeros([self.n_sequence, self.hidden_units, 1])
        dx = np.zeros([self.n_sequence, self.input_units, 1])

        # Compute dh from time step T backward through 0.
        for t in reversed(range(self.n_sequence)):

            if t == self.n_sequence - 1:
                if self.return_sequences:
                    dh[t] = grads[t]
                else:
                    dh[t] = grads
            else:
                d1 = np.dot(self.Uz.T, dh[t + 1] * (self.h_h[t + 1] - self.h[t]) * deriv_sigmoid(self.z_p[t + 1]))
                d2 = np.dot(self.r[t + 1] * self.U.T, dh[t + 1] * self.z[t + 1] * deriv_tanh(self.h_h_p[t + 1]))
                d3 = np.dot(self.Ur.T, np.dot(self.h[t] * self.U.T, dh[t + 1] * self.z[t + 1] * deriv_tanh(self.h_h_p[t + 1])) * deriv_sigmoid(self.r_p[t + 1]))
                d4 = dh[t + 1] * (1 - self.z[t + 1])

                dh[t] = d1 + d2 + d3 + d4
                if self.return_sequences:
                    dh[t] += grads[t]

        # Compute the gradients of the weights and biases from time step 0 through T.
        for t in range(self.n_sequence):

            if t > 0:
                _dbz = (dh[t] * deriv_sigmoid(self.z_p[t]) * (self.h_h[t] - self.h[t - 1]))
            else:
                _dbz = dh[t] * deriv_sigmoid(self.z_p[t]) * self.h_h[t]

            self.dbz += _dbz
            self.dWz += np.dot(_dbz, self.x[t].T)
            if t > 0:
                self.dUz += np.dot(_dbz, self.h[t - 1].T)

            if t > 0:
                _dbr = np.dot(self.h[t - 1] * self.U.T, dh[t] * self.z[t] * deriv_tanh(self.h_h_p[t])) * deriv_sigmoid(self.r_p[t])
                self.dbr += _dbr
                self.dWr += np.dot(_dbr, self.x[t].T)
                self.dUr += np.dot(_dbr, self.h[t - 1].T)

            _db = dh[t] * self.z[t] * deriv_tanh(self.h_h_p[t])
            self.db += _db
            self.dW += np.dot(_db, self.x[t].T)
            if t > 0:
                self.dU += np.dot(_db, (self.r[t] * self.h[t - 1]).T)

            dx[t] = np.dot(self.Wz.T, _dbz) + np.dot(self.W.T, _db)
            if t > 0:
                dx[t] += np.dot(self.Wr.T, _dbr)

        return dx
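As a quick usage sketch (not part of the original file), the layer maps an input sequence of shape (n_sequence, input_units, 1) to a final hidden state of shape (hidden_units, 1) when return_sequences is left at its default of False:

np.random.seed(0)
gru_demo = GRU(input_units=1, hidden_units=4)
x_demo = np.random.randn(10, 1, 1)               # 10 time steps, 1 feature, column vectors
h_last = gru_demo.forward_prop(x_demo, n_sequence=10)
print(h_last.shape)                              # (4, 1)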

The back_prop() method consists of two key steps:

  1. Computing hidden-state gradients: This loop iterates backward through the time steps, from the final step $T$ down to $0$, computing the gradient of the loss with respect to the hidden state at each step, denoted $dh[t]$.

  2. Calculating parameter gradients: Using the hidden-state gradients from step 1, this loop runs forward from step $0$ to $T$ and accumulates the gradients of the loss with respect to the weights and biases (a numerical sanity check of these gradients is sketched below).
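The back-propagation code can be sanity-checked numerically. The sketch below (not part of the original file) defines a toy loss on the final hidden state, runs back_prop(), and compares the analytic gradient of one weight entry with a finite-difference estimate:

np.random.seed(1)
gru_check = GRU(input_units=1, hidden_units=4)
x_check = np.random.randn(10, 1, 1)

def toy_loss():
    h_last = gru_check.forward_prop(x_check, n_sequence=10)
    return 0.5 * np.sum(h_last ** 2)

# Analytic gradient: for this loss, dL/dh_last equals h_last itself.
h_last = gru_check.forward_prop(x_check, n_sequence=10)
gru_check.back_prop(h_last)
analytic = gru_check.dW[0, 0]

# Finite-difference estimate for the same weight entry.
eps = 1e-6
gru_check.W[0, 0] += eps
loss_plus = toy_loss()
gru_check.W[0, 0] -= 2 * eps
loss_minus = toy_loss()
gru_check.W[0, 0] += eps
numeric = (loss_plus - loss_minus) / (2 * eps)

print(analytic, numeric)   # the two values should agree closely if back_prop() is correct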

[1] Create dataset.

# ============================
# Create Dataset
# ============================
n_sequence = 25
n_data = 100

n_sample = n_data - n_sequence  # number of samples

sin_data = ds.create_wave(n_data, 0.05)
X, Y = ds.dataset(sin_data, n_sequence)
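The helpers ds.create_wave and ds.dataset belong to the accompanying code and are not reproduced here. A minimal sketch of compatible implementations (hypothetical; the second argument of create_wave is assumed to be a noise scale) is a noisy sine wave plus a sliding-window split into input sequences and next-value targets:

def create_wave(n_data, noise=0.0):
    # A sine wave over n_data points, optionally perturbed by Gaussian noise.
    t = np.arange(n_data)
    return np.sin(2.0 * np.pi * t / 25.0) + noise * np.random.randn(n_data)

def dataset(wave, n_sequence):
    # Sliding window: each sample is n_sequence consecutive points, and the
    # target is the value that immediately follows the window.
    X, Y = [], []
    for i in range(len(wave) - n_sequence):
        X.append(wave[i:i + n_sequence].reshape(n_sequence, 1, 1))
        Y.append(wave[i + n_sequence:i + n_sequence + 1].reshape(1, 1))
    return np.array(X), np.array(Y)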

[2] Create model.

We construct a GRU layer followed by a dense layer. The dense layer employs a linear activation function.

# ============================
# Create Model
# ============================

input_units = 1
hidden_units = 32
output_units = 1

gru = GRU(input_units, hidden_units)
dense = Layers.Dense(hidden_units, output_units, linear, deriv_linear)
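Layers.Dense, linear, and deriv_linear also come from the accompanying code. The sketch below is an assumption about their interface, inferred from how they are called in this section: a dense layer exposing forward_prop, back_prop, get_params, and get_grads, plus a linear activation and its derivative.

class Dense:
    def __init__(self, input_units, output_units, activation, deriv_activation):
        self.activation = activation
        self.deriv_activation = deriv_activation
        # Glorot-style initialization, mirroring the GRU layer above.
        self.W = np.random.randn(output_units, input_units) * np.sqrt(2.0 / (input_units + output_units))
        self.b = np.zeros([output_units, 1])

    def forward_prop(self, x):
        self.x = x
        self.a = np.dot(self.W, x) + self.b
        return self.activation(self.a)

    def back_prop(self, dL):
        delta = dL * self.deriv_activation(self.a)
        self.dW = np.dot(delta, self.x.T)
        self.db = delta
        return np.dot(self.W.T, delta)   # gradient passed back to the GRU layer

    def get_params(self):
        return [self.W, self.b]

    def get_grads(self):
        return [self.dW, self.db]

def linear(x):
    return x

def deriv_linear(x):
    return np.ones_like(x)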

[3] Training.

The training function for this model follows the same pattern as the training functions introduced so far.

# ============================
# Training
# ============================

def train(gru, dense, X, Y, optimizer):
    # Forward Propagation
    last_h = gru.forward_prop(X, n_sequence)
    y = dense.forward_prop(last_h)

    # Back Propagation Through Time
    loss = np.sum((y - Y) ** 2 / 2)
    dL = y - Y

    grads = dense.back_prop(dL)
    _ = gru.back_prop(grads)

    update_weights([dense, gru], optimizer=optimizer)

    return loss


n_epochs = 200
lr = 0.0001

beta1 = 0.99
beta2 = 0.9999
optimizer = Optimizer.Adam(lr=lr, beta1=beta1, beta2=beta2)

for epoch in range(1, n_epochs + 1):

    loss = 0.0
    for j in range(n_sample):
        loss += train(gru, dense, X[j], Y[j], optimizer)
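update_weights and Optimizer.Adam are likewise provided by the accompanying code. A minimal sketch of a compatible update_weights (an assumption for illustration, supposing the optimizer exposes an update(params, grads) method) gathers every parameter/gradient pair from each layer and hands them to the optimizer:

def update_weights(layers, optimizer):
    # Collect the parameters and their gradients from every layer and let the
    # optimizer apply its update rule in place; the optimizer interface used
    # here is an assumption for this sketch.
    params, grads = [], []
    for layer in layers:
        params += layer.get_params()
        grads += layer.get_grads()
    optimizer.update(params, grads)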

[4] Prediction.

Run the following command to generate and display the predicted sine wave:

$ python GRU_from_scratch.py
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 gru (GRU)                   (None,  25, 32)              3264

 dense (Dense)               (None,  25,  1)                33
=================================================================
Total params: 3297
epoch: 10/200	 Loss = 0.413963
epoch: 20/200	 Loss = 0.246206
epoch: 30/200	 Loss = 0.212136
epoch: 40/200	 Loss = 0.193578
epoch: 50/200	 Loss = 0.182354
epoch: 60/200	 Loss = 0.174332
epoch: 70/200	 Loss = 0.167752
epoch: 80/200	 Loss = 0.161927
epoch: 90/200	 Loss = 0.156588
epoch: 100/200	 Loss = 0.151625
epoch: 110/200	 Loss = 0.146991
epoch: 120/200	 Loss = 0.142664
epoch: 130/200	 Loss = 0.138625
epoch: 140/200	 Loss = 0.134861
epoch: 150/200	 Loss = 0.131355
epoch: 160/200	 Loss = 0.128090
epoch: 170/200	 Loss = 0.125049
epoch: 180/200	 Loss = 0.122215
epoch: 190/200	 Loss = 0.119574
epoch: 200/200	 Loss = 0.117109
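The prediction and plotting code lives in GRU_from_scratch.py and is not reproduced above. A minimal sketch of a one-step-ahead forecast (hypothetical, reusing the trained gru and dense layers and the dataset from step [1]) is:

predictions = []
for j in range(n_sample):
    # Feed each input window through the trained model and keep the scalar output.
    h_last = gru.forward_prop(X[j], n_sequence)
    predictions.append(dense.forward_prop(h_last).item())

import matplotlib.pyplot as plt
plt.plot(sin_data, label='ground truth')
plt.plot(range(n_sequence, n_data), predictions, label='prediction')
plt.legend()
plt.show()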