14.2. Implementations

Complete Python code is available at: Attention.py

14.2.1. Bahdanau Attention

Bahdanau attention, a.k.a. additive attention or multi-layer perceptron (MLP) attention:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, units):
        """Create the three dense projections used by the scoring function.

        Args:
            units: Output width of the `W1` and `W2` projections.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: Tensor that receives a new axis at position 1 before it
                is projected (presumably (batch, hidden) -- TODO confirm).
            values: Tensor attended over along axis 1 (presumably
                (batch, time, hidden) -- TODO confirm).

        Returns:
            A `(context_vector, attention_weights)` tuple. The weights are a
            softmax over axis 1 of the scores; the context vector is the
            weighted `values` summed over axis 1.
        """
        # Insert an axis at position 1 so the projected query can be added
        # to the projected values via broadcasting.
        expanded_query = tf.expand_dims(query, 1)

        projected_values = self.W1(values)
        projected_query = self.W2(expanded_query)
        score = self.V(tf.nn.tanh(projected_values + projected_query))

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values along axis 1.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

The authors of this attention paper are also co-authors of the GRU (Gated Recurrent Unit) paper.

14.2.2. Luong Attention

Luong attention, a.k.a. general attention or bilinear attention:
class LuongAttention(tf.keras.layers.Layer):
    """Multiplicative attention: score = query . W(values)."""

    def __init__(self, units):
        """Create the dense projection applied to `values` before scoring.

        Args:
            units: Output width of the `W` projection. The matrix product in
                `call` contracts this dimension against the last dimension
                of `query`, so the two must be equal.
        """
        # Zero-argument super(): the previous code named a different class
        # (`GeneralAttention`) here, which does not match this class.
        super().__init__()
        self.W = tf.keras.layers.Dense(units)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: Tensor that receives a new axis at position 1 before
                scoring (presumably (batch, hidden) -- TODO confirm).
            values: Tensor attended over along axis 1 (presumably
                (batch, time, hidden) -- TODO confirm).

        Returns:
            A `(context_vector, attention_weights)` tuple. The weights are a
            softmax over axis 1 of the scores; the context vector is the
            weighted `values` summed over axis 1.
        """
        query_with_time_axis = tf.expand_dims(query, 1)

        # query @ W(values)^T, then swap the last two axes so that axis 1 of
        # the score lines up with axis 1 of `values`.
        score = tf.transpose(
            tf.matmul(query_with_time_axis, self.W(values), transpose_b=True),
            perm=[0, 2, 1],
        )

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights