14.2. Implementations
Complete Python code is available at: Attention.py
14.2.1. Bahdanau Attention
- Bahdanau et al., 2015 “Neural Machine Translation by Jointly Learning to Align and Translate”
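The layer below implements the additive score from the paper: each encoder hidden state $h_j$ (a value) is scored against the decoder state $s_{i-1}$ (the query) with a small feed-forward network, and the normalized scores mix the encoder states into a context vector. Roughly, in the paper's notation:

$$
e_{ij} = v_a^\top \tanh(W_a s_{i-1} + U_a h_j), \qquad
\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}, \qquad
c_i = \sum_j \alpha_{ij} h_j
$$

In the code, `W2` corresponds to $W_a$ (applied to the query), `W1` to $U_a$ (applied to the values), and `V` to $v_a$.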
"""
Bahdanau Attention, a.k.a. Additive attention, Multi-Layer perceptron
"""
class BahdanauAttention(tf.keras.layers.Layer):
def __init__(self, units):
super(BahdanauAttention, self).__init__()
self.W1 = tf.keras.layers.Dense(units)
self.W2 = tf.keras.layers.Dense(units)
self.V = tf.keras.layers.Dense(1)
def call(self, query, values):
query_with_time_axis = tf.expand_dims(query, 1)
score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))
attention_weights = tf.nn.softmax(score, axis=1)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
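A minimal shape check, not from Attention.py; the tensor sizes below are made up purely for illustration:

attention_layer = BahdanauAttention(units=10)
sample_query = tf.random.normal((64, 256))       # hypothetical decoder hidden state
sample_values = tf.random.normal((64, 10, 512))  # hypothetical encoder hidden states
context_vector, attention_weights = attention_layer(sample_query, sample_values)
# context_vector.shape     -> (64, 512)
# attention_weights.shape  -> (64, 10, 1)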
Info
The authors of this attention paper (Bahdanau, Cho, and Bengio) also co-authored the paper that introduced the GRU (Gated Recurrent Unit).
14.2.2. Luong Attention
- Luong et al., 2015 “Effective Approaches to Attention-based Neural Machine Translation”
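The “general” variant scores the decoder state $h_t$ against each encoder state $\bar{h}_s$ with a single bilinear form. In the paper's notation:

$$
\mathrm{score}(h_t, \bar{h}_s) = h_t^\top W_a \bar{h}_s
$$

The weights and the context vector are then formed exactly as in the additive case; in the code below, `self.W` plays the role of $W_a$.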
"""
Luong attention, a.k.a. General Attention, Bilinear
"""
class LuongAttention(tf.keras.layers.Layer):
def __init__(self, units):
super(GeneralAttention, self).__init__()
self.W = tf.keras.layers.Dense(units)
def call(self, query, values):
query_with_time_axis = tf.expand_dims(query, 1)
score = tf.transpose(tf.matmul(query_with_time_axis, self.W(values), transpose_b=True), perm=[0, 2, 1])
attention_weights = tf.nn.softmax(score, axis=1)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
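A minimal shape check for this variant as well; the sizes are again made up for illustration, with `units` set to the query size of 256:

attention_layer = LuongAttention(units=256)
sample_query = tf.random.normal((64, 256))       # hypothetical decoder hidden state
sample_values = tf.random.normal((64, 10, 512))  # hypothetical encoder hidden states
context_vector, attention_weights = attention_layer(sample_query, sample_values)
# context_vector.shape     -> (64, 512)
# attention_weights.shape  -> (64, 10, 1)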