 from tensorflow.keras.layers import Dense, Lambda, dot, Activation, concatenate
+from tensorflow.keras.layers import Layer


-def attention_3d_block(hidden_states):
-    """
-    Many-to-one attention mechanism for Keras.
-    @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
-    @return: 2D tensor with shape (batch_size, 128)
-    @author: felixhao28.
-    """
-    hidden_size = int(hidden_states.shape[2])
-    # Inside dense layer
-    # hidden_states dot W => score_first_part
-    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
-    # W is the trainable weight matrix of attention Luong's multiplicative style score
-    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
-    # score_first_part dot last_hidden_state => attention_weights
-    # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
-    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
-    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
-    attention_weights = Activation('softmax', name='attention_weight')(score)
-    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
-    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
-    pre_activation = concatenate([context_vector, h_t], name='attention_output')
-    attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
-    return attention_vector
+class Attention(Layer):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, hidden_states):
+        """
+        Many-to-one attention mechanism for Keras.
+        @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
+        @return: 2D tensor with shape (batch_size, 128)
+        @author: felixhao28.
+        """
+        hidden_size = int(hidden_states.shape[2])
+        # Inside dense layer
+        # hidden_states dot W => score_first_part
+        # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
+        # W is the trainable weight matrix of attention (Luong's multiplicative style score)
+        score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
+        # score_first_part dot last_hidden_state => attention_weights
+        # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
+        h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
+        score = dot([score_first_part, h_t], [2, 1], name='attention_score')
+        attention_weights = Activation('softmax', name='attention_weight')(score)
+        # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
+        context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
+        pre_activation = concatenate([context_vector, h_t], name='attention_output')
+        attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
+        return attention_vector
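
Not part of the diff, but for context: a minimal sketch of how the new Attention block might be wired into a functional-API model. The input shape (10 time steps, 20 features), the LSTM width, and the binary-classification head are illustrative assumptions, not taken from the repository.

from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

inputs = Input(shape=(10, 20))                             # (batch_size, time_steps, input_dim) -- assumed shape
hidden_states = LSTM(32, return_sequences=True)(inputs)    # all hidden states: (batch_size, 10, 32)
attention_vector = Attention()(hidden_states)              # many-to-one attention output: (batch_size, 128)
outputs = Dense(1, activation='sigmoid')(attention_vector) # assumed binary-classification head
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy')

Because __call__ (rather than call) is overridden, the block assembles its Dense/Lambda/dot sublayers with the functional API at the moment it is applied, so it is intended to be used on symbolic Keras tensors exactly as above.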