Open In Colab

!pip install transformers
# Training configuration shared by the rest of the notebook.
MAX_LEN = 128  # length limit handed to the tokenizer when encoding texts
BATCH_SIZE = 16  # per TPU core
TOTAL_STEPS = 2000  # that's approx 4 epochs
EVALUATE_EVERY = 200  # print and reset the running train loss every N steps
LR = 1e-5

PRETRAINED_MODEL = 'bert-base-uncased'

import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import transformers
from transformers import TFAutoModelWithLMHead, AutoTokenizer
import logging

def connect_to_TPU():
    """Detect hardware and return the appropriate distribution strategy.

    Returns:
        A ``(tpu, strategy, global_batch_size)`` tuple. ``tpu`` is the cluster
        resolver, or ``None`` when no TPU could be resolved; ``strategy`` is
        the distribution strategy to build models and datasets under; and
        ``global_batch_size`` is ``BATCH_SIZE`` multiplied by the number of
        replicas in sync.
    """
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # The resolver raises ValueError when no TPU can be found.
        tpu = None

    if tpu:
        # The TPU system has to be connected and initialised before a
        # strategy can be created on top of it.
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()

    global_batch_size = BATCH_SIZE * strategy.num_replicas_in_sync

    return tpu, strategy, global_batch_size

# Resolve the hardware once; later cells read `strategy` and
# `global_batch_size` as module-level names.
tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)
--2020-09-02 10:33:57--
Resolving (,,, ...
Connecting to (||:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23677025 (23M) [text/plain]
Saving to: ‘imdb_tr.csv’

imdb_tr.csv         100%[===================>]  22.58M  49.2MB/s    in 0.5s

2020-09-02 10:33:58 (49.2 MB/s) - ‘imdb_tr.csv’ saved [23677025/23677025]
# Load imdb_tr.csv into a DataFrame; its `text` column is encoded further down.
data = pd.read_csv('imdb_tr.csv', encoding = "ISO-8859-1")
row_Number text polarity
0 2148 first think another Disney movie, might good, ... 1
1 23577 Put aside Dr. House repeat missed, Desperate H... 0
2 1319 big fan Stephen King's work, film made even gr... 1
3 13358 watched horrid thing TV. Needless say one movi... 0
4 9495 truly enjoyed film. acting terrific plot. Jeff... 1
#data = data.sample(1000)

def regular_encode(texts, tokenizer, maxlen=512):
    """Encode a batch of texts into a rectangular array of token ids.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing a Hugging Face style ``batch_encode_plus``.
        maxlen: Every sequence is truncated or padded to exactly this length.

    Returns:
        ``np.ndarray`` built from the tokenizer's ``input_ids``; one row per
        input text.
    """
    enc_di = tokenizer.batch_encode_plus(
        # A plain list is passed so tokenizers that reject other sequence
        # types (e.g. NumPy arrays from ``Series.values``) still accept it.
        list(texts),
        return_attention_mask=False,
        return_token_type_ids=False,
        # Fixed-length rows are required to build a 2-D array below.
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )

    return np.array(enc_di['input_ids'])

# Load the tokenizer for PRETRAINED_MODEL, then encode the `text` column,
# passing MAX_LEN as the length limit.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
X_data = regular_encode(data.text.values, tokenizer, maxlen=MAX_LEN)
CPU times: user 1min 4s, sys: 233 ms, total: 1min 4s
Wall time: 1min 5s
def prepare_mlm_input_and_labels(X, mlm_tokenizer=None):
    """Build masked-language-model inputs and labels from token ids.

    15% of the non-special positions are selected as prediction targets. Of
    those, 80% are replaced by the mask token, 10% by a random vocabulary id
    and 10% are left unchanged.

    Args:
        X: Integer array of token ids. It is not modified.
        mlm_tokenizer: Object providing ``all_special_ids``, ``mask_token_id``
            and ``vocab_size``. Defaults to the module-level ``tokenizer``.

    Returns:
        ``(X_mlm, labels)``. ``X_mlm`` is the corrupted copy of ``X``;
        ``labels`` holds the original id at every selected position and -1
        (ignore) everywhere else.
    """
    tok = tokenizer if mlm_tokenizer is None else mlm_tokenizer
    X = np.asarray(X)

    # 15% BERT masking
    inp_mask = np.random.rand(*X.shape) < 0.15
    # Never select special tokens. Their ids are taken from the tokenizer
    # rather than assumed to be 0..2, which is not true for every vocabulary.
    inp_mask[np.isin(X, tok.all_special_ids)] = False
    # set targets to -1 by default, it means ignore
    labels = np.full(X.shape, -1, dtype=int)
    # set labels for masked tokens
    labels[inp_mask] = X[inp_mask]

    # prepare input
    X_mlm = np.copy(X)
    # 90% of the selected positions become the mask token,
    # which leaves 10% unchanged
    inp_mask_2mask = inp_mask & (np.random.rand(*X.shape) < 0.90)
    X_mlm[inp_mask_2mask] = tok.mask_token_id

    # 1/9 of those (10% of all selected positions) get a random token instead.
    # Sample over the whole vocabulary; the mask id is not guaranteed to be
    # the last id, so it must not be used as the upper bound.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*X.shape) < 1 / 9)
    X_mlm[inp_mask_2random] = np.random.randint(
        0, tok.vocab_size, inp_mask_2random.sum())

    return X_mlm, labels

# stack the encoded texts into one 2-D array of token ids for MLM training
X_train_mlm = np.vstack(X_data)
# masks and labels
X_train_mlm, y_train_mlm = prepare_mlm_input_and_labels(X_train_mlm)
def create_dist_dataset(X, y=None, training=False):
    """Wrap arrays in a batched ``tf.data`` pipeline distributed over replicas.

    Args:
        X: Feature array; sliced along its first axis.
        y: Optional label array aligned with ``X``. When given, the dataset
            yields ``(features, labels)`` pairs.
        training: When true, the dataset is shuffled over all of ``X`` and
            repeated indefinitely.

    Returns:
        The dataset, batched by ``global_batch_size`` and distributed with
        the module-level ``strategy``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(X)

    ### Add y if present ###
    if y is not None:
        dataset_y = tf.data.Dataset.from_tensor_slices(y)
        dataset = tf.data.Dataset.zip((dataset, dataset_y))

    ### Repeat if training ###
    if training:
        dataset = dataset.shuffle(len(X)).repeat()

    # Let tf.data choose the prefetch buffer size.
    dataset = dataset.batch(global_batch_size).prefetch(
        tf.data.experimental.AUTOTUNE)

    ### make it distributed  ###
    dist_dataset = strategy.experimental_distribute_dataset(dataset)

    return dist_dataset

# Training dataset of masked inputs and labels; built with training=True so
# it is shuffled and repeated.
train_dist_dataset = create_dist_dataset(X_train_mlm, y_train_mlm, True)


def create_mlm_model_and_optimizer():
    """Build the masked-LM model and its Adam optimizer under the strategy scope.

    Returns:
        A ``(model, optimizer)`` pair: the ``PRETRAINED_MODEL`` checkpoint
        loaded with a language-modelling head, and Adam configured with a
        learning rate of ``LR``.
    """
    # Both objects are created inside the scope of the module-level strategy.
    with strategy.scope():
        model_and_optimizer = (
            TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL),
            tf.keras.optimizers.Adam(learning_rate=LR),
        )
    return model_and_optimizer

# Build the model and optimizer once; mlm_train_step reads both as globals.
mlm_model, optimizer = create_mlm_model_and_optimizer()
HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForMaskedLM: ['nsp___cls']
- This IS expected if you are initializing TFBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.

Model: "tf_bert_for_masked_lm"
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
mlm___cls (TFBertMLMHead)    multiple                  24459834  
Total params: 110,104,890
Trainable params: 110,104,890
Non-trainable params: 0
CPU times: user 14.5 s, sys: 15 s, total: 29.5 s
Wall time: 58.1 s
def define_mlm_loss_and_metrics():
    """Create the MLM loss function and the running train-loss metric.

    Returns:
        ``(compute_mlm_loss, train_mlm_loss_metric)``. The first is a
        callable ``(labels, predictions) -> loss`` that averages the masked
        cross-entropy with ``global_batch_size``; the second is a
        ``tf.keras.metrics.Mean`` instance.
    """
    with strategy.scope():
        # Bind the per-token loss once, when this function runs.
        token_loss_fn = masked_sparse_categorical_crossentropy

        def compute_mlm_loss(labels, predictions):
            """Return the masked loss averaged with the global batch size."""
            return tf.nn.compute_average_loss(
                token_loss_fn(labels, predictions),
                global_batch_size=global_batch_size,
            )

        loss_metric = tf.keras.metrics.Mean()

    return compute_mlm_loss, loss_metric

def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy over the non-ignored positions only.

    Args:
        y_true: Integer labels; positions equal to -1 are ignored.
        y_pred: Unnormalised scores (logits) with one trailing class axis.

    Returns:
        A 1-D tensor with one loss value per position whose label is not -1.
    """
    # Compute the keep-mask once and apply it to both tensors.
    keep = tf.not_equal(y_true, -1)
    y_true_masked = tf.boolean_mask(y_true, keep)
    y_pred_masked = tf.boolean_mask(y_pred, keep)
    # The LM head returns raw scores, so the loss must apply the softmax.
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true_masked, y_pred_masked, from_logits=True)
    return loss

def train_mlm(train_dist_dataset, total_steps=2000, evaluate_every=200):
    """Run the MLM training loop for a fixed number of steps.

    Args:
        train_dist_dataset: Distributed dataset yielding training batches.
        total_steps: Number of training steps after which the loop stops.
        evaluate_every: Print the running train loss, then reset it, every
            this many steps.
    """
    step = 0
    ### Training loop ###
    for tensor in train_dist_dataset:
        distributed_mlm_train_step(tensor)
        step += 1

        if step % evaluate_every == 0:
            ### Print train metrics ###
            train_metric = train_mlm_loss_metric.result().numpy()
            print("Step %d, train loss: %.2f" % (step, train_metric))

            ### Reset  metrics ###
            # Each report then covers only the steps since the previous one.
            train_mlm_loss_metric.reset_states()

        # `>=` rather than `==` so a non-positive total_steps cannot loop
        # forever on a repeating dataset.
        if step >= total_steps:
            break

@tf.function
def distributed_mlm_train_step(data):
    """Run one ``mlm_train_step`` on every replica of the strategy.

    Args:
        data: One element of the distributed training dataset.
    """
    # Wrapped in tf.function because TPU strategies do not support running
    # the replica function in pure eager mode.
    # `experimental_run_v2` was renamed to `run`; prefer the new name and
    # fall back for TensorFlow versions that only have the old one.
    run = getattr(strategy, "run", None) or strategy.experimental_run_v2
    run(mlm_train_step, args=(data,))

def mlm_train_step(inputs):
    """Do one forward/backward pass and optimizer update on a batch.

    Args:
        inputs: ``(features, labels)`` pair for one replica.
    """
    features, labels = inputs

    with tf.GradientTape() as tape:
        # The first element of the model output holds the prediction scores.
        predictions = mlm_model(features, training=True)[0]
        loss = compute_mlm_loss(labels, predictions)

    gradients = tape.gradient(loss, mlm_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, mlm_model.trainable_variables))

    # Feed the running mean that train_mlm prints; without this update the
    # reported train loss never reflects training.
    train_mlm_loss_metric.update_state(loss)


# Create the loss function and metric read by the train step, then train.
compute_mlm_loss, train_mlm_loss_metric = define_mlm_loss_and_metrics()
train_mlm(train_dist_dataset, TOTAL_STEPS, EVALUATE_EVERY)
# Persist the fine-tuned weights; the "Load and Test" section loads the model
# back from this directory.
mlm_model.save_pretrained('imdb_bert_uncased')
Step 200, train loss: 8.89
Step 400, train loss: 8.03
Step 600, train loss: 7.68
Step 800, train loss: 7.43
Step 1000, train loss: 7.22
Step 1200, train loss: 7.00
Step 1400, train loss: 6.86
Step 1600, train loss: 6.68
Step 1800, train loss: 6.54
Step 2000, train loss: 6.38
CPU times: user 1min 23s, sys: 13.4 s, total: 1min 37s
Wall time: 9min 3s

Load and Test

from transformers import *
from pprint import pprint
# Baseline: fill-mask predictions from the unmodified PRETRAINED_MODEL checkpoint.
pretrained_model = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
nlp = pipeline("fill-mask",model=pretrained_model, tokenizer=tokenizer ,framework='tf')
pprint(nlp(f"I watched {nlp.tokenizer.mask_token} and that was awesome"))
[{'score': 0.31239137053489685,
  'sequence': '[CLS] i watched him and that was awesome [SEP]',
  'token': 2032,
  'token_str': 'him'},
 {'score': 0.1729636937379837,
  'sequence': '[CLS] i watched her and that was awesome [SEP]',
  'token': 2014,
  'token_str': 'her'},
 {'score': 0.13816313445568085,
  'sequence': '[CLS] i watched it and that was awesome [SEP]',
  'token': 2009,
  'token_str': 'it'},
 {'score': 0.08374697715044022,
  'sequence': '[CLS] i watched, and that was awesome [SEP]',
  'token': 1010,
  'token_str': ','},
 {'score': 0.06438492983579636,
  'sequence': '[CLS] i watched them and that was awesome [SEP]',
  'token': 2068,
  'token_str': 'them'}]
# Same prompt with the model loaded from the local 'imdb_bert_uncased'
# directory (presumably the weights fine-tuned above; confirm where they are saved).
movie_mlm_model = TFAutoModelWithLMHead.from_pretrained('imdb_bert_uncased')
nlp = pipeline("fill-mask",model=movie_mlm_model, tokenizer=tokenizer ,framework='tf')
pprint(nlp(f"I watched {nlp.tokenizer.mask_token} and that was awesome"))
[{'score': 0.4467789828777313,
  'sequence': '[CLS] i watched it and that was awesome [SEP]',
  'token': 2009,
  'token_str': 'it'},
 {'score': 0.06318594515323639,
  'sequence': '[CLS] i watched movie and that was awesome [SEP]',
  'token': 3185,
  'token_str': 'movie'},
 {'score': 0.056345004588365555,
  'sequence': '[CLS] i watched, and that was awesome [SEP]',
  'token': 1010,
  'token_str': ','},
 {'score': 0.013144557364284992,
  'sequence': '[CLS] i watched this and that was awesome [SEP]',
  'token': 2023,
  'token_str': 'this'},
 {'score': 0.012886741198599339,
  'sequence': '[CLS] i watched one and that was awesome [SEP]',
  'token': 2028,
  'token_str': 'one'}]