
!pip install transformers
Collecting transformers
  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
     |████████████████████████████████| 778kB 2.8MB/s
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)
Collecting sentencepiece!=0.1.92
  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
     |████████████████████████████████| 1.1MB 13.3MB/s
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)
Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)
Collecting sacremoses
  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
     |████████████████████████████████| 890kB 19.7MB/s
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)
Collecting tokenizers==0.8.1.rc1
  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
     |████████████████████████████████| 3.0MB 20.6MB/s
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)
Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... done
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=6bda503b3bbdbf7626ff38a3e3235c3a20e948d97c40c78a681f5c87b90ed237
  Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
Successfully built sacremoses
Installing collected packages: sentencepiece, sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.1rc1 transformers-3.0.2
import os
import logging
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm
from transformers import AutoModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

1. Set Configuration

class Config:
  train_file = './data.csv'
  eval_file = './eval.csv'
  max_seq_len = 128
  batch_size = 32
  epochs = 5
  model_name = 'bert-base-uncased'
  learning_rate = 2e-5
  n_classes = 3
  device = 'cpu'



flags = Config

2. Build Dataset Pipeline

class TextLabelDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt',
          truncation=True
        )

        return {
          'texts': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'targets': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, max_len, batch_size, is_prediction=False):

  # Accept either a path to a CSV file or an already loaded DataFrame.
  if isinstance(df, str):
    df = pd.read_csv(df)

  if is_prediction:
    # No ground-truth labels at prediction time, so use -1 placeholders.
    ds = TextLabelDataset(
        texts=df.text.to_numpy(),
        labels=np.array([-1] * len(df.text.values)),
        tokenizer=tokenizer,
        max_len=max_len
        )
  else:
    ds = TextLabelDataset(
        texts=df.text.to_numpy(),
        labels=df.labels.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
        )

  # Return a DataLoader for both the labeled and the prediction case.
  return DataLoader(
      ds,
      batch_size=batch_size,
      num_workers=4
      )
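
A minimal sketch of what this pipeline produces, using a tiny in-memory DataFrame instead of a CSV file (the sample texts and the printed shapes are purely illustrative):

# Build a two-row DataFrame and feed it through create_data_loader.
sample_df = pd.DataFrame({'text': ['great movie', 'terrible plot'], 'labels': [1, 0]})
sample_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sample_loader = create_data_loader(sample_df, sample_tokenizer, max_len=32, batch_size=2)

# One batch holds padded token ids, the attention mask, and the integer labels.
batch = next(iter(sample_loader))
print(batch['input_ids'].shape)       # torch.Size([2, 32])
print(batch['attention_mask'].shape)  # torch.Size([2, 32])
print(batch['targets'])               # tensor([1, 0])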

3. Build Model

class Classifier(nn.Module):

  def __init__(self, model_name, n_classes):
      super(Classifier, self).__init__()
      self.bert = AutoModel.from_pretrained(model_name)
      self.drop = nn.Dropout(p=0.3)
      self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
      # In transformers 3.x the BERT model returns a tuple; the second element
      # is the pooled [CLS] representation used here for classification.
      _, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      output = self.drop(pooled_output)
      return self.out(output)


class ClassificationModel:

  def __init__(self, flags):
    self.flags = flags
    self.tokenizer = BertTokenizer.from_pretrained(self.flags.model_name)
    self.model = Classifier(self.flags.model_name, self.flags.n_classes)
    self.model = self.model.to(self.flags.device)

  def train(self):

    train_data_loader = create_data_loader(self.flags.train_file, self.tokenizer, self.flags.max_seq_len, self.flags.batch_size)
    val_data_loader = create_data_loader(self.flags.eval_file, self.tokenizer, self.flags.max_seq_len, self.flags.batch_size)

    optimizer = AdamW(self.model.parameters(), lr=self.flags.learning_rate, correct_bias=False)
    total_steps = len(train_data_loader) * self.flags.epochs

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss().to(self.flags.device)

    history = defaultdict(list)
    best_accuracy = 0

    # train_file / eval_file may be a CSV path or an already loaded DataFrame;
    # their row counts are needed below to compute accuracies.
    train_df = pd.read_csv(self.flags.train_file) if isinstance(self.flags.train_file, str) else self.flags.train_file
    eval_df = pd.read_csv(self.flags.eval_file) if isinstance(self.flags.eval_file, str) else self.flags.eval_file

    for epoch in range(self.flags.epochs):

      print(f'Epoch {epoch + 1}/{self.flags.epochs}')
      print('-' * 10)

      train_acc, train_loss = self.train_epoch(
        self.model,
        train_data_loader,    
        loss_fn, 
        optimizer, 
        self.flags.device, 
        scheduler, 
        len(train_df)
      )

      print(f'Train loss {train_loss} accuracy {train_acc}')

      val_acc, val_loss = self.eval_model(
        self.model,
        val_data_loader,
        loss_fn, 
        self.flags.device, 
        len(eval_df)
      )

      print(f'Val   loss {val_loss} accuracy {val_acc}')
      print()

      history['train_acc'].append(train_acc)
      history['train_loss'].append(train_loss)
      history['val_acc'].append(val_acc)
      history['val_loss'].append(val_loss)

      if val_acc > best_accuracy:
        torch.save(self.model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc


  def train_epoch(self, model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0
    tk0 = tqdm(data_loader, total=len(data_loader), desc="Training")
    for bi, d in enumerate(tk0):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

  def eval_model(self, model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
      tk0 = tqdm(data_loader, total=len(data_loader), desc="Evaluating")
      for bi, d in enumerate(tk0):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)

        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)
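
The is_prediction=True branch of create_data_loader is not exercised by the training loop above. As a hedged sketch, a helper along the following lines (the predict_file name is illustrative, not part of the original code) could run a trained ClassificationModel over an unlabeled CSV that has a text column:

def predict_file(classification, csv_path):
  # Illustrative helper: batch inference over a CSV using the prediction branch
  # of create_data_loader; returns predicted class ids in row order.
  flags = classification.flags
  loader = create_data_loader(csv_path, classification.tokenizer,
                              flags.max_seq_len, flags.batch_size,
                              is_prediction=True)
  classification.model.eval()
  all_preds = []
  with torch.no_grad():
    for d in loader:
      logits = classification.model(
        input_ids=d['input_ids'].to(flags.device),
        attention_mask=d['attention_mask'].to(flags.device)
      )
      all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
  return all_preds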



Download and Prepare the Data

!wget https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv
--2020-08-26 15:57:14--  https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23677025 (23M) [text/plain]
Saving to: ‘imdb_tr.csv’

imdb_tr.csv         100%[===================>]  22.58M  39.9MB/s    in 0.6s

2020-08-26 15:57:16 (39.9 MB/s) - ‘imdb_tr.csv’ saved [23677025/23677025]
data = pd.read_csv('imdb_tr.csv', encoding = "ISO-8859-1")
data.columns = ['row_Number', 'text', 'labels']
# Use small random subsets to keep the demo fast; the two samples are drawn
# independently, so a few rows may appear in both splits.
train_data = data.sample(1000)
test_data = data.sample(100)

train_data.to_csv('data.csv', index=False)
test_data.to_csv('eval.csv', index=False)
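
An optional sanity check (not in the original notebook) to confirm that both sentiment classes appear in the random samples before training:

print(train_data.labels.value_counts())
print(test_data.labels.value_counts())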

Training

from collections import defaultdict
import numpy as np
class Config:
  train_file = './data.csv'
  eval_file = './eval.csv'
  max_seq_len = 128
  batch_size = 32
  epochs = 5
  model_name = 'bert-base-uncased'
  learning_rate = 2e-5
  n_classes = 2
  device = 'cuda'

flags = Config
classification = ClassificationModel(flags)
classification.train()
Epoch 1/5
----------
Train loss 0.6540139000862837 accuracy 0.622
Val   loss 0.4141501262784004 accuracy 0.78

Epoch 2/5
----------
Train loss 0.3276493112789467 accuracy 0.864
Val   loss 0.3254726273007691 accuracy 0.87

Epoch 3/5
----------
Train loss 0.12970392164424993 accuracy 0.9530000000000001
Val   loss 0.4319960339926183 accuracy 0.8300000000000001

Epoch 4/5
----------
Train loss 0.0639086696319282 accuracy 0.982
Val   loss 0.5208611574489623 accuracy 0.8300000000000001

Epoch 5/5
----------
Train loss 0.01748604617023375 accuracy 0.996
Val   loss 0.4388579736405518 accuracy 0.9
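
As a final, hedged sketch (not part of the original run), the checkpoint saved as best_model_state.bin can be reloaded for single-text inference; the sample review below is illustrative:

# Rebuild the tokenizer and classifier, then load the best checkpoint.
tokenizer = BertTokenizer.from_pretrained(flags.model_name)
model = Classifier(flags.model_name, flags.n_classes)
model.load_state_dict(torch.load('best_model_state.bin', map_location=flags.device))
model = model.to(flags.device)
model.eval()

sample_text = "A surprisingly touching film with great performances."
encoding = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,
    max_length=flags.max_seq_len,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
    truncation=True
)

with torch.no_grad():
  logits = model(
    input_ids=encoding['input_ids'].to(flags.device),
    attention_mask=encoding['attention_mask'].to(flags.device)
  )
  prediction = torch.argmax(logits, dim=1).item()

print(f'Predicted label: {prediction}')  # predicted polarity label (0 or 1) for imdb_tr.csv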