from nlp_preprocessing.dataset import Dataset

Dataset allows splitting data into train and test sets and encoding columns (label or one-hot). It supports both multi-label and multi-class splits.

class Dataset[source]

Dataset(data_config)

Dataset allows splitting and encoding using an external config dict

Args:

data_config (dict): config dict
example:
    data_config = {
                'data_class':'multi-class',
                'x_columns':[],
                'y_columns':[],
                'one_hot_encoded_columns':[],
                'label_encoded_columns':[],
                'data':None,
                'split_ratio':0.2,
                'random_state':3107
            }
    where data_class is one of ['multi-class','multi-label'] and data should be a pandas DataFrame

Example

import pandas as pd
text = ['I am Test 1','I am Test 2']
label = ['A','B']
aspect = ['C','D']
data = pd.DataFrame({'text':text*5,'label':label*5,'aspect':aspect*5})
data
text label aspect
0 I am Test 1 A C
1 I am Test 2 B D
2 I am Test 1 A C
3 I am Test 2 B D
4 I am Test 1 A C
5 I am Test 2 B D
6 I am Test 1 A C
7 I am Test 2 B D
8 I am Test 1 A C
9 I am Test 2 B D
data_config = {
                'data_class':'multi-class',
                'x_columns':['text','aspect'],
                'y_columns':['label'],
                'one_hot_encoded_columns':[],
                'label_encoded_columns':['label','aspect'],
                'data':data,
                'split_ratio':0.2
              }
dataset = Dataset(data_config)
dataset.data_config
{'data_class': 'multi-class',
 'x_columns': ['text', 'aspect'],
 'y_columns': ['label'],
 'one_hot_encoded_columns': [],
 'label_encoded_columns': ['label', 'aspect'],
 'data':           text label aspect
 0  I am Test 1     A      C
 1  I am Test 2     B      D
 2  I am Test 1     A      C
 3  I am Test 2     B      D
 4  I am Test 1     A      C
 5  I am Test 2     B      D
 6  I am Test 1     A      C
 7  I am Test 2     B      D
 8  I am Test 1     A      C
 9  I am Test 2     B      D,
 'split_ratio': 0.2,
 'random_state': 3107}
train, test = dataset.get_train_test_data()
train['Y_train'],train['X_train']
({'label': array([0, 1, 1, 0, 0, 1, 0, 1])},
 {'text': array(['I am Test 1', 'I am Test 2', 'I am Test 2', 'I am Test 1',
         'I am Test 1', 'I am Test 2', 'I am Test 1', 'I am Test 2'],
        dtype=object), 'aspect': array([0, 1, 1, 0, 0, 1, 0, 1])})
test['Y_test'],test['X_test']
({'label': array([0, 1])},
 {'text': array(['I am Test 1', 'I am Test 2'], dtype=object),
  'aspect': array([0, 1])})