Text Cleaning

The nlp_preprocessing.clean module provides an efficient way to clean text.

from nlp_preprocessing.clean import *

The Clean class initializer takes an ordered list of cleaning function names (the available functions are listed in CLEAN_FUNS below), and its __call__ method applies each function, in order, to a list of strings; a short sketch of how the steps chain together follows the list.

CLEAN_FUNS = {
    'to_lower' : to_lower(),
    'to_normalize' : to_normalize(),
    'remove_href' : remove_href(),
    'remove_control_char' : remove_control_char(),
    'remove_duplicate' : remove_duplicate(),
    'remove_underscore' : remove_underscore(),
    'seperate_spam_chars' : seperate_spam_chars(),
    'seperate_brakets_quotes' : seperate_brakets_quotes(),
    'break_short_words' : break_short_words(),
    'break_long_words' : break_long_words(),
    'remove_ending_underscore' : remove_ending_underscore(),
    'remove_starting_underscore' : remove_starting_underscore(),
    'seperate_end_word_punctuations' : seperate_end_word_punctuations(),
    'seperate_start_word_punctuations' : seperate_start_word_punctuations(),
    'clean_contractions' : clean_contractions(),
    'remove_s' : remove_s(),
    'isolate_numbers' : isolate_numbers(),
    'regex_split_word' : regex_split_word(),
    'leet_clean' : leet_clean(),
    'clean_open_holded_words' : clean_open_holded_words(),
    'clean_multiple_form' : clean_multiple_form()
}
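The sketch below is only an illustration of the behaviour described above, not the library's actual implementation. It assumes that each cleaning function takes the full list of strings and returns the cleaned list (as the examples further down suggest), so the steps can simply be chained in order.

from nlp_preprocessing.clean import *

def run_pipeline(texts, steps):
    # Apply each cleaning function to the whole list, in the given order.
    for step in steps:
        texts = step(texts)
    return texts

texts = ['Hi, how are you', "I am's good"]
cleaned = run_pipeline(texts, [to_lower, remove_s, seperate_end_word_punctuations])
print(cleaned)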

class Clean[source]

Clean(clean_fn_ordered_list=['to_lower', 'to_normalize', 'remove_href', 'remove_control_char', 'remove_duplicate', 'remove_underscore', 'seperate_spam_chars', 'seperate_brakets_quotes', 'break_short_words', 'break_long_words', 'remove_ending_underscore', 'remove_starting_underscore', 'seperate_end_word_punctuations', 'seperate_start_word_punctuations', 'clean_contractions', 'remove_s', 'isolate_numbers', 'regex_split_word', 'leet_clean', 'clean_open_holded_words', 'clean_multiple_form'])

Examples:

texts = ['Hi, how are you', "I am's good"]
cleaned_text = Clean()(texts)
print('Output :', cleaned_text)
########## Step - Lowering everything:
########## Step - Normalize chars and dots:
########## Step - Remove hrefs:
########## Step - Control Chars:
########## Step - Duplicated Chars:
Total Words : 7
########## Step - Remove underscore:
Total Words : 7
['hi, how are you']
########## Step - Spam chars repetition:
Total Words : 7
{}
########## Step - Brackets and quotes:
########## Step - Break long words:
########## Step - Break long words:
########## Step - Remove ending underscore:
Total Words : 7
########## Step - Remove starting underscore:
Total Words : 7
########## Step - End word punctuations:
hi, --- hi ,
########## Step - Start word punctuations:
########## Step - Contractions:
Total Words : 8
########## Step - Remove "s:
Total Words : 8
am's --- am
########## Step - Isolate numbers:
Total Words : 8
Total Words : 8
, ---  , 
########## Step - L33T (with vocab check):
Total Words : 8
########## Step - Open Holded words:
########## Step - Multiple form:
Total Words : 8
Output : ['hi , how are you', 'i am good']
texts = ['Hi, how are you', "I am's good"]
cleaned_text = Clean(['to_lower','remove_s','seperate_end_word_punctuations'])(texts)
print('Output :', cleaned_text)
########## Step - Lowering everything:
########## Step - Remove "s:
Total Words : 7
am's --- am
########## Step - End word punctuations:
hi, --- hi ,
Output : ['hi , how are you', 'i am good']

All cleaning functions take a list of strings as input and map the cleaning operation over it.
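Each of these functions can also be called on its own. The example below is a minimal sketch: the commented result of to_lower is illustrative only, and strip_extra_whitespace is a hypothetical helper (not part of the library) included just to show the expected list-in/list-out contract.

from nlp_preprocessing.clean import *

texts = ['Hello, World']
lowered = to_lower(texts)   # illustrative result: ['hello, world']

# A custom step only needs the same contract: a list of strings in, a list of strings out.
def strip_extra_whitespace(data):
    return [' '.join(text.split()) for text in data]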

to_lower[source]

to_lower(data)

to_normalize[source]

to_normalize(data)

remove_href[source]

remove_href(data)

remove_control_char[source]

remove_control_char(data)

remove_duplicate[source]

remove_duplicate(data)

remove_underscore[source]

remove_underscore(data)

seperate_spam_chars[source]

seperate_spam_chars(data)

seperate_brakets_quotes[source]

seperate_brakets_quotes(data)

break_short_words[source]

break_short_words(data)

break_long_words[source]

break_long_words(data)

remove_ending_underscore[source]

remove_ending_underscore(data)

remove_starting_underscore[source]

remove_starting_underscore(data)

seperate_end_word_punctuations[source]

seperate_end_word_punctuations(data)

seperate_start_word_punctuations[source]

seperate_start_word_punctuations(data)

clean_contractions[source]

clean_contractions(data)

remove_s[source]

remove_s(data)

isolate_numbers[source]

isolate_numbers(data)

regex_split_word[source]

regex_split_word(data)

leet_clean[source]

leet_clean(data)

clean_open_holded_words[source]

clean_open_holded_words(data)

clean_multiple_form[source]

clean_multiple_form(data)