Commit 66ff81ba authored by mjboos

newest

parent 0559937c
@@ -17,6 +17,7 @@ import helpers as hlp
import models
import preprocessing as pre
from keras import optimizers
from keras.layers import Bidirectional, TimeDistributed
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import json
import feature_engineering
@@ -48,11 +49,23 @@ def make_callback_list(model_name, save_weights=True, patience=10):
checkpoints.append(checkpoint)
return checkpoints
def continue_training_DNN_last_layer(model_name, old_model_name, fit_args, *args, **kwargs):
best_weights_path="{}_best.hdf5".format(model_name)
old_weights_path="{}_best.hdf5".format(old_model_name)
model = models.Embedding_Blanko_DNN(**kwargs)
model.model.load_weights(old_weights_path)
model.model = freeze_layers(model.model, unfrozen_keyword='main_output')
callbacks_list = make_callback_list(best_weights_path, patience=5)
fit_args['callbacks'] = callbacks_list
model.fit(*args, **fit_args)
model.model.load_weights(best_weights_path)
return model
def continue_training_DNN(model_name, fit_args, *args, **kwargs):
best_weights_path="{}_best.hdf5".format(model_name)
model = models.Embedding_Blanko_DNN(**kwargs)
model.model.load_weights(best_weights_path)
callbacks_list = make_callback_list(model_name+'_more', patience=3)
callbacks_list = make_callback_list(model_name+'_more', patience=5)
fit_args['callbacks'] = callbacks_list
model.fit(*args, **fit_args)
model.model.load_weights(best_weights_path)
@@ -129,16 +142,50 @@ def transfer_weights_multi_to_one(weights, model, i):
# now for the last layer
model.layers[-1].set_weights([weights[-1][0][:,i][:,None], weights[-1][1][i][None]])
def change_trainable(layer, trainable, verbose=False):
""" Helper method that fixes some of Keras' issues with wrappers and
trainability. Freezes or unfreezes a given layer.
# Arguments:
layer: Layer to be modified.
trainable: Whether the layer should be frozen or unfrozen.
verbose: Verbosity flag.
"""
layer.trainable = trainable
if type(layer) == Bidirectional:
layer.backward_layer.trainable = trainable
layer.forward_layer.trainable = trainable
    if type(layer) == TimeDistributed:
        # TimeDistributed is a single-layer wrapper: the wrapped layer is exposed as .layer
        layer.layer.trainable = trainable
if verbose:
action = 'Unfroze' if trainable else 'Froze'
print("{} {}".format(action, layer.name))
def extend_and_finetune_last_layer_model(model_name, fit_args, train_X, train_y, test_text, **kwargs):
    '''Fits one model per label (indices 0-5), fine-tuning only the last layer on top of the weights saved for model_name, and dumps each model's test-set predictions'''
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, clipnorm=1.)
for i in xrange(6):
new_name = model_name + '_{}'.format(i)
model = continue_training_DNN_last_layer(new_name, model_name, fit_args, train_X, train_y[:,i], **kwargs)
joblib.dump(model.predict(test_text), '{}.pkl'.format(new_name))
K.clear_session()
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, clipnorm=1.)
def fine_tune_model(model_name, old_model, fit_args, train_X, train_y, test_text, **kwargs):
    '''Fits one model per label (indices 0-5), transferring the weights of old_model via continue_training_DNN_one_output, and dumps each model's test-set predictions'''
weights = [layer.get_weights() for layer in old_model.layers]
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizers'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
for i in xrange(6):
new_name = model_name + '_{}'.format(i)
model = continue_training_DNN_one_output(new_name, i, weights, fit_args, train_X, train_y[:,i], **kwargs)
joblib.dump(model.predict(test_text), '{}.pkl'.format(new_name))
K.clear_session()
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizers'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
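# Editor's sketch, not part of this commit: the two loops above write one prediction file
# per label ('{model_name}_{i}.pkl'); they can be reassembled into a single (n_samples, 6)
# array like this. Assumes numpy is available as np in this module and that each pickle
# holds an (n_samples, 1) prediction array.
def collect_label_predictions_sketch(model_name):
    preds = [joblib.load('{}_{}.pkl'.format(model_name, i)) for i in xrange(6)]
    return np.concatenate(preds, axis=-1)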
@@ -35,7 +35,7 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 100, 'epochs' : 30,
fit_args = {'batch_size' : 80, 'epochs' : 30,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
train_text, train_y = pre.load_data()
@@ -132,7 +132,7 @@ if __name__=='__main__':
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
model_params = simple_attention()
model_name = '300_fasttext_attention_GRU'
model_name = '300_fasttext_attention_diffpre2_GRU'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
@@ -142,14 +142,14 @@ if __name__=='__main__':
# np.random.shuffle(row_idx)
# train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
# model = load_keras_model(model_name)
model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = fit_model(model_name, fit_args, {'main_input':train_text, 'aug_input':train_data_augmentation}, {'main_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
# hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
model = fit_model(model_name, fit_args, {'main_input':train_text, 'aug_input':train_data_augmentation}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
# K.clear_session()
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
model_params = simple_attention_1d()
fine_tune_model(model_name, model.model, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model_params = simple_attention_1d()
# extend_and_finetune_last_layer_model(model_name, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
@@ -169,25 +169,32 @@ def prune_zero_vectors(matrix):
matrix = matrix[np.array([True] + [vec.any() for vec in matrix[1:-1]] + [True])]
return matrix
def which_words_are_zero_vectors(embedding_matrix, word_index, oov_token):
def which_words_are_zero_vectors(embedding_matrix, word_index, oov_token, exclude_ids=False):
    '''Returns the words whose embedding-matrix rows are all zero (i.e. not found in the pretrained embedding) or that lie outside max_features'''
word_list = []
DEBUG_cnt = 0
for word, i in word_index.items():
if word == oov_token:
continue
if exclude_ids:
if word.startswith('_') and word.endswith('_'):
DEBUG_cnt += 1
continue
if i >= embedding_matrix.shape[0]:
# word is out of max features
word_list.append(word)
elif not embedding_matrix[i].any():
# word is a zero vector
word_list.append(word)
    print('{} id/meta tokens excluded from the zero-vector check'.format(DEBUG_cnt))
return word_list
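# Editor's sketch, not part of this commit: typical diagnostic use of the helper above.
# The oov token name depends on how the tokenizer was configured, so it is passed in here.
def report_missing_words_sketch(embedding_matrix, word_index, oov_token):
    missing = which_words_are_zero_vectors(embedding_matrix, word_index,
                                           oov_token, exclude_ids=True)
    print('{} of {} words have no pretrained vector'.format(len(missing), len(word_index)))
    return missing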
#TODO: more flexible spelling correction
def make_embedding_matrix(embedding, word_index, max_features=20000, maxlen=200, embedding_dim=50, correct_spelling=None, diagnostics=False, **kwargs):
def make_embedding_matrix(embedding, word_index, max_features=20000, maxlen=200, embedding_dim=50, meta_features=True, **kwargs):
num_words = min(max_features, len(word_index))
# add one element for zero vector, else initialize randomly
# add one element for zero vector
embedding_matrix = np.zeros((num_words+1, embedding_dim))
for word, i in word_index.items():
if i >= max_features:
continue
@@ -195,21 +202,10 @@ def make_embedding_matrix(embedding, word_index, max_features=20000, maxlen=200,
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
else:
if correct_spelling:
# replace with autocorrected word IF this word is in embeddings
suggested_word = correct_spelling(word)
embedding_vector = embedding.get(suggested_word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
# check which words are not recognized
if diagnostics:
word_list = which_words_are_zero_vectors(embedding_matrix, word_index)
print('WORDs not found: {}'.format(len(word_list)))
print('################################')
with open('../notfound.txt', 'w+') as fl:
json.dump(word_list, fl)
# add random activations for meta features
elif meta_features:
if word.startswith('_') and word.endswith('_'):
embedding_matrix[i] = np.random.uniform(-1, 1, size=(embedding_dim,))
return embedding_matrix
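# Editor's sketch, not part of this commit: quick check of which rows the meta tokens
# received. Assumes `embedding` maps words to 1-D numpy arrays (as returned by
# hlp.get_fasttext_embedding in the training script) and that the meta tokens below
# survived tokenization; both are assumptions, not guarantees.
def meta_token_rows_sketch(embedding, word_index, embedding_dim=300):
    matrix = make_embedding_matrix(embedding, word_index,
                                   max_features=len(word_index), embedding_dim=embedding_dim)
    tokens = ['_url_', '_ip_', '_user_', '_number_']
    return {t: matrix[word_index[t]] for t in tokens
            if t in word_index and word_index[t] < matrix.shape[0]}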
@@ -308,6 +304,8 @@ class Embedding_Blanko_DNN(BaseEstimator):
def predict(self, X):
if isinstance(X, dict):
X['main_input'] = self.tokenizer.transform(X['main_input'])
else:
X = self.tokenizer.transform(X)
return self.model.predict(X)
def weighted_binary_crossentropy(y_true, y_pred, weights):
......
@@ -13,6 +13,9 @@ import re, string
from sklearn.base import BaseEstimator, TransformerMixin
import feature_engineering
import string
import json
from functools import partial
eng_stopwords = set(stopwords.words("english"))
memory = joblib.Memory(cachedir='/home/mboos/joblib')
@@ -36,12 +39,28 @@ control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
control_char_re = re.compile('[%s]' % re.escape(control_chars))
#with open('bad_words_translation.json', 'r') as fl:
# bad_word_dict = json.load(fl)
bad_word_dict = joblib.load('bad_words_misspellings.pkl')
some_bad_words = joblib.load('some_bad_words.pkl')
def check_for_duplicates(word, zero_words):
regex = r'^' + ''.join('[{}]+'.format(c) for c in word) + '$'
matches = [re.search(regex, s) for s in zero_words]
is_match = np.array([m is not None for m in matches])
return is_match, np.where(is_match)[0]
def replacement_regex(word):
regex = r'\b' + ''.join('[{}]+'.format(c) for c in word) + r'\b'
return regex
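# Editor's sketch, not part of this commit: replacement_regex('fuck') yields
# r'\b[f]+[u]+[c]+[k]+\b', so character-repeated spellings collapse to the canonical
# word before tokenization.
def _replacement_regex_demo():
    pattern = replacement_regex('fuck')               # r'\b[f]+[u]+[c]+[k]+\b'
    return re.sub(pattern, ' fuck ', u'fuuuuck you')  # -> u' fuck  you'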
def remove_control_chars(s):
return control_char_re.sub('', s)
def clean_comment(text):
def clean_comment(text, replace_misspellings=False):
import unicodedata as ud
text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
text = text.lower()
text = re.sub(r'[^\x00-\x7f]', r' ' , text)
text = re.sub(r'[\n\r]', r' ', text)
s = re.sub(r"what's", "what is ", text, flags=re.IGNORECASE)
@@ -54,23 +73,47 @@ def clean_comment(text):
s = re.sub(r"\'d", " would ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ll", " will ", s, flags=re.IGNORECASE)
s = re.sub(r"\'scuse", " excuse ", s, flags=re.IGNORECASE)
s = re.sub(r'([_])', r' \1 ', s)
s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
s = re.sub(r'\b[a-z]+\d+\b', ' _user_ ', s)
s = re.sub(r'\b[0-9]+\b', ' _number_ ', s)
#hard coded replacements
for bad_word in some_bad_words:
s = re.sub(replacement_regex(bad_word), ' ' + bad_word + ' ', s)
s = re.sub(r'\bfukc\b', ' fuck ', s)
s = re.sub(r'\bfcuk\b', ' fuck ', s)
s = re.sub(r'\bfucc\b', ' fuck ', s)
s = re.sub(r'\bfukk\b', ' fuck ', s)
s = re.sub(r'\bfukker\b', ' fuck ', s)
s = re.sub(r'\bfucka\b', ' fucker ', s)
#wikipedia specific features
s = re.sub(r'(?<=\(talk\)).*?(?=$)', ' _date_ ', s)
s = re.sub(r'\b\(talk\)', ' _wikipedia_ ', s)
s = re.sub(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' _url_ ', s)
s = re.sub(ur'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
#without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
    # pad punctuation with spaces so each mark becomes its own token
s = re.sub(r'([.,!?():;_^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)
s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)
s = re.sub(r"(['])", r' \1 ', s)
s = re.sub('\s{2,}', ' ', s)
if replace_misspellings:
for key, val in bad_word_dict.iteritems():
s = re.sub(r'\b{}\b'.format(key.lower()), ' '+val.lower()+' ', s)
return s.encode('utf-8')
@memory.cache
def data_preprocessing(df):
def data_preprocessing(df, replace_misspellings=False):
df['comment_text'].fillna(' ', inplace=True)
df['comment_text'] = df['comment_text'].apply(clean_comment)
clean_comment_dummy = partial(clean_comment, replace_misspellings=replace_misspellings)
df['comment_text'] = df['comment_text'].apply(clean_comment_dummy)
return df
def load_data(name='train.csv', preprocess=True, cut=False):
def load_data(name='train.csv', preprocess=True, cut=False, replace_misspellings=False):
data = pd.read_csv('../input/{}'.format(name), encoding='utf-8')
if preprocess:
data = data_preprocessing(data)
data = data_preprocessing(data, replace_misspellings=replace_misspellings)
if cut and name=='train.csv':
# these comments are often (or always) mis-labeled
not_toxic_but_nz = np.logical_and(data.iloc[:,2].values==0, data.iloc[:,2:].values.any(axis=1))
......