Commit 5a6147f4 authored by mjboos

recent

parent 96ebf3ff
......@@ -270,6 +270,14 @@ def simple_attention_word_dropout(trainable=False, prune=True):
model_params['model_function'] = partial(models.RNN_time_dropout_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
return model_params
def simple_huge_aux_net(trainable=False, prune=True):
model_func = partial(models.RNN_conc_aux, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : {'main_output':1., 'aux_output' : 0.1}}}
return model_params
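Illustrative only (not part of this commit): how a parameter dict returned by simple_huge_aux_net is consumed downstream. The caller pops 'optimizer_func'/'optimizer_args', instantiates the optimizer, and hands the remaining compilation_args to Keras' model.compile, mirroring the unpacking done in the training script further below.

params = simple_huge_aux_net(prune=True)
comp_args = params['compilation_args']
opt_func = comp_args.pop('optimizer_func')      # optimizers.Adam
opt_args = comp_args.pop('optimizer_args')      # {'lr': 0.0005, 'clipnorm': 1.}
comp_args['optimizer'] = opt_func(**opt_args)   # concrete Adam instance
# comp_args now holds 'optimizer', 'loss' and 'loss_weights', i.e. what model.compile(**comp_args) expects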
def simple_huge_net(trainable=False, prune=True):
model_func = partial(models.RNN_conc, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96)
model_params = {
......@@ -278,6 +286,14 @@ def simple_huge_net(trainable=False, prune=True):
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_huge_dropout_net(trainable=False, prune=True):
model_func = partial(models.RNN_dropout_conc, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_small_trainable_net(trainable=False, prune=True):
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1, hidden_rnn=96, hidden_dense=64)
model_params = {
......@@ -310,4 +326,12 @@ def add_net():
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def old_gru_net(trainable=False, prune=True):
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1., 'beta_2' : 0.99}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
......@@ -26,7 +26,7 @@ from DNN import *
memory = joblib.Memory(cachedir='/home/mboos/joblib')
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_main_output_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=10)
def schedule(ind):
a = [0.002,0.002,0.002,0.001,0.001]
......@@ -35,15 +35,14 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 128, 'epochs' : 30,
fit_args = {'batch_size' : 80, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
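A sketch of how fit_args and callbacks_list are meant to be unpacked into the Keras fit call; `model`, `train_x` and `train_y` are placeholders for the objects built further down, not names from this commit.

model.fit({'main_input': train_x}, {'main_output': train_y}, **fit_args)
# expands to batch_size=80, epochs=20, validation_split=0.2, callbacks=[checkpoint, early]:
# ModelCheckpoint writes the best val_main_output_loss weights to weights_base.best.hdf5,
# EarlyStopping aborts after 10 epochs without val_loss improvement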
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
if __name__=='__main__':
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
import keras_lr_finder as lrf
# for toxic skip
aux_task = train_y[:,0]
aux_task = feature_engineering.compute_features(train_text, which_features=['bad_word'])
# train_y = np.delete(train_y, 0, axis=1)
train_data_augmentation = pre.pad_and_extract_capitals(train_text)[..., None]
test_data_augmentation = pre.pad_and_extract_capitals(test_text)[..., None]
......@@ -51,27 +50,35 @@ if __name__=='__main__':
weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
model_params = simple_small_trainable_net(trainable=True, prune=True)
model_name = '300_fasttext_trainable_all_GRU'
model_params = simple_huge_aux_net(prune=True)
# model_params['compilation_args']['loss']['main_output'] = models.roc_auc_score
model_name = '300_fasttext_aux_conc_GRU'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
list_of_tokens = frozen_tokenizer.tokenizer.texts_to_sequences(pd.concat([train_text, test_text]))
embedding = hlp.get_glove_embedding('../glove.twitter.27B.200d.txt')
# list_of_tokens = frozen_tokenizer.tokenizer.texts_to_sequences(pd.concat([train_text, test_text]))
embedding = hlp.get_glove_embedding('../crawl-300d-2M.vec')
opt = model_params['compilation_args'].pop('optimizer_func')
optargs = model_params['compilation_args'].pop('optimizer_args')
model_params['compilation_args']['optimizer'] = opt(**optargs)
model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params)
optargs['lr'] = 0.0005
model_params['compilation_args']['optimizer'] = opt(beta_2=0.99, **optargs)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y, 'aux_output' : aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict({'main_input':test_text})[0])
# hlp.make_training_set_preds(model, {'main_input':train_text}, train_y)
# model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params)
# old_model.load_weights(model_name+'_best.hdf5')
lrfinder = lrf.LRFinder(model.model)
train_x = frozen_tokenizer.transform(train_text)
lrfinder.find(train_x, train_y, 0.0001, 0.01, batch_size=80, epochs=1)
lrfinder.plot_loss()
plt.savefig('losses_small.svg')
plt.close()
lrfinder.plot_loss_change()
plt.savefig('loss_change_small.svg')
plt.close()
# joblib.dump([lrfinder.losses, lrfinder.lrs], 'lrfinder.pkl')
# lrfinder = lrf.LRFinder(model.model)
# train_x = frozen_tokenizer.transform(train_text)
# lrfinder.find(train_x, train_y, 0.0001, 0.01, batch_size=80, epochs=1)
# lrfinder.losses = [np.log(loss) for loss in lrfinder.losses]
# lrfinder.plot_loss()
# plt.savefig('losses_aux.svg')
# plt.close()
# lrfinder.plot_loss_change()
# plt.savefig('loss_change_aux.svg')
# plt.close()
# joblib.dump([lrfinder.losses, lrfinder.lrs], 'lrfinder_aux.pkl')
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# SHUFFLE TRAINING SET so validation split is different every time
......@@ -80,9 +87,6 @@ if __name__=='__main__':
# train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
# model = load_keras_model(model_name)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, list_of_tokens=list_of_tokens, **model_params)
# hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
# hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
......
......@@ -14,10 +14,48 @@ import re, string
from sklearn.base import BaseEstimator, TransformerMixin
import string
import langid
import preprocessing as pre
bad_word_dict = joblib.load('bad_words_misspellings.pkl')
some_bad_words = joblib.load('some_bad_words.pkl')
some_bad_words2 = [u'bastard',
u'jerk',
u'moron',
u'idiot',
u'retard',
u'assfucker',
u'arsehole',
u'nazi',
u'assfuck',
u'fuckhead',
u'fuckwit',
u'cocksucker',
u'asshole',
u'bullshit',
u'motherfucker',
u'fucked',
u'shit',
u'fuck',
u'fucking',
u'gay',
u'fag',
u'faggot',
u'bitch',
u'whore',
u'fucker',
u'nigg',
u'nigger']
some_bad_words = list(set(some_bad_words+some_bad_words2))
eng_stopwords = set(stopwords.words("english"))
memory = joblib.Memory(cachedir='/home/mboos/joblib')
with open('bad-words.txt', 'r') as fl:
other_bad_words = fl.readlines()
def count_symbol(row, symbol='!'):
return row.count(symbol)
......@@ -36,15 +74,27 @@ def proportion_unique_words(row):
def language_identity(row):
return langid.classify(row)[0]
bad_word_regex = '(' + '|'.join([r'\b'+bw+r'\b' for bw in some_bad_words])+')'
def contains_bad_word(row):
match = re.search(bad_word_regex, row)
return match is not None
bad_word_regex2 = '(' + '|'.join(some_bad_words+list(np.unique(bad_word_dict.keys())))+')'
def contains_bad_word2(row):
match = re.search(bad_word_regex2, row)
return match is not None
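A standalone illustration (not part of this commit) of why the \b word boundaries in bad_word_regex matter; the two-word list is a stand-in for the pickled lists loaded above.

import re
words = [u'idiot', u'moron']                                    # stand-in vocabulary
regex = '(' + '|'.join([r'\b' + w + r'\b' for w in words]) + ')'
print(re.search(regex, u'what an idiot') is not None)           # True: whole-word hit
print(re.search(regex, u'idiotic plan') is not None)            # False: the boundary blocks substring hits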
feature_mapping_dict = {
'count_symbol' : count_symbol,
'bad_word' : contains_bad_word,
'bad_word2' : contains_bad_word2,
'count_capitals' : count_capitals,
'proportion_capitals' : proportion_capitals,
'num_unique_words' : num_unique_words,
'proportion_unique_words' : proportion_unique_words,
'language' : language_identity}
'proportion_unique_words' : proportion_unique_words}
@memory.cache
def compute_features(text_df, which_features=None):
if which_features:
feature_funcs = [feature_mapping_dict[feature_name] for feature_name in which_features]
......
from __future__ import division
import preprocessing as pre
from sklearn.metrics import roc_auc_score
import glob
import joblib
train_text, train_y = pre.load_data()
models = glob.glob('../predictions/cval*')
model_dict = {model.split('/')[-1].split('.')[0] : roc_auc_score(train_y, joblib.load(model)) for model in models}
......@@ -53,15 +53,30 @@ def write_model(predictions, correct=None,
submission.to_csv('../submissions/submission_{}.csv'.format(timestr), index=False)
def logit(x):
if x == 1.:
x -= np.finfo(np.float32).eps
elif x == 0.:
x += np.finfo(np.float32).eps
x[x==1.] -= np.finfo(np.float32).eps
x[x==0.] += np.finfo(np.float32).eps
return np.log(x/(1-x))
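Quick numeric check (illustrative): logit now works element-wise on a numpy array, nudging exact 0/1 entries by float32 eps so the log stays finite; applying the sigmoid recovers the original probabilities up to eps.

import numpy as np
p = np.array([0.0, 0.25, 1.0])
z = logit(p)                        # helper defined above; note it also modifies p in place
print(z)                            # roughly [-15.94, -1.10, 15.94]
print(1.0 / (1.0 + np.exp(-z)))     # roughly [0.0, 0.25, 1.0]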
def cross_val_score_with_estimators(classifier_func, X, y, cv=6, scoring=None):
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
if scoring is None:
scoring = roc_auc_score
kfold = KFold(n_splits=cv, shuffle=False)
estimators = []
scores = []
for train, test in kfold.split(X):
clf = classifier_func().fit(X[train], y[train])
scores.append(scoring(y[test], clf.predict_proba(X[test])[:,1]))
estimators.append(clf)
return scores, estimators
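Hypothetical usage of cross_val_score_with_estimators on a toy binary problem; make_classification and LogisticRegression are stand-ins for the actual feature matrices and models used elsewhere in the repo.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=600, random_state=0)
scores, estimators = cross_val_score_with_estimators(LogisticRegression, X, y, cv=6)
print(np.mean(scores), len(estimators))    # mean fold-wise ROC AUC and the 6 fitted classifiers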
def sparse_to_dense(X):
return X.toarray()
def predict_proba_conc(estimator, X):
return np.concatenate([preds[:,1][:,None] for preds in estimator.predict_proba(X)], axis=-1)
@memory.cache
def get_glove_embedding(glove_path):
embeddings_index = {}
......@@ -85,6 +100,13 @@ def get_fasttext_embedding(fasttext_path):
embeddings_index[word] = coefs
return embeddings_index
def get_model_specs(model_name):
import json
with open('../model_specs/{}.json'.format(model_name), 'r') as fl:
modelspecs = fl.read()
modelspecs_dict = json.loads(modelspecs)
return modelspecs_dict
def predictions_for_language(language_dict, test_data=None):
'''Expects a language_dict, where the keys correspond to languages and the values to models that implement fit'''
if test_data is None:
......
from __future__ import division
import numpy as np
import pandas as pd
import glob
import sys
from sklearn.metrics import log_loss, roc_auc_score
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
def load_data(name):
data = pd.read_csv(name)
predictions = np.concatenate([data['predictions_{}'.format(label)].values[:,None] for label in cols], axis=-1)
ground_truth = np.concatenate([data['{}'.format(label)].values[:,None] for label in cols], axis=-1)
return data['text'], ground_truth, predictions
def point_wise_logloss(labels, predictions):
losses = -(labels*np.log(np.clip(predictions, 1e-15, 1-1e-15)) + (1-labels)*np.log(1-np.clip(predictions, 1e-15, 1-1e-15)))
return losses
def max_loss_iterator(losses, text, labels, predictions, col=None, stop=100):
'''returns an iterator over the maximum losses'''
if col:
argmax_loss = np.argsort(losses[:,np.where(np.array(cols)==col)[0][0]])[::-1]
else:
argmax_loss = np.argsort(losses.mean(axis=1))[::-1]
for i, loss_index in enumerate(argmax_loss):
if i >= stop:
break
else:
yield losses[loss_index], text[loss_index], labels[loss_index], predictions[loss_index], loss_index
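Sketch of the intended error-analysis loop; the CSV path is hypothetical and stands for a training-set predictions file with the predictions_<label> columns load_data expects.

text, labels, preds = load_data('../predictions/train_set_preds.csv')   # hypothetical file name
losses = point_wise_logloss(labels, preds)
for loss, comment, label, pred, idx in max_loss_iterator(losses, text, labels, preds, col='toxic', stop=5):
    print(idx, loss.mean(), comment[:80])    # the five comments with the highest 'toxic' loss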
......@@ -39,7 +39,8 @@ import copy
from keras.engine.topology import Layer
import keras.backend as K
from keras import initializers
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))
corr_dict1 = enchant.request_dict('en_US')
maketrans = string.maketrans
......@@ -52,6 +53,7 @@ some_bad_words = [u'bastard',
u'retard',
u'assfucker',
u'arsehole',
u'nazi',
u'assfuck',
u'fuckhead',
u'fuckwit',
......@@ -127,15 +129,19 @@ class NBMLR(BaseEstimator):
def predict_proba(self, X):
return self.lr.predict_proba(X.multiply(self.r))
def tfidf_model(pre_args={'ngram_range' : (1,2), 'tokenizer' : None,
import re, string
re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))
def tokenize(s): return re_tok.sub(r' \1 ', s).split()
def get_tfidf_model(pre_args={'ngram_range' : (1,2), 'tokenizer' : None,
'min_df' : 3, 'max_df' : 0.9, 'strip_accents' : 'unicode',
'use_idf' : 1, 'smooth_idf' : 1, 'sublinear_tf' : 1},
model_func=None, **kwargs):
'''Returns unfitted tfidf_NBSVM pipeline object'''
if model_func is None:
model_func = NBMLR
return pipe.Pipeline(steps=[('tfidf', TfidfVectorizer(**pre_args)),
('model', MultiOutputClassifier(model_func(**kwargs)))])
'use_idf' : 1, 'smooth_idf' : 1, 'sublinear_tf' : 1}):
if pre_args['tokenizer'] is None:
pre_args['tokenizer'] = tokenize
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
tfidf = TfidfVectorizer(stop_words=eng_stopwords, **pre_args).fit(pd.concat([train_text, test_text]))
return tfidf
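Illustrative pairing (not part of this commit) of the new get_tfidf_model with the NBMLR classifier defined above, mirroring test_tfidf_models in the validation script; MultiOutputClassifier comes from sklearn.multioutput.

from sklearn.multioutput import MultiOutputClassifier
import preprocessing as pre
tfidf = get_tfidf_model()                     # TF-IDF vectorizer fitted on train + test text
train_text, train_y = pre.load_data()
X = tfidf.transform(train_text)
clf = MultiOutputClassifier(NBMLR(dual=True, C=4)).fit(X, train_y)   # one NB-weighted LR per label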
def keras_token_model(model_function=None, max_features=20000, maxlen=100, embed_size=128):
if model_function is None:
......@@ -575,24 +581,6 @@ def RNN_aux_aug(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=Non
x = Dense(6, activation="sigmoid", name='main_output')(x)
return [x, aux_dense], None
def RNN_aux_loss(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None, dropout=0.5, aux_dim=1):
if rnn_func is None:
rnn_func = LSTM
if not isinstance(hidden_rnn, list):
hidden_rnn = [hidden_rnn] * no_rnn_layers
if len(hidden_rnn) != no_rnn_layers:
raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(rnn_size, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(dropout)(x)
aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(x)
x = Dense(hidden_dense, activation='relu')(x)
x = Dropout(dropout)(x)
x = Dense(6, activation="sigmoid", name='main_output')(x)
return [x, aux_dense], None
def RNN_aux_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, dropout_dense=0.8, input_len=500):
if rnn_func is None:
rnn_func = CuDNNLSTM
......@@ -870,6 +858,71 @@ def RNN_augment(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=Non
x = Dense(6, activation="sigmoid", name='main_output')(x)
return x, aug_input
def RNN_conc_aux(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, aux_dim=1):
if rnn_func is None:
rnn_func = CuDNNLSTM
if not isinstance(hidden_rnn, list):
hidden_rnn = [hidden_rnn] * no_rnn_layers
if len(hidden_rnn) != no_rnn_layers:
raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
vals = []
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
vals.append(x)
# y = GlobalMaxPool1D()(x)
# y = Dropout(dropout)(y)
# aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(y)
x = concatenate([GlobalAveragePooling1D()(x)] + [GlobalMaxPool1D()(val) for val in vals] + [Lambda(lambda x : x[:,-1, :])(val) for val in vals])
x = Dropout(dropout)(x)
aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(x)
# x = BatchNormalization(x)
# x = Dense(int(hidden_dense), activation='relu')(x)
# x = Dropout(dropout)(x)
x = Dense(6, activation="sigmoid", name='main_output')(x)
return [x, aux_dense], None
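A minimal wiring sketch, assuming a GPU Keras build (for CuDNNGRU) and an already-embedded 500x300 input such as Embedding_Blanko_DNN provides, showing how the [main_output, aux_output] pair returned above is compiled with the loss dict from simple_huge_aux_net:

from keras.layers import Input, CuDNNGRU
from keras.models import Model
embedded = Input(shape=(500, 300))            # embedded token sequence
outputs, _ = RNN_conc_aux(embedded, rnn_func=CuDNNGRU, no_rnn_layers=2, hidden_rnn=96)
model = Model(inputs=embedded, outputs=outputs)
model.compile(optimizer='adam',
              loss={'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'},
              loss_weights={'main_output': 1., 'aux_output': 0.1})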
def RNN_aux_loss(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None, dropout=0.5, aux_dim=1):
if rnn_func is None:
rnn_func = LSTM
if not isinstance(hidden_rnn, list):
hidden_rnn = [hidden_rnn] * no_rnn_layers
if len(hidden_rnn) != no_rnn_layers:
raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(rnn_size, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(dropout)(x)
aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(x)
x = Dense(hidden_dense, activation='relu')(x)
x = Dropout(dropout)(x)
x = Dense(6, activation="sigmoid", name='main_output')(x)
return [x, aux_dense], None
def RNN_dropout_conc(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, dropout_embed=0.5):
if rnn_func is None:
rnn_func = CuDNNLSTM
if not isinstance(hidden_rnn, list):
hidden_rnn = [hidden_rnn] * no_rnn_layers
if len(hidden_rnn) != no_rnn_layers:
raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
vals = []
x = Dropout(dropout_embed, noise_shape=(None, 1, int(x.shape[-1])))(x)
for i, rnn_size in enumerate(hidden_rnn):
if i > 0:
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
vals.append(x)
x = concatenate([GlobalAveragePooling1D()(x)] + [GlobalMaxPool1D()(val) for val in vals] + [Lambda(lambda x : x[:,-1, :])(val) for val in vals])
x = Dropout(dropout)(x)
# x = BatchNormalization(x)
# x = Dense(int(hidden_dense), activation='relu')(x)
# x = Dropout(dropout)(x)
x = Dense(6, activation="sigmoid", name='main_output')(x)
return x, None
def RNN_conc(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5):
if rnn_func is None:
rnn_func = CuDNNLSTM
......@@ -878,6 +931,11 @@ def RNN_conc(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None,
if len(hidden_rnn) != no_rnn_layers:
raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
vals = []
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
vals.append(x)
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
......@@ -951,3 +1009,33 @@ def CNN_shallow_1d(x, n_filters=100, kernel_sizes=[3,4,5], dropout=0.5):
x = Dropout(rate=dropout)(x)
x = Dense(1, activation="sigmoid", name='main_output')(x)
return x, None
def roc_auc_score(y_true, y_pred):
""" ROC AUC Score.
Approximates the Area Under Curve score, using approximation based on
the Wilcoxon-Mann-Whitney U statistic.
Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003).
Optimizing Classifier Performance via an Approximation to the Wilcoxon-Mann-Whitney Statistic.
Measures overall performance for a full range of threshold levels.
Arguments:
y_pred: `Tensor`. Predicted values.
y_true: `Tensor` . Targets (labels), a probability distribution.
"""
with tf.name_scope("RocAucScore"):
pos = tf.boolean_mask(y_pred, tf.cast(y_true, tf.bool))
neg = tf.boolean_mask(y_pred, ~tf.cast(y_true, tf.bool))
pos = tf.expand_dims(pos, 0)
neg = tf.expand_dims(neg, 1)
# original paper suggests performance is robust to exact parameter choice
gamma = 0.2
p = 3
difference = tf.zeros_like(pos * neg) + pos - neg - gamma
masked = tf.boolean_mask(difference, difference < 0.0)
return tf.reduce_sum(tf.pow(-masked, p))
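A numpy restatement (illustrative only) of what the surrogate above computes: every (positive, negative) score pair whose margin falls short of gamma contributes (gamma - (s_pos - s_neg))**p, so minimizing it pushes positive scores above negative ones, which is exactly what ROC AUC rewards.

import numpy as np
def roc_auc_surrogate_np(y_true, y_pred, gamma=0.2, p=3):
    pos = y_pred[y_true.astype(bool)]
    neg = y_pred[~y_true.astype(bool)]
    diff = pos[None, :] - neg[:, None] - gamma     # all (pos, neg) pairwise margins
    return np.sum((-diff[diff < 0.0]) ** p)        # only margin violations contribute
print(roc_auc_surrogate_np(np.array([1, 1, 0, 0]), np.array([0.9, 0.8, 0.1, 0.2])))   # 0.0, margins all exceed gamma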
......@@ -44,6 +44,38 @@ control_char_re = re.compile('[%s]' % re.escape(control_chars))
bad_word_dict = joblib.load('bad_words_misspellings.pkl')
some_bad_words = joblib.load('some_bad_words.pkl')
some_bad_words2 = [u'bastard',
u'jerk',
u'moron',
u'idiot',
u'retard',
u'assfucker',
u'arsehole',
u'nazi',
u'assfuck',
u'fuckhead',
u'fuckwit',
u'cocksucker',
u'asshole',
u'bullshit',
u'motherfucker',
u'fucked',
u'shit',
u'fuck',
u'fucking',
u'gay',
u'fag',
u'faggot',
u'bitch',
u'whore',
u'fucker',
u'nigg',
u'nigger']
some_bad_words = list(set(some_bad_words+some_bad_words2))
wikipedia_indicators = [r'\(diff \| hist\)', 'User talk', r'\(current\)']
def check_for_duplicates(word, zero_words):
regex = r'^' + ''.join('[{}]+'.format(c) for c in word) + '$'
matches = [re.search(regex, s) for s in zero_words]
......@@ -73,10 +105,8 @@ def clean_comment(text, replace_misspellings=False):
s = re.sub(r"\'d", " would ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ll", " will ", s, flags=re.IGNORECASE)
s = re.sub(r"\'scuse", " excuse ", s, flags=re.IGNORECASE)
s = re.sub(r'([_])', r' \1 ', s)
s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
s = re.sub(r'\b[a-z]+\d+\b', ' _user_ ', s)
s = re.sub(r'\b[0-9]+\b', ' _number_ ', s)
#hard coded replacements
for bad_word in some_bad_words:
......@@ -87,15 +117,19 @@ def clean_comment(text, replace_misspellings=False):
s = re.sub(r'\bfukk\b', ' fuck ', s)
s = re.sub(r'\bfukker\b', ' fuck ', s)
s = re.sub(r'\bfucka\b', ' fucker ', s)
s = re.sub(r'\bcrackaa\b', ' cracker ', s)
#wikipedia specific features
# wikipedia_regex = [r'\(talk\)', r'\(utc\)', r'\(talk|email\)']
# wikipedia_matches = [re.search(regex, s) for regex in wikipedia_regex]
s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
s = re.sub(r'\(talk\)', ' _wikipedia ', s)
s = re.sub(r'\d\d:\d\d, \d+ (?:January|February|March|April|May|June|July|August|September|October|November|December) \d+', ' _date_ ', s)
s = re.sub(r'\(talk\)', ' _talk_ ', s)
s = re.sub(r'user talk', ' _talk2_ ', s)
s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)
s = re.sub(r'\(talk|email\)', ' _wikipedia_ ', s)
# s = re.sub(r'\b[0-9]+\b', ' _number_ ', s)
s = re.sub(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' _url_ ', s)
s = re.sub(ur'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
#without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
......@@ -112,8 +146,9 @@ def clean_comment(text, replace_misspellings=False):
s = re.sub(r'\b{}\b'.format(key.lower()), ' '+val.lower()+' ', s)
return s.encode('utf-8')
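Standalone check (illustrative) of the URL placeholder pattern added above; the ur'' literal keeps it consistent with this Python 2 module.

import re
url_re = ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
print(re.sub(url_re, ' _url_ ', u'read http://en.wikipedia.org/wiki/Troll first'))   # u'read  _url_  first'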
def data_preprocessing(df, replace_misspellings=False):
df['comment_text'].fillna(' ', inplace=True)
@memory.cache
def data_preprocessing(df, replace_misspellings=True):
df['comment_text'].fillna('', inplace=True)
clean_comment_dummy = partial(clean_comment, replace_misspellings=replace_misspellings)
df['comment_text'] = df['comment_text'].apply(clean_comment_dummy)
return df
......
......@@ -22,32 +22,69 @@ from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import feature_engineering
import DNN
import copy
import mkl
mkl.set_num_threads(2)
def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=5):
def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=6, model_name=None):
'''Builds and cross-validates a DNN on X, y'''
new_dict = {key:val for key, val in fixed_args.items()}
new_dict.update(kwargs)
new_time = 'cval_{}'.format(time.strftime("%m%d-%H%M"))
if model_name is None:
model_name = 'cval_{}'.format(time.strftime("%m%d-%H%M"))
kfold = KFold(n_splits=cv, shuffle=False)
scores = []
Xs = np.zeros((len(X),1), dtype='int8')
Xs = np.zeros((len(X['main_input']),1), dtype='int8')
predictions = []
opt = new_dict['compilation_args'].pop('optimizer_func')
optargs = new_dict['compilation_args'].pop('optimizer_args')
has_loss_func = (not isinstance(new_dict['compilation_args']['loss']['main_output'], str)) and new_dict['compilation_args']['loss']['main_output'].__name__ == 'tmp_func'
if has_loss_func:
loss_func = new_dict['compilation_args']['loss']['main_output']
for train, test in kfold.split(Xs):
new_dict['compilation_args']['optimizer'] = opt(**optargs)
train_x = X.loc[train]
test_x = X.loc[test]
model_time = '{}_{}'.format(new_time, time.strftime("%m%d-%H%M"))
estimator = DNN.fit_model(model_time, fit_args, train_x, y[train], **new_dict)
predictions.append(estimator.predict(test_x))
scores.append(roc_auc_score(y[test], predictions[-1]))
joblib.dump(scores, '../scores/{}.pkl'.format(new_time))
if has_loss_func:
new_dict['compilation_args']['loss']['main_output'] = loss_func()
train_x = {key:val.loc[train] for key, val in X.iteritems()}
test_x = {key:val.loc[test] for key, val in X.iteritems()}
train_y = {key:val[train] for key, val in y.iteritems()}
test_y = {key:val[test] for key, val in y.iteritems()}
model_time = '{}_{}'.format(model_name, time.strftime("%m%d-%H%M"))
estimator = DNN.fit_model(model_time, fit_args, train_x, train_y, **new_dict)
preds = estimator.predict(test_x)
if isinstance(preds, list):
for pred in preds:
if pred.shape[1] == 6:
preds = pred
break
predictions.append(preds)
scores.append(roc_auc_score(test_y['main_output'], predictions[-1]))
joblib.dump(scores, '../scores/{}.pkl'.format(model_name))
K.clear_session()
predictions = np.vstack(predictions)
score_dict = {'loss' : roc_auc_score(y['main_output'], predictions), 'loss_fold' : scores, 'mean_loss':np.mean(scores), 'status' : STATUS_OK}
predictions = np.vstack(predictions)
joblib.dump(predictions, '../predictions/{}.pkl'.format(model_name), compress=3)
joblib.dump(score_dict, '../scores/{}.pkl'.format(model_name))
return score_dict
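Hypothetical call showing how DNN_model_validate is driven; the argument names follow the training script (fit_args, train_text, train_y, aux_task) and the model_name is made up for illustration.

fixed_args = DNN.simple_huge_aux_net(prune=True)
score_dict = DNN_model_validate({'main_input': train_text},
                                {'main_output': train_y, 'aux_output': aux_task},
                                fit_args, fixed_args, {}, cv=6, model_name='cval_aux_gru')
print(score_dict['mean_loss'], score_dict['loss_fold'])    # overall and per-fold ROC AUC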
def predict_parallel(X, y, train, test, estimator):
estimator.fit(X[train],y[train])
predictions = hlp.predict_proba_conc(estimator, X[test])
scores = roc_auc_score(y[test], predictions)
return (predictions, scores, estimator)
def model_validate(X, y, model, cv=6):
'''Builds and evaluates a model on X, y'''
from sklearn.base import clone
kfold = KFold(n_splits=cv, shuffle=False)
new_time = 'cval_{}'.format(time.strftime("%m%d-%H%M"))
predictions, scores, estimators = zip(*joblib.Parallel(n_jobs=6)(joblib.delayed(predict_parallel)(X, y, train, test, clone(model)) for train, test in kfold.split(X)))
score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
predictions = np.vstack(predictions)
joblib.dump(predictions, '../predictions/{}.pkl'.format(new_time), compress=3)
joblib.dump(score_dict, '../scores/{}.pkl'.format(new_time))
joblib.dump(estimators, '../models/{}.pkl'.format(new_time))
return score_dict
def do_hyper_search(space, model_function, **kwargs):
......@@ -128,43 +165,89 @@ def validate_feature_model(model_name, model_function, space, fixed_params_file=
return best
def do_hyperparameter_search():
DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [2]),
DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [1,2]),
'hidden_rnn' : hp.quniform('hidden_rnn', 32, 96, 16),
'hidden_dense' : hp.quniform('hidden_dense', 16, 64, 8)}}
'hidden_dense' : hp.quniform('hidden_dense', 16, 256, 16)}}
token_models_to_test = {
'DNN' : (DNN_model_validate, DNN_search_space, DNN.simple_attention())}
for model_name, (func, space, fixed_args) in token_models_to_test.iteritems():
best = hyperopt_token_model(model_name, func, space, fixed_args)
joblib.dump(best, 'best_{}.pkl'.format(model_name))
def test_tfidf_models():
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegressionCV
tfidf = models.get_tfidf_model()
train_text, train_y = pre.load_data()
train_X = tfidf.transform(train_text)
tfidf_based = {'NB' : MultiOutputClassifier(models.NBMLR(dual=True, C=4)),
'extra_trees' : ExtraTreesClassifier(),
'gbc' : MultiOutputClassifier(GradientBoostingClassifier())}
score_dict = { model_name : model_validate(train_X, train_y, clf) for model_name, clf in tfidf_based.items()}
return score_dict
def make_loss_function(class_weights):
import tensorflow as tf