Commit a9756e45 authored by mjboos

validation

parent fca05728
@@ -113,7 +113,7 @@ def conc_finetuned_preds(model_name):
    hlp.write_model(predictions)

def fit_model(model_name, fit_args, *args, **kwargs):
-    fit_args['callbacks'] = make_callback_list(model_name)
+    fit_args['callbacks'] = make_callback_list(model_name, patience=3)
    model = train_DNN(model_name, fit_args, *args, **kwargs)
    return model
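For context: make_callback_list is defined elsewhere in this repo and is not shown in the diff. A minimal sketch of what it plausibly returns, assuming standard Keras early stopping and checkpointing (the helper name is real; its body and defaults here are guesses):

from keras.callbacks import EarlyStopping, ModelCheckpoint

def make_callback_list(model_name, patience=5):
    # Assumed behaviour: stop once val_loss has not improved for `patience`
    # epochs, and keep the best weights on disk under the model's name.
    return [EarlyStopping(monitor='val_loss', patience=patience),
            ModelCheckpoint('best_model_{}.hdf5'.format(model_name),
                            monitor='val_loss', save_best_only=True)]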
@@ -189,3 +189,108 @@ def fine_tune_model(model_name, old_model, fit_args, train_X, train_y, test_text
    if 'compilation_args' in kwargs:
        kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
+
+def aux_net():
+    model_func = partial(models.RNN_aux_loss, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
+        'embedding_dim' : 300, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 0.1]}}
+    return model_params
+
+def simple_one_output_net():
+    model_func = partial(models.RNN_general_one_class, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def toxic_skip_net():
+    model_func = partial(models.RNN_aux_loss_skip, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=48, hidden_dense=20)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 1.]}}
+    return model_params
+
+def simple_aug_net(trainable=False, prune=True):
+    model_func = partial(models.RNN_augment, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def simple_embedding_net(trainable=False, prune=True):
+    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def simple_attention_1d(trainable=False, prune=True):
+    model_func = partial(models.RNN_attention_1d, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def conc_attention(trainable=False, prune=True):
+    model_func = partial(models.RNN_diff_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def simple_attention(trainable=False, prune=True):
+    model_func = partial(models.RNN_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def simple_attention_dropout(trainable=False, prune=True):
+    model_params = simple_attention(trainable=trainable, prune=prune)
+    model_params['model_function'] = partial(models.RNN_dropout_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
+    return model_params
+
+def simple_attention_channel_dropout(trainable=False, prune=True):
+    model_params = simple_attention(trainable=trainable, prune=prune)
+    model_params['model_function'] = partial(models.RNN_channel_dropout_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
+    return model_params
+
+def simple_attention_word_dropout(trainable=False, prune=True):
+    model_params = simple_attention(trainable=trainable, prune=prune)
+    model_params['model_function'] = partial(models.RNN_time_dropout_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
+    return model_params
+
+def simple_net(trainable=False, prune=True):
+    model_func = partial(models.RNN_conc, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1, hidden_rnn=128, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def shallow_CNN(trainable=False, prune=True):
+    model_func = partial(models.CNN_shallow, n_filters=50, kernel_sizes=[3,4,5], dropout=0.5)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def add_net():
+    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 400, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
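Each of these helpers returns a model_params dict that fit_model unpacks. A minimal usage sketch, pieced together from the __main__ block further down (same names and paths as used there):

model_params = simple_attention(trainable=False)
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'],
                                             maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
model = fit_model('300_fasttext_attention_smaller_voc_GRU', fit_args,
                  {'main_input' : train_text}, {'main_output' : train_y},
                  embedding=embedding, tokenizer=frozen_tokenizer, **model_params)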
@@ -40,87 +40,6 @@ fit_args = {'batch_size' : 80, 'epochs' : 30,
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
-
-def aux_net():
-    model_func = partial(models.RNN_aux_loss, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
-        'embedding_dim' : 300, 'trainable' : False,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 0.1]}}
-    return model_params
-
-def simple_one_output_net():
-    model_func = partial(models.RNN_general_one_class, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : False,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def toxic_skip_net():
-    model_func = partial(models.RNN_aux_loss_skip, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=48, hidden_dense=20)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : False,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 1.]}}
-    return model_params
-
-def simple_aug_net(trainable=False, prune=True):
-    model_func = partial(models.RNN_augment, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def simple_embedding_net(trainable=False, prune=True):
-    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def simple_attention_1d(trainable=False, prune=True):
-    model_func = partial(models.RNN_attention_1d, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def simple_attention(trainable=False, prune=True):
-    model_func = partial(models.RNN_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def simple_net(trainable=False, prune=True):
-    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def shallow_CNN(trainable=False, prune=True):
-    model_func = partial(models.CNN_shallow, n_filters=50, kernel_sizes=[3,4,5], dropout=0.5)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params
-
-def add_net():
-    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
-        'embedding_dim' : 400, 'trainable' : False,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    return model_params

if __name__=='__main__':
    # for toxic skip
    aux_task = train_y[:,0]
@@ -132,7 +51,7 @@ if __name__=='__main__':
    loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
    loss.__name__ = 'weighted_binary_crossentropy'
    model_params = simple_attention(trainable=False)
-    model_name = '300_fasttext_attention_diffpre3_GRU'
+    model_name = '300_fasttext_attention_smaller_voc_GRU'
    frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
    frozen_tokenizer.fit(pd.concat([train_text, test_text]))
    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
@@ -143,7 +62,7 @@ if __name__=='__main__':
    # train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
    # model = load_keras_model(model_name)
    # model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
-    model = fit_model(model_name, fit_args, {'main_input':train_text, 'aug_input':train_data_augmentation}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
+    model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
    hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
    # model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
(The diff of one further file is collapsed in this view.)
@@ -92,7 +92,7 @@ def clean_comment(text, replace_misspellings=False):
    # wikipedia_regex = [r'\(talk\)', r'\(utc\)', r'\(talk|email\)']
    # wikipedia_matches = [re.search(regex, s) for regex in wikipedia_regex]
    s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
-    s = re.sub(r'\(talk\)', ' _wikipedia_ ', s)
+    s = re.sub(r'\(talk\)', ' _wikipedia ', s)
    s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)
    s = re.sub(r'\(talk|email\)', ' _wikipedia_ ', s)
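To illustrate the signature handling above on a made-up comment (the input string is mine, not from the repo). The lookbehind/lookahead substitution runs first, while '(talk)' and '(utc)' are still present to anchor it; the markers themselves are tokenized afterwards:

import re
s = 'fix your edits (talk) 12:03, 14 january 2018 (utc)'
s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
# -> 'fix your edits (talk) _date_ (utc)'
s = re.sub(r'\(talk\)', ' _wikipedia ', s)
s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)
# -> 'fix your edits  _wikipedia  _date_  _wikipedia_ '
# (extra whitespace is collapsed later in this function)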
@@ -100,6 +100,10 @@ def clean_comment(text, replace_misspellings=False):
    s = re.sub(ur'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
    #without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
    # add space between punctuation
+    # shorten words
+    s = re.sub(r'(\w)\1\1+', r' \1\1 ', s)
    s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)
    s = re.sub(r"(['])", r' \1 ', s)
    s = re.sub('\s{2,}', ' ', s)
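Worked example of the substitutions above (the input string is mine): character floods are cut down to doubles, punctuation is padded so it tokenizes separately, and the leftover whitespace is collapsed.

import re
s = 'soooo cool!!!'
s = re.sub(r'(\w)\1\1+', r' \1\1 ', s)                              # -> 's oo  cool!!!'
s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)    # -> 's oo  cool !  !  ! '
s = re.sub('\s{2,}', ' ', s)                                        # -> 's oo cool ! ! ! '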
@@ -23,23 +23,28 @@ from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
import feature_engineering
import DNN

-def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=5):
+def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=3):
    '''Builds and cross-validates a DNN on train_text, train_labels'''
    new_dict = {key:val for key, val in fixed_args.items()}
    new_dict.update(kwargs)
    new_dict['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, beta_2=0.99)
    new_time = 'cval_{}'.format(time.strftime("%m%d-%H%M"))
-    kfold = KFold(n_splits=cv, shuffle=True)
+    kfold = KFold(n_splits=cv, shuffle=False)
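    # Note: with shuffle=False the folds' test indices are contiguous and in
    # ascending order, so the per-fold predictions stacked below line up with
    # the original row order of X, yielding out-of-fold predictions.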
    scores = []
    Xs = np.zeros((len(X),1), dtype='int8')
    predictions = []
    for train, test in kfold.split(Xs):
        new_dict['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, clipnorm=1.)
        train_x = [X[i] for i in train]
        test_x = [X[i] for i in test]
-        estimator = DNN.fit_model(new_time, fit_args, train_x, y[train], **new_dict)
-        predictions = estimator.predict(test_x)
-        scores.append(hlp.mean_log_loss(y[test], predictions))
+        model_time = '{}_{}'.format(new_time, time.strftime("%m%d-%H%M"))
+        estimator = DNN.fit_model(model_time, fit_args, train_x, y[train], **new_dict)
+        predictions.append(estimator.predict(test_x))
+        scores.append(hlp.mean_log_loss(y[test], predictions[-1]))
+        K.clear_session()
    score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
-    K.clear_session()
+    predictions = np.vstack(predictions)
+    joblib.dump(predictions, '../predictions/{}.pkl'.format(new_time), compress=3)
    joblib.dump(score_dict, '../scores/{}.pkl'.format(new_time))
    return score_dict

def do_hyper_search(space, model_function, **kwargs):
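hlp.mean_log_loss, used above, is not shown in this diff; for this multi-label setup it is assumed to be the column-wise binary log loss averaged over the label columns, roughly:

import numpy as np
from sklearn.metrics import log_loss

def mean_log_loss(y_true, y_pred):
    # assumed implementation: mean binary cross-entropy across label columns
    return np.mean([log_loss(y_true[:, i], y_pred[:, i])
                    for i in range(y_true.shape[1])])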
@@ -115,7 +120,7 @@ def validate_feature_model(model_name, model_function, space, fixed_params_file=
    hlp.dump_trials(trials, fname=model_name)
    return best

-if __name__=='__main__':
+def do_hyperparameter_search():
    DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [2]),
        'rnn_func' : hp.choice('rnn_func', [models.CuDNNLSTM, models.CuDNNGRU]),
        'hidden_rnn' : hp.quniform('hidden_rnn', 32, 96, 16),
@@ -125,3 +130,28 @@ if __name__=='__main__':
    for model_name, (func, space) in token_models_to_test.iteritems():
        best = hyperopt_token_model(model_name, func, space)
        joblib.dump(best, 'best_{}.pkl'.format(model_name))
+
+def test_models():
+    fit_args = {'batch_size' : 80, 'epochs' : 30,
+                'validation_split' : 0.2}
+    fixed_args = DNN.simple_attention_dropout()
+    kwargs = {}
+    train_text, train_y = pre.load_data()
+    test_text, _ = pre.load_data('test.csv')
+    frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=fixed_args['max_features'], maxlen=fixed_args['maxlen'])
+    frozen_tokenizer.fit(pd.concat([train_text, test_text]))
+    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
+    kwargs['embedding'] = embedding
+    kwargs['tokenizer'] = frozen_tokenizer
+    DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
+    fixed_args = DNN.simple_attention()
+    DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
+    fixed_args = DNN.simple_attention_channel_dropout()
+    DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
+    fixed_args = DNN.simple_attention_word_dropout()
+    DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
+
+if __name__=='__main__':
+    test_models()
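The out-of-fold predictions and fold scores land in ../predictions and ../scores under the run's timestamp; they can be reloaded for inspection or ensembling, e.g. (the timestamp below is illustrative):

import joblib
preds = joblib.load('../predictions/cval_0305-1422.pkl')
score_dict = joblib.load('../scores/cval_0305-1422.pkl')
print(score_dict['loss'], score_dict['loss_fold'])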