Commit 483ae041 authored by mjboos

average of word embeddings

parent 8e886f6d
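The commit title refers to averaging word embeddings; the code that implements it sits in the collapsed file further down, so the sketch below is orientation only, not the author's implementation. It assumes an embedding matrix of the kind built from ../crawl-300d-2M.vec and token-id sequences like the list_of_tokens computed in the training script; the helper name is hypothetical.

import numpy as np

def average_embedding(token_ids, embedding_matrix):
    # Hypothetical sketch: mean of the fastText vectors of one tokenized comment.
    # All-zero rows (padding / out-of-vocabulary ids) are ignored.
    vecs = embedding_matrix[np.asarray(token_ids, dtype=int)]
    mask = vecs.any(axis=1)
    if not mask.any():
        return np.zeros(embedding_matrix.shape[1])
    return vecs[mask].mean(axis=0)

# e.g. one 300-d feature vector per comment:
# doc_vectors = np.array([average_embedding(seq, embedding_matrix) for seq in list_of_tokens])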
@@ -53,6 +53,7 @@ def continue_training_DNN_last_layer(model_name, old_model_name, fit_args, *args
best_weights_path="{}_best.hdf5".format(model_name)
old_weights_path="{}_best.hdf5".format(old_model_name)
model = models.Embedding_Blanko_DNN(**kwargs)
old_model = load_keras_model(old_model_name)
model.model.load_weights(old_weights_path)
model.model = freeze_layers(model.model, unfrozen_keyword='main_output')
callbacks_list = make_callback_list(best_weights_path, patience=5)
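freeze_layers itself is not part of this diff; a hypothetical version consistent with how it is called here (everything frozen except layers whose name contains the keyword) might look like the following, with change_trainable from the hunk below flipping the flag. This is an assumption, not the repository's implementation.

def freeze_layers(model, unfrozen_keyword='main_output'):
    # Hypothetical sketch: freeze every layer whose name does not contain unfrozen_keyword.
    for layer in model.layers:
        change_trainable(layer, trainable=unfrozen_keyword in layer.name)
    return model  # recompile afterwards so the new trainable flags take effect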
@@ -167,16 +168,16 @@ def change_trainable(layer, trainable, verbose=False):
def extend_and_finetune_last_layer_model(model_name, fit_args, train_X, train_y, test_text, **kwargs):
'''Fits and returns a model for one label (provided as index i)'''
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, clipnorm=1.)
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, clipnorm=1.)
for i in xrange(6):
new_name = model_name + '_{}'.format(i)
model = continue_training_DNN_last_layer(new_name, model_name, fit_args, train_X, train_y[:,i], **kwargs)
joblib.dump(model.predict(test_text), '{}.pkl'.format(new_name))
K.clear_session()
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, clipnorm=1.)
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, clipnorm=1.)
def fine_tune_model(model_name, old_model, fit_args, train_X, train_y, test_text, **kwargs):
def finetune_model(model_name, old_model, fit_args, train_X, train_y, test_text, **kwargs):
'''Fits and returns a model for one label (provided as index i)'''
weights = [layer.get_weights() for layer in old_model.layers]
if 'compilation_args' in kwargs:
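extend_and_finetune_last_layer_model above writes one prediction dump per label; nothing in this commit reassembles them, but a hypothetical way to stack the six files into an (n_samples, 6) matrix would be:

import numpy as np
import joblib  # or sklearn.externals.joblib, whichever import the project uses

# model_name as in the loop above; each dump holds the predictions for one label
predictions = np.hstack([
    joblib.load('{}_{}.pkl'.format(model_name, i)).reshape(-1, 1) for i in range(6)
])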
@@ -234,7 +235,7 @@ def simple_attention_1d(trainable=False, prune=True):
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'opzimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def conc_attention(trainable=False, prune=True):
@@ -242,15 +243,15 @@ def conc_attention(trainable=False, prune=True):
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'opzimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_attention(trainable=False, prune=True):
model_func = partial(models.RNN_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'opzimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune, 'augment_data' : False,
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_attention_dropout(trainable=False, prune=True):
@@ -41,6 +41,7 @@ fit_args = {'batch_size' : 80, 'epochs' : 30,
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
if __name__=='__main__':
import keras_lr_finder as lrf
# for toxic skip
aux_task = train_y[:,0]
# train_y = np.delete(train_y, 0, axis=1)
@@ -51,10 +52,27 @@ if __name__=='__main__':
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
model_params = simple_attention(trainable=False)
model_name = '300_fasttext_attention_smaller_voc_GRU'
model_name = '300_fasttext_attention_avg_meta_ft_GRU'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
list_of_tokens = frozen_tokenizer.tokenizer.texts_to_sequences(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
opt = model_params['compilation_args'].pop('optimizer_func')
optargs = model_params['compilation_args'].pop('optimizer_args')
model_params['compilation_args']['optimizer'] = opt(**optargs)
# old_model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params).model
# old_model.load_weights(model_name+'_best.hdf5')
# lrfinder = lrf.LRFinder(model.model)
# train_x = frozen_tokenizer.transform(train_text)
# lrfinder.find(train_x, train_y, 0.0001, 0.01, batch_size=80, epochs=1)
# lrfinder.plot_loss()
# plt.savefig('losses_2.svg')
# plt.close()
# lrfinder.plot_loss_change()
# plt.savefig('loss_change_2.svg')
# plt.close()
# joblib.dump([lrfinder.losses, lrfinder.lrs], 'lrfinder.pkl')
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# SHUFFLE TRAINING SET so validation split is different every time
# row_idx = np.arange(0, train_text.shape[0])
@@ -62,7 +80,7 @@ if __name__=='__main__':
# train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
# model = load_keras_model(model_name)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, list_of_tokens=list_of_tokens, **model_params)
hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
@@ -71,4 +89,7 @@ if __name__=='__main__':
# K.clear_session()
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model_params = simple_attention_1d()
# extend_and_finetune_last_layer_model(model_name, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# opt = model_params['compilation_args'].pop('optimizer_func')
# optargs = model_params['compilation_args'].pop('optimizer_args')
# model_params['compilation_args']['optimizer'] = opt(**optargs)
# finetune_model(model_name, old_model, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
This diff is collapsed.
@@ -104,7 +104,7 @@ def clean_comment(text, replace_misspellings=False):
#shorten words
s = re.sub(r'(\w)\1\1+', r' \1\1 ', s)
s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)
s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\\/"])', r' \1 ', s)
s = re.sub(r"(['])", r' \1 ', s)
s = re.sub('\s{2,}', ' ', s)
if replace_misspellings:
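For reference, the shortening and punctuation-padding substitutions above behave like this on a made-up input (using the updated pattern with the escaped backslash):

import re
s = 'coooool!!! go away'
s = re.sub(r'(\w)\1\1+', r' \1\1 ', s)                              # 'coooool' -> 'c oo l'
s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\\/"])', r' \1 ', s)   # pad punctuation with spaces
s = re.sub(r"(['])", r' \1 ', s)
s = re.sub('\s{2,}', ' ', s)                                        # collapse repeated whitespace
print(s)  # 'c oo l ! ! ! go away'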
@@ -112,7 +112,6 @@ def clean_comment(text, replace_misspellings=False):
s = re.sub(r'\b{}\b'.format(key.lower()), ' '+val.lower()+' ', s)
return s.encode('utf-8')
@memory.cache
def data_preprocessing(df, replace_misspellings=False):
df['comment_text'].fillna(' ', inplace=True)
clean_comment_dummy = partial(clean_comment, replace_misspellings=replace_misspellings)
@@ -139,7 +138,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
def __init__(self, max_features=20000, maxlen=200,
filters="\t\n{}&%$§^°[]<>|@[]+`' ", **kwargs):
filters="\t\n{}&%$§^°[]<>|@[]+`'\\/", **kwargs):
self.max_features = max_features
self.maxlen = maxlen
self.is_trained = False
No preview for this file type
@@ -32,14 +32,17 @@ def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=5):
scores = []
Xs = np.zeros((len(X),1), dtype='int8')
predictions = []
opt = new_dict['compilation_args'].pop('optimizer_func')
optargs = new_dict['compilation_args'].pop('optimizer_args')
for train, test in kfold.split(Xs):
new_dict['compilation_args']['optimizer'] = new_dict['compilation_args']['optimizer_func'](**new_dict['compilation_args']['optimizer_args'])
train_x = [X[i] for i in train]
test_x = [X[i] for i in test]
new_dict['compilation_args']['optimizer'] = opt(**optargs)
train_x = X.loc[train]
test_x = X.loc[test]
model_time = '{}_{}'.format(new_time, time.strftime("%m%d-%H%M"))
estimator = DNN.fit_model(model_time, fit_args, train_x, y[train], **new_dict)
predictions.append(estimator.predict(test_x))
scores.append(roc_auc_score(y[test], predictions[-1]))
joblib.dump(scores, '../scores/{}.pkl'.format(new_time))
K.clear_session()
score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
predictions = np.vstack(predictions)
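The refactor above pops the optimizer factory once and instantiates a fresh optimizer inside every fold; in isolation the pattern looks like this (illustrative values), the point being that K.clear_session() destroys the graph a previously built optimizer and its per-weight Adam state were tied to.

from keras import backend as K
from keras import optimizers

opt = optimizers.Adam
optargs = {'lr': 0.0005, 'clipnorm': 1.}
for fold in range(3):
    optimizer = opt(**optargs)  # fresh optimizer bound to the current graph
    # ... compile and fit this fold's model with `optimizer` here ...
    K.clear_session()           # release the fold's graph before the next one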
@@ -137,21 +140,18 @@ def do_hyperparameter_search():
def test_models():
fit_args = {'batch_size' : 80, 'epochs' : 10,
'validation_split' : 0.2}
fixed_args = DNN.simple_attention_channel_dropout()
fixed_args = DNN.simple_attention()
kwargs = {}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
adam_args = {'clipnorm' : 1., 'lr' : 0.001}
fixed_args['compilation_args']['optimizer_args'] = {'clipnorm' : 1., 'lr' : 0.001}
fixed_args['compilation_args']['optimizer_func'] = optimizers.Adam
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=fixed_args['max_features'], maxlen=fixed_args['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
kwargs['embedding'] = embedding
kwargs['tokenizer'] = frozen_tokenizer
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
fixed_args = DNN.simple_attention_channel_dropout()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
fixed_args = DNN.conc_attention()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=6)
if __name__=='__main__':
test_models()