Commit 96ebf3ff authored by mjboos

meta models

parent 4d5f30ef
......@@ -278,13 +278,20 @@ def simple_huge_net(trainable=False, prune=True):
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_small_trainable_net(trainable=False, prune=True):
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1, hidden_rnn=96, hidden_dense=64)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 200,
'embedding_dim' : 200, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
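# Note: embedding_dim=200 here matches the 200-dimensional glove.twitter.27B.200d
# vectors that the training and validation scripts further down switch to.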
def simple_net(trainable=False, prune=True):
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=128)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def shallow_CNN(trainable=False, prune=True):
......
......@@ -35,7 +35,7 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 80, 'epochs' : 30,
fit_args = {'batch_size' : 128, 'epochs' : 30,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
train_text, train_y = pre.load_data()
......@@ -51,26 +51,26 @@ if __name__=='__main__':
weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
model_params = simple_attention(trainable=False)
model_name = '300_fasttext_attention_avg_meta_ft_GRU'
model_params = simple_small_trainable_net(trainable=True, prune=True)
model_name = '300_fasttext_trainable_all_GRU'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
list_of_tokens = frozen_tokenizer.tokenizer.texts_to_sequences(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
embedding = hlp.get_glove_embedding('../glove.twitter.27B.200d.txt')
opt = model_params['compilation_args'].pop('optimizer_func')
optargs = model_params['compilation_args'].pop('optimizer_args')
model_params['compilation_args']['optimizer'] = opt(**optargs)
# old_model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params).model
model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params)
# old_model.load_weights(model_name+'_best.hdf5')
# lrfinder = lrf.LRFinder(model.model)
# train_x = frozen_tokenizer.transform(train_text)
# lrfinder.find(train_x, train_y, 0.0001, 0.01, batch_size=80, epochs=1)
# lrfinder.plot_loss()
# plt.savefig('losses_2.svg')
# plt.close()
# lrfinder.plot_loss_change()
# plt.savefig('loss_change_2.svg')
# plt.close()
lrfinder = lrf.LRFinder(model.model)
train_x = frozen_tokenizer.transform(train_text)
lrfinder.find(train_x, train_y, 0.0001, 0.01, batch_size=80, epochs=1)
lrfinder.plot_loss()
plt.savefig('losses_small.svg')
plt.close()
lrfinder.plot_loss_change()
plt.savefig('loss_change_small.svg')
plt.close()
# joblib.dump([lrfinder.losses, lrfinder.lrs], 'lrfinder.pkl')
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
......@@ -80,9 +80,9 @@ if __name__=='__main__':
# train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
# model = load_keras_model(model_name)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, list_of_tokens=list_of_tokens, **model_params)
hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, list_of_tokens=list_of_tokens, **model_params)
# hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
# hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
......
......@@ -53,6 +53,10 @@ def write_model(predictions, correct=None,
submission.to_csv('../submissions/submission_{}.csv'.format(timestr), index=False)
def logit(x):
if x == 1.:
x -= np.finfo(np.float32).eps
elif x == 0.:
x += np.finfo(np.float32).eps
return np.log(x/(1-x))
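# A possible vectorised counterpart (hypothetical helper, not referenced elsewhere in
# this module): clips whole probability arrays to the same float32 eps before taking
# log-odds, since the scalar branches above only handle a single value.
def logit_array(x):
    eps = np.finfo(np.float32).eps
    x = np.clip(np.asarray(x, dtype=np.float64), eps, 1. - eps)
    return np.log(x / (1. - x))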
def sparse_to_dense(X):
......
......@@ -36,6 +36,10 @@ import string
import json
import enchant
import copy
from keras.engine.topology import Layer
import keras.backend as K
from keras import initializers
corr_dict1 = enchant.request_dict('en_US')
maketrans = string.maketrans
......@@ -314,6 +318,52 @@ def data_augmentation(text_df, labels):
concat_df = pd.concat([text_df, new_text]).sort_index().reset_index(drop=True)
return concat_df, np.tile(labels, (2,1))
class EmbeddingSemiTrainable(Layer):
def __init__(self, input_dim, output_dim, fixed_weights, embeddings_initializer='uniform',
input_length=None, **kwargs):
kwargs['dtype'] = 'int32'
if 'input_shape' not in kwargs:
if input_length:
kwargs['input_shape'] = (input_length,)
else:
kwargs['input_shape'] = (None,)
super(EmbeddingSemiTrainable, self).__init__(**kwargs)
self.input_dim = input_dim
self.output_dim = output_dim
self.embeddings_initializer = embeddings_initializer
self.fixed_weights = fixed_weights
self.num_trainable = input_dim - len(fixed_weights)
self.input_length = input_length
def build(self, input_shape, name='embeddings'):
initializer = initializers.get(self.embeddings_initializer)
shape1 = (self.num_trainable, self.output_dim)
variable_weight = K.variable(initializer(shape1), dtype=K.floatx(), name=name+'_var')
fixed_weight = K.variable(self.fixed_weights, name=name+'_fixed')
self._trainable_weights.append(variable_weight)
self._non_trainable_weights.append(fixed_weight)
self.embeddings = K.concatenate([fixed_weight, variable_weight], axis=0)
self.built = True
def call(self, inputs):
if K.dtype(inputs) != 'int32':
inputs = K.cast(inputs, 'int32')
out = K.gather(self.embeddings, inputs)
return out
def compute_output_shape(self, input_shape):
if not self.input_length:
input_length = input_shape[1]
else:
input_length = self.input_length
return (input_shape[0], input_length, self.output_dim)
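# Hedged usage sketch for the layer above (hypothetical helper, nothing else calls it):
# the first len(fixed_weights) vocabulary rows stay frozen at their pretrained vectors
# while the remaining input_dim - len(fixed_weights) rows are trained, because build()
# concatenates [fixed_weight, variable_weight] before the gather.
def _embedding_semi_trainable_example(pretrained_rows, n_new_words=1000, seq_len=200):
    from keras.layers import Input
    from keras.models import Model
    n_fixed, dim = pretrained_rows.shape
    inp = Input(shape=(seq_len,), dtype='int32', name='example_input')
    emb = EmbeddingSemiTrainable(n_fixed + n_new_words, dim,
                                 fixed_weights=pretrained_rows,
                                 input_length=seq_len)(inp)
    # output shape: (batch, seq_len, dim); only the new-word rows receive gradients
    return Model(inp, emb)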
class Embedding_Blanko_DNN(BaseEstimator):
def __init__(self, embedding=None, max_features=20000, model_function=None, tokenizer=None,
maxlen=300, embedding_dim=300, trainable=False, prune=True, augment_data=False, list_of_tokens=None,
......
......@@ -138,24 +138,24 @@ def do_hyperparameter_search():
joblib.dump(best, 'best_{}.pkl'.format(model_name))
def test_models():
fit_args = {'batch_size' : 80, 'epochs' : 20,
fit_args = {'batch_size' : 128, 'epochs' : 20,
'validation_split' : 0.2}
fixed_args = DNN.simple_huge_net()
fixed_args = DNN.simple_small_trainable_net()
kwargs = {}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
fixed_args['compilation_args']['optimizer_args'] = {'clipnorm' : 1., 'lr' : 0.001}
fixed_args['compilation_args']['optimizer_args'] = {'clipnorm' : 1., 'lr' : 0.0005}
fixed_args['compilation_args']['optimizer_func'] = optimizers.Adam
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=fixed_args['max_features'], maxlen=fixed_args['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
embedding = hlp.get_glove_embedding('../glove.twitter.27B.200d.txt')
kwargs['embedding'] = embedding
kwargs['tokenizer'] = frozen_tokenizer
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=6)
def make_average_test_set_predictions(model_name):
import glob
all_model_names = [mname.split('_best')[0] for mname in glob.glob(model_name + '*')]
all_model_names = [mname for mname in glob.glob(model_name + '*')]
fixed_args = DNN.conc_attention()
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
......@@ -166,14 +166,72 @@ def make_average_test_set_predictions(model_name):
fixed_args['compilation_args'].pop('optimizer_func')
fixed_args['compilation_args']['optimizer'] = 'adam'
prediction_list = []
for submodel_name in all_model_names:
model = DNN.load_full_model(submodel_name, embedding=embedding, tokenizer=frozen_tokenizer, **fixed_args)
model = DNN.load_full_model(all_model_names[0].split('_best')[0], embedding=embedding, tokenizer=frozen_tokenizer, **fixed_args)
prediction_list.append(model.predict(test_text)[..., None])
for submodel_name in all_model_names[1:]:
model.model.load_weights(submodel_name)
prediction_list.append(model.predict(test_text)[..., None])
predictions = np.concatenate(prediction_list, axis=-1)
predictions = predictions.mean(axis=-1)
hlp.write_model(predictions)
# hlp.write_model(predictions)
joblib.dump(predictions, '../predictions/test_set_{}.pkl'.format(model_name))
def report_ensembling(model_name_list, ensemble_name='generic'):
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import seaborn as sns
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
prediction_dict = {model_name : joblib.load('../predictions/{}.pkl'.format(model_name)) for model_name in model_name_list}
for i,col in enumerate(cols):
predictions_col_df = pd.DataFrame.from_dict({model_name : prediction[:,i] for model_name, prediction in prediction_dict.iteritems()})
g = sns.pairplot(predictions_col_df, kind='reg')
plt.savefig('../reports/{}_ensemble_{}.png'.format(ensemble_name, col))
plt.close()
def stack_ensembling(predictions_col_dict, clf_func, train_y):
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
estimator_list = {}
score_list = {}
for i, col in enumerate(cols):
predictions_col = predictions_col_dict[col]
classifier = clf_func()
score_list[col] = cross_val_score(classifier, predictions_col, train_y[:,i], cv=6, scoring='roc_auc')
estimator_list[col] = clf_func().fit(predictions_col, train_y[:,i])
return estimator_list, score_list
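# Tiny synthetic check of stack_ensembling (made-up data; the real inputs are the
# joblib-dumped out-of-fold predictions loaded in test_meta_models below): two base
# "models" predicting the six label columns for 100 rows.
def _stack_ensembling_smoke_test():
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    fake_y = rng.randint(0, 2, size=(100, len(cols)))
    fake_preds = {col : rng.randn(100, 2) for col in cols}  # logits from two base models
    return stack_ensembling(fake_preds, LogisticRegression, fake_y)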
# gbr = GradientBoostingClassifier(n_estimators=50)
def test_meta_models(model_name_list, meta_features=None):
from scipy.special import logit, expit
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
classifier_dict = {'logistic_regression' : LogisticRegressionCV,
'extra_trees' : partial(GridSearchCV, ExtraTreesClassifier(), {'n_estimators' : [5, 10, 15]}),
'gbc' : partial(GridSearchCV, GradientBoostingClassifier(), {'n_estimators' : [50, 100, 150], 'max_depth' : [2, 3, 4]})}
_, train_y = pre.load_data()
prediction_dict = {model_name : joblib.load('../predictions/{}.pkl'.format(model_name)) for model_name in model_name_list}
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions_col_dict = {}
for i, col in enumerate(cols):
pred_col = np.hstack([logit(prediction_dict[model_name][:,i])[:,None] for model_name in sorted(prediction_dict.keys())])
if meta_features is not None:
pred_col = np.hstack([pred_col, meta_features])
predictions_col_dict[col] = pred_col
result_dict = { meta_model : stack_ensembling(predictions_col_dict, clf_func, train_y) for meta_model, clf_func in classifier_dict.iteritems()}
result_dict['model_names'] = model_name_list
return result_dict
def apply_meta_models(estimator_dict, test_predictions):
'''same order necessary for estimator_dict and test_predictions'''
pass
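# One possible shape for the stub above (assumptions: estimator_dict maps each label
# column to a fitted classifier returned by stack_ensembling, and test_predictions maps
# the same columns to logit matrices built exactly as in test_meta_models):
def apply_meta_models_sketch(estimator_dict, test_predictions):
    cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    # predict_proba[:, 1] is the positive-class probability for each label column
    return np.hstack([estimator_dict[col].predict_proba(test_predictions[col])[:, 1][:, None]
                      for col in cols])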
if __name__=='__main__':
# make_average_test_set_predictions('cval_0218-1903')
test_models()
# joblib.dump(test_meta_models(['cval_0215-1830', 'cval_0218-1903', 'cval_0219-0917', 'cval_0220-1042']), 'test_meta_models.pkl')
# report_ensembling(['cval_0218-1903', 'cval_0219-0917', 'cval_0220-1042'], 'attention_and_huge_ensemble')
# estimator_list, score_list = stack_ensembling(['cval_0218-1903', 'cval_0215-1830', 'cval_0219-0917', 'cval_0220-1042'], 'attention_and_huge_ensemble')
for model in ['cval_0215-1830', 'cval_0218-1903', 'cval_0219-0917', 'cval_0220-1042']:
make_average_test_set_predictions(model)
# test_models()