Commit e4c1fd4e authored by mjboos's avatar mjboos

most recent

parent 7641750f
......@@ -93,7 +93,7 @@ def continue_training_DNN_one_output(model_name, i, weights, fit_args, *args, **
best_weights_path="{}_best.hdf5".format(model_name)
model = models.Embedding_Blanko_DNN(n_out=1, **kwargs)
transfer_weights_multi_to_one(weights, model.model, i)
callbacks_list = make_callback_list(model_name, patience=5)
callbacks_list = make_callback_list(model_name, patience=3)
model.model = freeze_layers(model.model, unfrozen_keyword='main_output')
fit_args['callbacks'] = callbacks_list
model.fit(*args, **fit_args)
......@@ -278,14 +278,22 @@ def simple_huge_aux_net(trainable=False, prune=True):
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.0005, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : {'main_output':1., 'aux_output' : 0.1}}}
return model_params
def simple_huge_1_layer_net(trainable=False, prune=True):
    """Parameter dict for a single-layer CuDNNGRU concat-pooling model.

    Parameters
    ----------
    trainable : bool, whether the embedding layer is trainable.
    prune : bool, whether the embedding vocabulary is pruned.

    Returns
    -------
    dict of model/compilation settings consumed by models.Embedding_Blanko_DNN.
    """
    model_func = partial(models.RNN_conc, rnn_func=keras.layers.CuDNNGRU,
                         no_rnn_layers=1, hidden_rnn=64, hidden_dense=32)
    model_params = {
        'max_features': 500000, 'model_function': model_func, 'maxlen': 500,
        'embedding_dim': 300, 'trainable': trainable, 'prune': prune,
        'compilation_args': {'optimizer_func': optimizers.Adam,
                             'optimizer_args': {'lr': 0.0005, 'clipnorm': 1.},
                             'loss': {'main_output': 'binary_crossentropy'},
                             'loss_weights': [1.]}}
    return model_params
def simple_huge_net(trainable=False, prune=True):
    """Parameter dict for a two-layer CuDNNGRU concat-pooling model
    (hidden size 96, no dense layer on top)."""
    rnn_builder = partial(models.RNN_conc, rnn_func=keras.layers.CuDNNGRU,
                          no_rnn_layers=2, hidden_rnn=96, hidden_dense=None)
    adam_settings = {'lr': 0.001, 'clipnorm': 1., 'clipvalue': 1., 'beta_2': 0.99}
    compile_settings = {'optimizer_func': optimizers.Adam,
                        'optimizer_args': adam_settings,
                        'loss': {'main_output': 'binary_crossentropy'},
                        'loss_weights': [1.]}
    return {'max_features': 500000, 'model_function': rnn_builder,
            'maxlen': 500, 'embedding_dim': 300,
            'trainable': trainable, 'prune': prune,
            'compilation_args': compile_settings}
def simple_huge_dropout_net(trainable=False, prune=True):
model_func = partial(models.RNN_dropout_conc, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96)
model_params = {
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import matplotlib
matplotlib.use('agg')
import numpy as np
import pandas as pd
import tensorflow as tf
......@@ -50,9 +52,9 @@ if __name__=='__main__':
weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
model_params = simple_huge_aux_net(prune=True)
model_params = simple_huge_net(prune=True)
# model_params['compilation_args']['loss']['main_output'] = models.roc_auc_score
model_name = '300_fasttext_aux_conc_GRU'
model_name = 'no_clipping'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
# list_of_tokens = frozen_tokenizer.tokenizer.texts_to_sequences(pd.concat([train_text, test_text]))
......@@ -60,25 +62,25 @@ if __name__=='__main__':
opt = model_params['compilation_args'].pop('optimizer_func')
optargs = model_params['compilation_args'].pop('optimizer_args')
optargs['lr'] = 0.0005
model_params['compilation_args']['optimizer'] = opt(beta_2=0.99, **optargs)
model_params['compilation_args']['optimizer'] = opt(**optargs)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y, 'aux_output' : aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict({'main_input':test_text})[0])
# hlp.make_training_set_preds(model, {'main_input':train_text}, train_y)
# model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params)
model = models.Embedding_Blanko_DNN(tokenizer=frozen_tokenizer, embedding=embedding, **model_params)
# old_model.load_weights(model_name+'_best.hdf5')
# lrfinder = lrf.LRFinder(model.model)
# train_x = frozen_tokenizer.transform(train_text)
# lrfinder.find(train_x, train_y, 0.0001, 0.01, batch_size=80, epochs=1)
lrfinder = lrf.LRFinder(model.model)
train_x = frozen_tokenizer.transform(train_text)
lrfinder.find(train_x, train_y, 0.001, 0.05, batch_size=80, epochs=1)
# lrfinder.losses = [np.log(loss) for loss in lrfinder.losses]
# lrfinder.plot_loss()
# plt.savefig('losses_aux.svg')
# plt.close()
# lrfinder.plot_loss_change()
# plt.savefig('loss_change_aux.svg')
# plt.close()
# joblib.dump([lrfinder.losses, lrfinder.lrs], 'lrfinder_aux.pkl')
joblib.dump([lrfinder.losses, lrfinder.lrs], '{}.pkl'.format(model_name))
lrfinder.plot_loss()
plt.savefig('loss_{}.svg'.format(model_name))
plt.close()
lrfinder.plot_loss_change()
plt.savefig('loss_change_{}.svg'.format(model_name))
plt.close()
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# SHUFFLE TRAINING SET so validation split is different every time
......
......@@ -37,6 +37,22 @@ def get_class_weights(y_mat, smooth_factor=0.):
mat_counts += p
return mat_counts.max() / mat_counts
def rank(arr):
    """Return the 0-based rank of every element of *arr*.

    Implemented as the classic double argsort: the first argsort yields the
    sorting permutation, the second inverts it, giving each element's rank
    (ties broken by position).
    """
    sort_order = arr.argsort()
    return sort_order.argsort()
def preds_to_norm_rank(predictions, cols=True):
    """Rank-normalise selected columns of a prediction matrix.

    Parameters
    ----------
    predictions : 2-D array, one column per label (assumed to follow the
        toxic-comment label order below -- TODO confirm at call sites).
    cols : True to transform every column, None/False to return *predictions*
        unchanged, or an iterable of label names selecting which columns to
        transform.

    Returns
    -------
    Array of the same shape with the selected columns replaced by their
    min-max-scaled ranks.
    """
    all_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    if not cols:
        # Covers None as before, and now also False / empty selections:
        # previously cols=False fell past both guards and raised a TypeError
        # when iterated below.
        return predictions
    if cols is True:
        cols = all_cols
    which_cols = np.array([i for i, col in enumerate(all_cols) if col in cols])
    return np.concatenate(
        [norm_rank(preds)[:, None] if i in which_cols else preds[:, None]
         for i, preds in enumerate(predictions.T)], axis=-1)
def norm_rank(arr):
    """Rank-transform *arr*, then rescale the ranks linearly into [0, 1]."""
    from sklearn.preprocessing import minmax_scale
    ranks = rank(arr)
    return minmax_scale(ranks)
def make_weight_matrix(y_mat, weights):
    """Repeat the per-class weight vector once per sample of *y_mat*,
    yielding an array of shape (n_samples, n_classes)."""
    n_samples = y_mat.shape[0]
    return np.repeat(weights[np.newaxis, :], n_samples, axis=0)
......
This diff is collapsed.
......@@ -89,12 +89,27 @@ def replacement_regex(word):
def remove_control_chars(s):
    # Strip every control character from s; control_char_re is a module-level
    # precompiled pattern (not visible here) -- presumably matching the
    # Unicode Cc category, TODO confirm at its definition.
    return control_char_re.sub('', s)
def clean_comment(text, replace_misspellings=False):
def replace_specific_patterns(s):
    """Replace IPs, user names, timestamps, URLs and e-mail addresses in *s*
    with placeholder tokens.

    Returns the substituted string together with the list of placeholder
    tokens so the caller can keep them intact during later tokenisation.
    Assumes *s* is already lower-cased (the user/date patterns only match
    lower case) -- TODO confirm at the call site.
    """
    s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
    s = re.sub(r'\b[a-z]+\d+\b', ' _user_ ', s)
    s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
    # "October" was missing from the month alternation, so October
    # timestamps were never replaced.
    s = re.sub(r'\d\d:\d\d, \d+ (?:January|February|March|April|May|June|July|August|September|October|November|December) \d+', ' _date_ ', s)
    # r'' instead of the Python-2-only ur'' prefix: both patterns are pure
    # ASCII, so plain raw literals behave identically and keep the module
    # importable on Python 3.
    s = re.sub(r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' _url_ ', s)
    s = re.sub(r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
    # '_number_' stays in the token list even though its substitution is
    # currently disabled; extra entries in the exclusion list are harmless.
    return s, ['_ip_', '_user_', '_date_', '_number_', '_url_', '_mail_']
def clean_comment(text, replace_misspellings=True):
import unicodedata as ud
text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
text = text.lower()
text = re.sub(r'[^\x00-\x7f]', r' ' , text)
text = re.sub(r'[\n\r]', r' ', text)
# text = re.sub(r'[\n\r]', r' ', text)
s = re.sub(r"what's", "what is ", text, flags=re.IGNORECASE)
s = re.sub(r"\'s", " ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ve", " have ", s, flags=re.IGNORECASE)
......@@ -105,9 +120,6 @@ def clean_comment(text, replace_misspellings=False):
s = re.sub(r"\'d", " would ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ll", " will ", s, flags=re.IGNORECASE)
s = re.sub(r"\'scuse", " excuse ", s, flags=re.IGNORECASE)
s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
s = re.sub(r'\b[a-z]+\d+\b', ' _user_ ', s)
#hard coded replacements
for bad_word in some_bad_words:
s = re.sub(replacement_regex(bad_word), ' ' + bad_word + ' ', s)
......@@ -122,19 +134,11 @@ def clean_comment(text, replace_misspellings=False):
#wikipedia specific features
# wikipedia_regex = [r'\(talk\)', r'\(utc\)', r'\(talk|email\)']
# wikipedia_matches = [re.search(regex, s) for regex in wikipedia_regex]
s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
s = re.sub(r'\d\d:\d\d, \d+ (?:January|February|March|April|May|June|July|August|September|November|December) \d+', ' _date_ ', s)
s = re.sub(r'\(talk\)', ' _talk_ ', s)
s = re.sub(r'user talk', ' _talk2_ ', s)
s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)
s = re.sub(r'\(talk|email\)', ' _wikipedia_ ', s)
# s = re.sub(r'\b[0-9]+\b', ' _number_ ', s)
s = re.sub(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' _url_ ', s)
s = re.sub(ur'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
#without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
# add space between punctuation
s, patterns = replace_specific_patterns(s)
s = ' '.join([re.sub(r'([_])', ' ', sub_s) if sub_s not in patterns else sub_s for sub_s in s.split(' ')])
#shorten words
s = re.sub(r'(\w)\1\1+', r' \1\1 ', s)
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import matplotlib
matplotlib.use('agg')
import numpy as np
import pandas as pd
from functools import partial
......@@ -71,7 +73,9 @@ def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=6, model_name=None
K.clear_session()
estimator_subname = model_time+'_finetune_{}'.format(i)
new_dict['compilation_args']['optimizer'] = opt(**optargs)
estimator_sub = DNN.continue_training_DNN_one_output(estimator_subname, i, weights, fit_args, train_x, train_y[:,i], **new_dict)
sub_y = { key : val for key, val in train_y.iteritems()}
sub_y['main_output'] = sub_y['main_output'][:,i]
estimator_sub = DNN.continue_training_DNN_one_output(estimator_subname, i, weights, fit_args, train_x, sub_y, **new_dict)
sub_predictions.append(np.squeeze(estimator_sub.predict(test_x))[:,None])
sub_scores.append(roc_auc_score(test_y['main_output'][:,i],sub_predictions[-1]))
sub_predictions_list.append(np.concatenate(sub_predictions, axis=-1))
......@@ -220,7 +224,7 @@ def make_loss_function(class_weights):
def test_models():
fit_args = {'batch_size' : 80, 'epochs' : 30,
'validation_split' : 0.2}
fixed_args = DNN.simple_huge_dropout_net()
fixed_args = DNN.simple_huge_net()
kwargs = {}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
......@@ -235,26 +239,26 @@ def test_models():
# embedding = hlp.get_glove_embedding('../glove.twitter.27B.200d.txt')
kwargs['embedding'] = embedding
kwargs['tokenizer'] = frozen_tokenizer
DNN_model_validate({'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, fit_args, fixed_args, kwargs, cv=6, model_name='huge_channel_2_dropout')
DNN_model_validate({'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, fit_args, fixed_args, kwargs, cv=6, model_name='huge_finetune', finetune=True)
def make_average_general_test_set_predictions(model_name, rank_avg=True):
    """Average the test-set predictions of every CV estimator pickled for
    *model_name* (TF-IDF based models) and write them out.

    Parameters
    ----------
    model_name : name of the estimator bundle in ../models/<name>.pkl.
    rank_avg : forwarded to hlp.preds_to_norm_rank as *cols* -- truthy values
        rank-normalise the label columns before averaging, None leaves the
        raw probabilities.

    Side effects: dumps the averaged matrix to ../predictions/ and writes a
    submission via hlp.write_model.
    """
    # NOTE(review): the diff residue contained both the old and the new
    # definition; this keeps the newer (rank_avg) version and drops the
    # unused 'import glob'.
    estimators = joblib.load('../models/{}.pkl'.format(model_name))
    test_text, _ = pre.load_data('test.csv')
    tfidf = models.get_tfidf_model()
    test_tfidf = tfidf.transform(test_text)
    predictions = np.concatenate(
        [hlp.preds_to_norm_rank(hlp.predict_proba_conc(estimator, test_tfidf), cols=rank_avg)[..., None]
         for estimator in estimators], axis=-1).mean(axis=-1)
    joblib.dump(predictions, '../predictions/test_set_{}.pkl'.format(model_name))
    hlp.write_model(predictions)
def make_average_test_set_predictions(model_name, **kwargs):
    """Dispatch test-set averaging for *model_name*.

    Uses the pickled-estimator path when ../models/<name>.pkl exists,
    otherwise falls back to averaging saved DNN weight files. Extra keyword
    arguments (e.g. rank_avg) are forwarded unchanged to either branch.
    """
    import os
    if os.path.exists('../models/{}.pkl'.format(model_name)):
        make_average_general_test_set_predictions(model_name, **kwargs)
    else:
        make_average_DNN_test_set_predictions(model_name, **kwargs)
def make_average_DNN_test_set_predictions(model_name):
def make_average_DNN_test_set_predictions(model_name, rank_avg=True):
import glob
all_model_names = [mname for mname in glob.glob(model_name + '*')]
# fixed_args = DNN.old_gru_net()
......@@ -272,12 +276,17 @@ def make_average_DNN_test_set_predictions(model_name):
prediction_list = []
model = DNN.load_full_model(all_model_names[0].split('_best')[0], embedding=embedding, tokenizer=frozen_tokenizer, **fixed_args)
prediction_list.append(model.predict(test_text)[..., None])
train_preds = []
for submodel_name in all_model_names[1:]:
model.model.load_weights(submodel_name)
prediction_list.append(model.predict(test_text)[..., None])
prediction_list.append(hlp.preds_to_norm_rank(model.predict(test_text), cols=rank_avg)[..., None])
train_preds.append(hlp.preds_to_norm_rank(model.predict(train_text), cols=rank_avg)[..., None])
predictions = np.concatenate(prediction_list, axis=-1)
predictions = predictions.mean(axis=-1)
train_preds = np.concatenate(train_preds, axis=-1).mean(axis=-1)
joblib.dump(predictions, '../predictions/test_set_{}.pkl'.format(model_name))
joblib.dump(train_preds, '../predictions/train_set_avg_{}.pkl'.format(model_name))
hlp.write_model(predictions)
def report_ensembling(model_name_list, ensemble_name='generic'):
......@@ -306,7 +315,7 @@ def stack_ensembling(predictions_col_dict, clf_func, train_y):
return estimator_list, score_list
# gbr = GradientBoostingClassifier(n_estimators=50)
def test_meta_models(model_name_list, meta_features=None):
def test_meta_models(model_name_list, meta_features=None, rank=True):
from scipy.special import logit, expit
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegressionCV
......@@ -318,7 +327,8 @@ def test_meta_models(model_name_list, meta_features=None):
# 'gbc' : partial(GridSearchCV, GradientBoostingClassifier(), {'n_estimators' : [30], 'max_depth' : [2, 3]})}
_, train_y = pre.load_data()
prediction_dict = {model_name : joblib.load('../predictions/{}.pkl'.format(model_name)) for model_name in model_name_list}
prediction_dict = {model_name :joblib.load('../predictions/{}.pkl'.format(model_name)) for model_name in model_name_list}
prediction_dict = {key:hlp.preds_to_norm_rank(val, rank) for key, val in prediction_dict.iteritems()}
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions_col_dict = {}
for i, col in enumerate(cols):
......@@ -335,29 +345,32 @@ def find_weighted_average(model_preds, train_y):
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score
start_pos = np.ones((model_preds.shape[-1],))/model_preds.shape[-1]
cons = ({'type' : 'eq', 'fun' : lambda x : 1-np.sum(x)})
bounds = [(0,1)]*model_preds.shape[-1]
res = minimize(lambda x : -roc_auc_score(train_y, model_preds.dot(x)), start_pos, method='SLSQP', bounds=bounds, constraints=cons, options={'ftol':'1e-11'})
# cons = ({'type' : 'eq', 'fun' : lambda x : 1-np.sum(x)})
# bounds = [(0,1)]*model_preds.shape[-1]
res = minimize(lambda x : -roc_auc_score(train_y, model_preds.dot(x)), start_pos)#, method='SLSQP', bounds=bounds, constraints=cons, options={'ftol':'1e-11'})
return res
def apply_meta_models(estimator_dict, model_name_list, meta_features=None, rank=True, rank_cv=False):
    """Apply per-label meta estimators to the (optionally rank-normalised)
    test-set predictions of the base models and write a submission.

    Parameters
    ----------
    estimator_dict : dict mapping label name -> list of fitted CV meta
        estimators (same model ordering as *model_name_list* -- same order
        necessary for estimator_dict and test_predictions).
    model_name_list : base model names; their test-set predictions are loaded
        from ../predictions/, computed on demand if missing.
    meta_features : optional extra feature columns appended to every label's
        meta input.
    rank : forwarded to hlp.preds_to_norm_rank for the base predictions.
    rank_cv : if True, rank-normalise each CV estimator's output before
        averaging instead of averaging raw probabilities.
    """
    model_name_list = sorted(model_name_list)
    prediction_dict = {}
    for model_name in model_name_list:
        try:
            prediction_dict[model_name] = hlp.preds_to_norm_rank(
                joblib.load('../predictions/test_set_{}.pkl'.format(model_name)), rank)
        except IOError:
            # Predictions not cached yet: generate them, then retry the load.
            make_average_test_set_predictions(model_name)
            prediction_dict[model_name] = hlp.preds_to_norm_rank(
                joblib.load('../predictions/test_set_{}.pkl'.format(model_name)), rank)
    cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    predictions_test = []
    for i, col in enumerate(cols):
        # Meta input: one logit-transformed column per base model, in a fixed
        # (sorted) model order so it matches the training-time layout.
        pred_col = np.hstack([hlp.logit(prediction_dict[model_name][:,i])[:,None]
                              for model_name in sorted(prediction_dict.keys())])
        if meta_features is not None:
            pred_col = np.hstack([pred_col, meta_features])
        if rank_cv:
            predictions_test.append(np.concatenate(
                [hlp.norm_rank(est.predict_proba(pred_col)[:,1])[:,None]
                 for est in estimator_dict[col]], axis=-1).mean(axis=-1)[:,None])
        else:
            predictions_test.append(np.concatenate(
                [est.predict_proba(pred_col)[:,1][:,None]
                 for est in estimator_dict[col]], axis=-1).mean(axis=-1)[:,None])
    predictions_test = np.concatenate(predictions_test, axis=-1)
    hlp.write_model(predictions_test)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment