Commit 0559937c authored by mjboos

more coms

parent 6020f288
@@ -37,7 +37,7 @@ def train_DNN(model_name, fit_args, *args, **kwargs):
model.model.load_weights(best_weights_path)
return model
def make_callback_list(model_name, save_weights=True, patience=5):
def make_callback_list(model_name, save_weights=True, patience=10):
'''Makes and returns a callback list for logging, saving the best model, and early stopping with patience=patience'''
best_weights_path="{}_best.hdf5".format(model_name)
early = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
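# This hunk does not show the rest of the callback list; judging from the docstring,
# the "saving the best model" part would typically be a Keras ModelCheckpoint along
# these lines (illustrative sketch only, not the actual code):
#   checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss',
#                                save_best_only=True, mode='min')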
@@ -55,6 +55,24 @@ def continue_training_DNN(model_name, fit_args, *args, **kwargs):
callbacks_list = make_callback_list(model_name+'_more', patience=3)
fit_args['callbacks'] = callbacks_list
model.fit(*args, **fit_args)
model.model.load_weights(best_weights_path)
return model
def freeze_layers(model, unfrozen_types=[], unfrozen_keyword=None):
""" Freezes all layers in the given model, except for ones that are
explicitly specified to not be frozen.
# Arguments:
model: Model whose layers should be modified.
unfrozen_types: List of layer types which shouldn't be frozen.
unfrozen_keyword: Name keywords of layers that shouldn't be frozen.
# Returns:
Model with the selected layers frozen.
"""
for l in model.layers:
if len(l.trainable_weights):
trainable = (type(l) in unfrozen_types or
(unfrozen_keyword is not None and unfrozen_keyword in l.name))
change_trainable(l, trainable, verbose=False)
return model
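# Usage sketch (illustrative): freeze everything except the classification head before
# fine-tuning on a single label, as continue_training_DNN_one_output does below:
#   model.model = freeze_layers(model.model, unfrozen_keyword='main_output')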
def continue_training_DNN_one_output(model_name, i, weights, fit_args, *args, **kwargs):
@@ -62,6 +80,7 @@ def continue_training_DNN_one_output(model_name, i, weights, fit_args, *args, **kwargs):
model = models.Embedding_Blanko_DNN(**kwargs)
transfer_weights_multi_to_one(weights, model.model, i)
callbacks_list = make_callback_list(model_name, patience=5)
model.model = freeze_layers(model.model, unfrozen_keyword='main_output')
fit_args['callbacks'] = callbacks_list
model.fit(*args, **fit_args)
model.model.load_weights(best_weights_path)
@@ -77,7 +96,7 @@ def predict_for_all(model):
hlp.write_model(predictions)
def conc_finetuned_preds(model_name):
predictions = np.concatenate([joblib.load('{}_{}.pkl'.format(model_name,i))[:,None] for i in xrange(6)], axis=1)
predictions = np.concatenate([joblib.load('{}_{}.pkl'.format(model_name,i)) for i in xrange(6)], axis=1)
hlp.write_model(predictions)
def fit_model(model_name, fit_args, *args, **kwargs):
@@ -113,9 +132,13 @@ def transfer_weights_multi_to_one(weights, model, i):
def fine_tune_model(model_name, old_model, fit_args, train_X, train_y, test_text, **kwargs):
'''Fits a separate fine-tuned model for each label and dumps its test-set predictions to disk'''
weights = [layer.get_weights() for layer in old_model.layers]
for i in xrange(4, 6):
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
for i in xrange(6):
new_name = model_name + '_{}'.format(i)
model = continue_training_DNN_one_output(new_name, i, weights, fit_args, train_X, train_y[:,i], **kwargs)
joblib.dump(model.predict(test_text), '{}.pkl'.format(new_name))
K.clear_session()
if 'compilation_args' in kwargs:
kwargs['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0001, clipnorm=1.)
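# Note: K.clear_session() inside the loop above releases the previous per-label model's
# TensorFlow graph and GPU memory before the next one is built, so six separate models
# do not accumulate in a single session.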
@@ -57,12 +57,60 @@ def simple_one_output_net():
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_net():
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
def toxic_skip_net():
model_func = partial(models.RNN_aux_loss_skip, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=48, hidden_dense=20)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : False,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 1.]}}
return model_params
def simple_aug_net(trainable=False, prune=True):
model_func = partial(models.RNN_augment, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_embedding_net(trainable=False, prune=True):
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_attention_1d(trainable=False, prune=True):
model_func = partial(models.RNN_attention_1d, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_attention(trainable=False, prune=True):
model_func = partial(models.RNN_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_net(trainable=False, prune=True):
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def shallow_CNN(trainable=False, prune=True):
model_func = partial(models.CNN_shallow, n_filters=50, kernel_sizes=[3,4,5], dropout=0.5)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
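# Each *_net() helper above returns a model_params dict that the __main__ block unpacks
# into the model builders / fit helpers, e.g. (sketch mirroring the code further down):
#   model_params = simple_attention()
#   model = load_full_model(model_name, embedding=embedding,
#                           tokenizer=frozen_tokenizer, **model_params)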
def add_net():
@@ -74,29 +122,34 @@ def add_net():
return model_params
if __name__=='__main__':
aux_task = train_y.sum(axis=1) > 0
# for toxic skip
aux_task = train_y[:,0]
# train_y = np.delete(train_y, 0, axis=1)
train_data_augmentation = pre.pad_and_extract_capitals(train_text)[..., None]
test_data_augmentation = pre.pad_and_extract_capitals(test_text)[..., None]
class_weights = hlp.get_class_weights(train_y)
weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
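# functools.partial objects have no __name__ attribute; Keras presumably looks one up
# when naming/serializing the loss, so it is set explicitly on the partial here.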
model_params = simple_one_output_net()
model_name = '300_fasttext_cuda_2_layers_larger_GRU'
model_params = simple_attention()
model_name = '300_fasttext_attention_GRU'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# SHUFFLE TRAINING SET so validation split is different every time
row_idx = np.arange(0, train_text.shape[0])
np.random.shuffle(row_idx)
train_text, train_y, aux_task = train_text[row_idx], train_y[row_idx], aux_task[row_idx]
keras_model = load_keras_model(model_name)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# row_idx = np.arange(0, train_text.shape[0])
# np.random.shuffle(row_idx)
# train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
# model = load_keras_model(model_name)
model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = fit_model(model_name, fit_args, {'main_input':train_text, 'aug_input':train_data_augmentation}, {'main_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
# hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = load_full_model(model_name+'_0', embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
fine_tune_model(model_name, keras_model, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
conc_finetuned_preds(model_name)
# hlp.write_model(model.predict(test_text))
# K.clear_session()
# model_params['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, beta_2=0.99)
# model = continue_training_DNN(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
model_params = simple_attention_1d()
fine_tune_model(model_name, model.model, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
@@ -58,3 +58,6 @@ def compute_features(text_df, which_features=None):
feature_data[:,i] = features
return feature_data
def caps_vec(input_text):
split_text = text.text_to_word_sequence(input_text, filters="\n\t", lower=False)
return np.array([1 if (word.isupper() and len(word)>1) else 0 for word in split_text])
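# Example (illustrative): caps_vec("STOP doing that NOW") -> array([1, 0, 0, 1]),
# i.e. one flag per whitespace-separated token, marking fully-capitalised words.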
@@ -44,8 +44,8 @@ def write_model(predictions, correct=None,
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
import pandas as pd
import time
if correct:
predictions = correct(predictions)
if isinstance(predictions, list):
predictions = np.concatenate(predictions, axis=-1)
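# e.g. a list of six (n_samples, 1) per-label prediction arrays (one per column in cols)
# is stacked into a single (n_samples, 6) matrix before the submission frame is built.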
timestr = time.strftime("%m%d-%H%M")
subm = pd.read_csv('../input/sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
@@ -94,14 +94,24 @@ def predictions_for_language(language_dict, test_data=None):
@memory.cache
def get_fasttext_rank(fasttext_path):
rank_idx = {}
rank_index = {}
with open(fasttext_path, 'r') as f:
for nr, line in enumerate(f):
values = line.split()
word = values[0]
rank_index[word] = nr
return rank_idx
return rank_index
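# Usage sketch (illustrative, assuming the .vec file lists vectors in descending
# frequency order, as the fastText crawl vectors do):
#   rank = get_fasttext_rank('../crawl-300d-2M.vec')
#   rank['the']  # -> a small integer, i.e. a high-frequency word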
def make_training_set_preds(model, train_data, train_y, split=0.2):
import time
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
split_n = np.round(split*train_data['main_input'].shape[0]).astype('int')
predictions = model.predict({label: data[-split_n:] for label, data in train_data.iteritems()})
df_dict = {'predictions_{}'.format(lbl) : preds for lbl, preds in zip(cols, predictions.T)}
df_dict.update({lbl : lbl_col for lbl, lbl_col in zip(cols, train_y[-split_n:].T)})
df_dict['text'] = train_data['main_input'][-split_n:]
df = pd.DataFrame(df_dict)
df.to_csv('predictions_{}.csv'.format(time.strftime("%m%d-%H%M")))
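# (Assumes the model was fit with validation_split=split: Keras holds out the *last*
# fraction of the data, so the rows written here correspond to the validation set.)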
def dump_trials(trials, fname=''):
import time
@@ -121,3 +131,4 @@ def join_embedding_vec(word_dict, path):
except KeyError:
word_dict[word] = np.concatenate([word_dict[word], np.zeros(n_dim)])
return word_dict
@@ -11,7 +11,7 @@ from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
import re, string
from sklearn.base import BaseEstimator, TransformerMixin
import feature_engineering
import string
eng_stopwords = set(stopwords.words("english"))
memory = joblib.Memory(cachedir='/home/mboos/joblib')
@@ -44,27 +44,38 @@ def clean_comment(text):
text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
text = re.sub(r'[^\x00-\x7f]', r' ' , text)
text = re.sub(r'[\n\r]', r' ', text)
s = re.sub(r"what's", "what is ", text, flags=re.IGNORECASE)
s = re.sub(r"\'s", " ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ve", " have ", s, flags=re.IGNORECASE)
s = re.sub(r"can't", "cannot ", s, flags=re.IGNORECASE)
s = re.sub(r"n't", " not ", s, flags=re.IGNORECASE)
s = re.sub(r"i'm", "i am ", s, flags=re.IGNORECASE)
s = re.sub(r"\'re", " are ", s, flags=re.IGNORECASE)
s = re.sub(r"\'d", " would ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ll", " will ", s, flags=re.IGNORECASE)
s = re.sub(r"\'scuse", " excuse ", s, flags=re.IGNORECASE)
#without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
# add spaces around punctuation characters
s = re.sub(r'([.,!?():;_^`<=>$%&@|{}\-+#~*\/"])', r' \1 ', text)
s = re.sub(r'([.,!?():;_^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)
s = re.sub(r"(['])", r' \1 ', s)
s = re.sub('\s{2,}', ' ', s)
return s.encode('utf-8')
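# Example (illustrative, for the updated version): clean_comment("What's that?!")
# -> "what is that ? ! " -- contractions expanded, punctuation space-separated,
# repeated whitespace collapsed.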
@memory.cache
def data_preprocessing(df):
df['comment_text'].fillna(' ', inplace=True)
df['comment_text'].fillna(' ', inplace=True)
df['comment_text'] = df['comment_text'].apply(clean_comment)
return df
def load_data(name='train.csv', preprocess=True, cut=True):
def load_data(name='train.csv', preprocess=True, cut=False):
data = pd.read_csv('../input/{}'.format(name), encoding='utf-8')
if preprocess:
data = data_preprocessing(data)
if cut:
if cut and name=='train.csv':
# these comments are often (or always) mis-labeled
not_toxic_but_nz = np.logical_and(train_y[:,0]==0, np.logical_not(all_zero))
not_toxic_but_nz = np.logical_and(data.iloc[:,2].values==0, data.iloc[:,2:].values.any(axis=1))
data = data.drop(data.index[np.where(not_toxic_but_nz)[0]])
text = data['comment_text']
text = data['comment_text'].reset_index(drop=True)
labels = data.iloc[:, 2:].values
# data_dict = {'babel' : [text, labels]}
return text, labels
@@ -76,7 +87,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
def __init__(self, max_features=20000, maxlen=200,
filters='\t\n', **kwargs):
filters="\t\n", **kwargs):
self.max_features = max_features
self.maxlen = maxlen
self.is_trained = False
@@ -89,3 +100,7 @@ class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
def transform(self, list_of_sentences):
return sequence.pad_sequences(self.tokenizer.texts_to_sequences(list_of_sentences), maxlen=self.maxlen)
def pad_and_extract_capitals(df, maxlen=500):
train_data_augmentation = df.apply(feature_engineering.caps_vec)
return sequence.pad_sequences([caps for caps in train_data_augmentation], maxlen=maxlen)
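# Usage sketch (illustrative): the resulting (n_samples, maxlen) capitalisation mask is
# used as the extra 'aug_input' of the augmented models, e.g.
#   train_data_augmentation = pad_and_extract_capitals(train_text)[..., None]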