Commit cb0328d3 authored by mjboos

party

parent da30ef6a
@@ -21,25 +21,32 @@ from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
 import json
 import feature_engineering
 from functools import partial
+from keras.utils import plot_model

 memory = joblib.Memory(cachedir='/home/mboos/joblib')

-def train_DNN(model_name, *args, **kwargs):
+#TODO: make more things optional
+def train_DNN(model_name, fit_args, *args, **kwargs):
     best_weights_path="{}_best.hdf5".format(model_name)
     model = models.Embedding_Blanko_DNN(**kwargs)
     with open('../model_specs/{}.json'.format(model_name), 'w') as fl:
         json.dump(model.model.to_json(), fl)
+    plot_model(model.model, '../model_specs/{}.png'.format(model_name), show_shapes=True)
     model.fit(*args, **fit_args)
     model.model.load_weights(best_weights_path)
     return model

-def make_callback_list(model_name, patience=5):
+def make_callback_list(model_name, save_weights=True, patience=5):
     '''Makes and returns a callback list for logging, saving the best model, and early stopping with patience=patience'''
     best_weights_path="{}_best.hdf5".format(model_name)
     early = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
     logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
-    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
-    return [logger, checkpoint, early]
+    checkpoints = [early, logger]
+    if save_weights:
+        checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
+        checkpoints.append(checkpoint)
+    return checkpoints
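For context on how the reworked helpers compose: `fit_args` now travels explicitly instead of through a module-level global, and `make_callback_list` can skip checkpointing. A minimal usage sketch (the data and tokenizer names are illustrative, not from this commit):

fit_args = {'batch_size': 256, 'epochs': 10, 'validation_split': 0.1}
fit_args['callbacks'] = make_callback_list('my_rnn', save_weights=True, patience=5)
model = train_DNN('my_rnn', fit_args, train_text, train_y,
                  tokenizer=tokenizer, embedding=embedding)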
 def continue_training_DNN(model_name, *args, **kwargs):
     best_weights_path="{}_best.hdf5".format(model_name)
@@ -47,6 +54,7 @@ def continue_training_DNN(model_name, *args, **kwargs):
     model.model.load_weights(best_weights_path)
     logger = CSVLogger('../logs/{}_more.csv'.format(model_name), separator=',', append=True)
+    best_weights_path="{}_more_best.hdf5".format(model_name)
     early = EarlyStopping(monitor="val_loss", mode="min", patience=10)
     checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
     callbacks_list = [logger, checkpoint, early] #early
     fit_args['callbacks'] = callbacks_list
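Note the separate `{}_more_best.hdf5` target: resuming loads the original best weights but checkpoints to a new file, so the first run's best weights are never overwritten. Assuming the module-level `fit_args` this function still relies on, a call might look like (illustrative names):

model = continue_training_DNN('my_rnn', train_text, train_y,
                              tokenizer=tokenizer, embedding=embedding)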
@@ -74,13 +82,9 @@ def predict_for_all(model):
     predictions = model.predict(test_text)
     hlp.write_model(predictions)

-def fit_model(model_name, *args, **kwargs):
-    best_weights_path="{}_best.hdf5".format(model_name)
-    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
-    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
-    callbacks_list = [logger, checkpoint, early] #early
-    fit_args['callbacks'] = callbacks_list
-    model = train_DNN(model_name, *args, **kwargs)
+def fit_model(model_name, fit_args, *args, **kwargs):
+    fit_args['callbacks'] = make_callback_list(model_name)
+    model = train_DNN(model_name, fit_args, *args, **kwargs)
     return model
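`fit_model` is now just glue: build the callback list, then delegate to `train_DNN` with the explicit `fit_args`. This also removes the old body's reference to an undefined `early`. A sketch of the new call style (illustrative names):

fit_args = {'batch_size': 256, 'epochs': 15, 'validation_split': 0.1}
model = fit_model('my_rnn', fit_args, train_text, train_y,
                  tokenizer=tokenizer, embedding=embedding)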
def load_keras_model(model_name, **kwargs):
@@ -101,3 +101,12 @@ def update_embedding_vec(word_dict, path):
     word_dict.update(other_words)
     return word_dict

+def join_embedding_vec(word_dict, path):
+    other_embeddings = get_fasttext_embedding(path)
+    n_dim = other_embeddings.values()[0].shape
+    for word in word_dict:
+        try:
+            word_dict[word] = np.concatenate([word_dict[word], other_embeddings[word]])
+        except KeyError:
+            word_dict[word] = np.concatenate([word_dict[word], np.zeros(n_dim)])
+    return word_dict
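The intent of `join_embedding_vec`: concatenate each word's vector with its vector from a second embedding, zero-padding words the second embedding lacks. A self-contained toy example of that logic (made-up 3- and 2-dimensional vectors, written to also run on Python 3):

import numpy as np

word_dict = {'cat': np.ones(3), 'dog': np.ones(3)}
other = {'cat': np.array([0.5, 0.5])}          # no entry for 'dog'
n_dim = next(iter(other.values())).shape       # (2,)
for word in word_dict:
    try:
        word_dict[word] = np.concatenate([word_dict[word], other[word]])
    except KeyError:
        word_dict[word] = np.concatenate([word_dict[word], np.zeros(n_dim)])
# 'cat' -> [1. 1. 1. 0.5 0.5], 'dog' -> [1. 1. 1. 0. 0.]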
@@ -235,7 +235,7 @@ def make_model_function(**kwargs):

 class Embedding_Blanko_DNN(BaseEstimator):
     def __init__(self, embedding=None, max_features=20000, model_function=None, tokenizer=None,
-                 maxlen=200, embedding_dim=300, correct_spelling=False, trainable=False, preprocess_embedding=False,
+                 maxlen=300, embedding_dim=300, correct_spelling=False, trainable=False, preprocess_embedding=False,
                  compilation_args={'optimizer':'adam','loss':'binary_crossentropy','metrics':['accuracy']}, embedding_args={'n_components' : 100}):
         self.compilation_args = compilation_args
         self.max_features = max_features
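Raising the default `maxlen` to 300 means every comment is padded or truncated to 300 token ids before the embedding layer; in Keras that is conventionally done with `pad_sequences`. A small sketch:

from keras.preprocessing.sequence import pad_sequences

seqs = [[5, 8, 2], [7, 1, 4, 9, 3]]
X = pad_sequences(seqs, maxlen=300)   # shape (2, 300), zero-padded at the front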
@@ -298,8 +298,11 @@ class Embedding_Blanko_DNN(BaseEstimator):
             inputs = sequence_input
         self.model = Model(inputs=inputs, outputs=outputs)
         self.model.compile(**self.compilation_args)
-        X['main_input'] = self.tokenizer.transform(X['main_input'])
+        if isinstance(X, dict):
+            X['main_input'] = self.tokenizer.transform(X['main_input'])
+        else:
+            X = self.tokenizer.transform(X)
         self.model.fit(X, y, **kwargs)
         return self
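The `isinstance` branch lets `fit` accept either a plain array of texts or, for multi-input architectures, a dict keyed by input-layer name, with only the text input being tokenized. A hedged sketch of the dict form ('aux_input' is a hypothetical second input name; 'main_input' and 'aux_output' follow the naming used in this repo):

X = {'main_input': train_text, 'aux_input': train_features}
y = {'main_output': train_y, 'aux_output': train_aux}
model.fit(X, y)   # fit() tokenizes X['main_input'] only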
@@ -369,9 +372,9 @@ def RNN_aux_loss(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None
     for rnn_size in hidden_rnn:
         x = Dropout(dropout)(x)
         x = Bidirectional(rnn_func(rnn_size, return_sequences=True))(x)
-    aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(x)
     x = GlobalMaxPool1D()(x)
     x = Dropout(dropout)(x)
+    aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(x)
     x = Dense(hidden_dense, activation='relu')(x)
     x = Dropout(dropout)(x)
     x = Dense(6, activation="sigmoid", name='main_output')(x)
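Moving `aux_output` after the pooling step makes the auxiliary head a per-comment prediction of shape (batch, aux_dim) rather than a per-timestep one. Training such a two-headed model uses Keras's per-output losses; a minimal compile sketch (the loss weights are illustrative):

model.compile(optimizer='adam',
              loss={'main_output': 'binary_crossentropy',
                    'aux_output': 'binary_crossentropy'},
              loss_weights={'main_output': 1.0, 'aux_output': 0.1})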
@@ -386,10 +389,10 @@ def RNN_general(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None
         raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
     for rnn_size in hidden_rnn:
         x = Dropout(dropout)(x)
-        x = Bidirectional(rnn_func(rnn_size, return_sequences=True))(x)
+        x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
     x = GlobalMaxPool1D()(x)
     x = Dropout(dropout)(x)
-    x = Dense(hidden_dense, activation='relu')(x)
+    x = Dense(int(hidden_dense), activation='relu')(x)
     x = Dropout(dropout)(x)
     x = Dense(6, activation="sigmoid", name='main_output')(x)
     return x, None
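The new `int()` casts guard against hyperopt: `hp.quniform` (used in the search space at the bottom of this commit) samples floats like 64.0, which unit-count arguments of Keras layers reject, hence the casts. For example:

from hyperopt import hp
from hyperopt.pyll.stochastic import sample

print(sample(hp.quniform('hidden_rnn', 32, 96, 16)))   # e.g. 64.0 -- a float
# Dense(64.0) is rejected as a unit count; Dense(int(64.0)) works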
@@ -41,8 +41,10 @@ def remove_control_chars(s):

 def clean_comment(text):
     import unicodedata as ud
-    text = ud.normalize('NFD', text)
+    text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
     text = re.sub(r'[^\x00-\x7f]', r' ' , text)
+    text = re.sub(r'[\n\r]', r' ', text)
+    text = re.sub(r'["]', r' ', text)
     #without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
     # add space between punctuation
     s = re.sub(r"([.,!?():;_^`<=>$%&@|{}\-+#~*\/])", r' \1 ', text)
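Effect of the added substitutions, combined with the existing NFD + non-ASCII pass: accented characters decompose into a base letter plus a combining mark (the mark then becomes a space), and newlines and double quotes are spaced out too. A quick illustration:

text = u'H\xe9llo\n"world"'
# NFD: 'He\u0301llo\n"world"'
# after the three re.sub calls above: 'He llo  world '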
@@ -51,7 +53,7 @@

 @memory.cache
 def data_preprocessing(df):
     df['comment_text'].fillna(' ', inplace=True)
     df['comment_text'] = df['comment_text'].apply(clean_comment)
     return df
@@ -9,21 +9,39 @@ import pandas as pd, numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
 from sklearn.multioutput import MultiOutputClassifier
-from sklearn.model_selection import cross_val_score, KFold
+from sklearn.model_selection import cross_val_score, KFold, train_test_split
 import helpers as hlp
 import models
 import preprocessing as pre
 import json
+from keras import optimizers
+from keras import backend as K
+from sklearn.metrics import roc_auc_score
+import time
 from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
 from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
 import feature_engineering
 import DNN

-def DNN_model(X, y, fit_args={}, **kwargs):
+def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=5):
     '''Builds and evaluates a CNN on train_text, train_labels'''
-    model = models.Embedding_Blanko_DNN(**kwargs)
-    return validator(model, X, y, fit_args=fit_args)
+    new_dict = {key:val for key, val in fixed_args.items()}
+    new_dict.update(kwargs)
+    new_dict['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, beta_2=0.99)
+    new_time = 'cval_{}'.format(time.strftime("%m%d-%H%M"))
+    kfold = KFold(n_splits=cv, shuffle=True)
+    scores = []
+    Xs = np.zeros((len(X), 1), dtype='int8')
+    for train, test in kfold.split(Xs):
+        train_x = [X[i] for i in train]
+        test_x = [X[i] for i in test]
+        estimator = DNN.fit_model(new_time, fit_args, train_x, y[train], **new_dict)
+        predictions = estimator.predict(test_x)
+        scores.append(hlp.mean_log_loss(y[test], predictions))
+    score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
+    K.clear_session()
+    return score_dict
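`DNN_model_validate` is shaped for hyperopt: the data-dependent arguments get frozen with `partial` (as done in `hyperopt_token_model` below), leaving a function of the sampled hyperparameters that returns a `STATUS_OK` dict. Schematically, with a small stand-in search space:

from functools import partial
from hyperopt import fmin, tpe, hp, Trials

objective = partial(DNN_model_validate, train_text, train_y, fit_args, fixed_args)
space = {'model_function': {'dropout': hp.uniform('dropout', 0.3, 0.9)}}
best = fmin(objective, space=space, algo=tpe.suggest, max_evals=5, trials=Trials())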
def do_hyper_search(space, model_function, **kwargs):
'''Do a search over the space using a frozen model function'''
@@ -38,6 +56,13 @@ def RF_model(X, y, kwargs):
     model = MultiOutputClassifier(RandomForestClassifier(**kwargs))
     return validator(model, X, y)

+def test_set_validator(estimator, X, y, split=0.3, fit_args={}, **kwargs):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)
+    estimator.fit(X_train, y_train, **fit_args)
+    predictions = estimator.predict(X_test)
+    score_dict = {'loss' : hlp.mean_log_loss(y_test, predictions), 'auc' : roc_auc_score(y_test, predictions), 'status' : STATUS_OK}
+    return score_dict
+
+#TODO: more information??
 def validator(estimator, X, y, cv=3, fit_args={}, **kwargs):
     '''Validate mean log loss of model'''
@@ -55,7 +80,7 @@ def validator(estimator, X, y, cv=3, fit_args={}, **kwargs):
 #TODO: add other params
 #TODO: model_func_param
-def validate_token_model(model_name, model_function, space, maxlen=300, max_features=500000):
+def hyperopt_token_model(model_name, model_function, space, maxlen=300, max_features=500000):
     train_text, train_y = pre.load_data()
     test_text, _ = pre.load_data('test.csv')
@@ -63,40 +88,41 @@
                                          max_features=max_features)
     frozen_tokenizer.fit(pd.concat([train_text, test_text]))
     embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
-    compilation_args = {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}
-    callbacks_list = [EarlyStopping(monitor="val_loss", mode="min", patience=5)]
-    fit_args = {'batch_size' : 256, 'epochs' : 15,
-                'validation_split' : 0.1, 'callbacks' : callbacks_list}
+    compilation_args = {'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}
+    fit_args = {'batch_size' : 256, 'epochs' : 30,
+                'validation_split' : 0.1}
+    fixed_args = {'tokenizer':frozen_tokenizer, 'embedding':embedding, 'compilation_args':compilation_args}
     # freeze all constant parameters
-    frozen_model_func = partial(model_function, train_text, train_y, fit_args=fit_args,
-                                tokenizer=frozen_tokenizer, embedding=embedding, compilation_args=compilation_args)
+    frozen_model_func = partial(model_function, train_text, train_y, fit_args, fixed_args)
     trials = Trials()
-    best = fmin(model_function, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
+    best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=5, trials=trials)
     hlp.dump_trials(trials, fname=model_name)
     return best

 #TODO: better feature selection
-def validate_feature_model(model_name, model_function, space, fixed_params_file='../parameters/fixed_features.json', max_evals=10):
+def validate_feature_model(model_name, model_function, space, fixed_params_file='../parameters/fixed_features.json', max_evals=20, trials=None):
     with open(fixed_params_file, 'r') as fl:
         fixed_params_dict = json.load(fl)
     which_features = fixed_params_dict.pop('features')
     train_text, train_y = pre.load_data()
     train_ft = feature_engineering.compute_features(train_text, which_features=which_features)
     frozen_model_func = partial(model_function, train_ft, train_y, **fixed_params_dict)
-    trials = Trials()
-    best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=10, trials=trials)
+    if not trials:
+        trials = Trials()
+    best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
     hlp.dump_trials(trials, fname=model_name)
     return best

 if __name__=='__main__':
-    DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [1, 2]),
-#                        'rnn_func' : hp.choice('rnn_func', [models.CuDNNLSTM, models.CuDNNGRU]),
-                        'hidden_rnn' : hp.quniform('hidden_rnn', 32,128,16),
-                        'hidden_dense' : hp.quniform('hidden_dense', 16, 64, 8),
-                        'dropout' : hp.uniform('dropout', 0.3, 0.9)}}
+    DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [2]),
+                        'rnn_func' : hp.choice('rnn_func', [models.CuDNNLSTM, models.CuDNNGRU]),
+                        'hidden_rnn' : hp.quniform('hidden_rnn', 32, 96, 16),
+                        'hidden_dense' : hp.quniform('hidden_dense', 16, 64, 8)}}
     token_models_to_test = {
-        'DNN' : (DNN_model, DNN_search_space)}
+        'DNN' : (DNN_model_validate, DNN_search_space)}
     for model_name, (func, space) in token_models_to_test.iteritems():
-        best = validate_token_model(model_name, func, space)
+        best = hyperopt_token_model(model_name, func, space)
         joblib.dump(best, 'best_{}.pkl'.format(model_name))
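The new `trials` parameter on `validate_feature_model` makes searches resumable: a `Trials` object from an earlier run can be passed back in so `fmin` continues counting toward `max_evals` instead of starting over. A sketch (`hlp.load_trials` is a hypothetical counterpart to the `hlp.dump_trials` used above):

trials = hlp.load_trials('feature_model')
best = validate_feature_model('feature_model', model_function, space,
                              max_evals=40, trials=trials)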