Commit ba182e10 authored by mjboos

recent

parent a9756e45
@@ -234,7 +234,7 @@ def simple_attention_1d(trainable=False, prune=True):
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
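# Note: splitting the optimizer into a factory ('optimizer_func') plus kwargs
# ('optimizer_args') lets callers rebuild it per CV fold; a pre-instantiated
# optimizer cannot safely be reused once the Keras session is cleared.
# A minimal sketch of the assumed consumer:
#   args = model_params['compilation_args']
#   optimizer = args['optimizer_func'](**args['optimizer_args'])  # Adam(lr=0.001, clipnorm=1.)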
def conc_attention(trainable=False, prune=True):
@@ -242,7 +242,7 @@ def conc_attention(trainable=False, prune=True):
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_attention(trainable=False, prune=True):
@@ -250,7 +250,7 @@ def simple_attention(trainable=False, prune=True):
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
'compilation_args' : {'optimizer_func' : optimizers.Adam, 'optimizer_args' : {'lr' : 0.001, 'clipnorm' : 1.}, 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_attention_dropout(trainable=False, prune=True):
@@ -270,7 +270,7 @@ def simple_attention_word_dropout(trainable=False, prune=True):
return model_params
def simple_net(trainable=False, prune=True):
model_func = partial(models.RNN_conc, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1, hidden_rnn=128, hidden_dense=48)
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=128)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
import helpers as hlp
import models
import preprocessing as pre
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import json
memory = joblib.Memory(cachedir='/home/mboos/joblib')
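# disk-backed cache: memoized functions are re-run only for unseen arguments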
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
def schedule(ind):
a = [0.002, 0.002, 0.002, 0.001, 0.001]
return a[min(ind, len(a) - 1)]  # clamp so epochs beyond the schedule keep the last rate
lr = LearningRateScheduler(schedule)
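# note: 'lr' only takes effect if added to callbacks_list below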
callbacks_list = [checkpoint, early]
fit_args = {'batch_size' : 256, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
# for now, use only the English data for the model
train_per_language = pre.load_data()
train_text, train_y = train_per_language['en']
test_per_language = pre.load_data('test.csv')
test_text, _ = test_per_language['en']
#FOR NOW!!
#train_text, train_y = pre.load_data(language=False)['babel']
#test_text, _ = pre.load_data('test.csv', language=False)['babel']
def train_DNN(embeddings_index, **kwargs):
model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
model.fit(train_text, train_y, **fit_args)
model.model.load_weights(best_weights_path)
return model
def load_DNN_weights(embeddings_index, weights_path='weights_base.best.hdf5',**kwargs):
model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
fit_args_tmp = {'batch_size' : 128, 'epochs' : 1,
'validation_split' : 0.9}
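# one throwaway epoch (only 10% of the data is trained on) just builds the model graph before the saved weights are loaded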
model.fit(train_text, train_y, **fit_args_tmp)
model.model.load_weights(weights_path)
return model
def DNN_EN_to_language_dict(model_english, train_per_language, simple_for=None):
language_dict = models.make_default_language_dict()
language_dict['en'] = model_english
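# optionally back off to a simple tf-idf model for the listed languages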
if simple_for:
for simple_lan in simple_for:
language_dict[simple_lan] = models.tfidf_model().fit(*train_per_language[simple_lan])
hlp.write_model(hlp.predictions_for_language(language_dict))
def predict_for_all(model):
test_text, _ = pre.load_data('test.csv', language=False)['babel']
predictions = model.predict(test_text)
hlp.write_model(predictions)
if __name__=='__main__':
maxlen = 200
max_features = 500000
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
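# fit the vocabulary on train and test text so token indices cover both sets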
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
model_name = '300_fasttext_LSTM'
logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
callbacks_list = [logger, checkpoint, early]
fit_args['callbacks'] = callbacks_list
embedding_dim = 300
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
model = train_DNN(embedding, maxlen=maxlen,
max_features=max_features, model_function=models.LSTM_dropout_model,
embedding_dim=embedding_dim, tokenizer=frozen_tokenizer,
compilation_args={'optimizer' : 'nadam', 'loss':'binary_crossentropy','metrics':['accuracy']})
# joblib.dump(model, '../models/{}.pkl'.format(model_name))
predict_for_all(model)
# DNN_EN_to_language_dict(model, train_per_language)
#
# checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# logger = CSVLogger('../logs/300_fasttext_LSTM.csv', separator=',', append=False)
# callbacks_list = [logger, checkpoint, early] #early
# fit_args['callbacks'] = callbacks_list
# DNN_EN_to_language_dict(
# train_DNN(embedding, trainable=False, maxlen=maxlen,
# max_features=max_features, model_function=models.LSTM_dropout_model,
# embedding_dim=embedding_dim, tokenizer=frozen_tokenizer,
# compilation_args={'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}))
#
@@ -551,7 +551,7 @@ def RNN_diff_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_f
x = Dense(6, activation="sigmoid", name='main_output')(x)
return x, None
def RNN_channel_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, dropout_dense=0.5, input_len=500, train_embedding=False):
def RNN_channel_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout_embed=0.2, dropout=0.5, dropout_dense=0.5, input_len=500, train_embedding=False):
if rnn_func is None:
rnn_func = CuDNNLSTM
if not isinstance(hidden_rnn, list):
@@ -562,8 +562,10 @@ def RNN_channel_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dens
vals = [x]
else:
vals = []
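# noise_shape (None, 1, channels): the mask is shared across timesteps, so entire embedding channels are dropped (channel dropout)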
x = Dropout(dropout_embed, noise_shape=(None, 1, int(x.shape[-1])))(x)
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x, noise_shape=(None, 1, x.shape[-1]))
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
vals.append(x)
if len(vals) > 1:
@@ -578,7 +580,7 @@ def RNN_channel_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dens
x = Dense(6, activation="sigmoid", name='main_output')(x)
return x, None
def RNN_time_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, dropout_dense=0.5, input_len=500, train_embedding=False):
def RNN_time_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout_embed=0.2, dropout=0.5, dropout_dense=0.5, input_len=500, train_embedding=False):
if rnn_func is None:
rnn_func = CuDNNLSTM
if not isinstance(hidden_rnn, list):
@@ -589,8 +591,9 @@ def RNN_time_dropout_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=4
vals = [x]
else:
vals = []
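# noise_shape (None, timesteps, 1): the mask is shared across channels, so entire word vectors are dropped (word dropout)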
x = Dropout(dropout_embed, noise_shape=(None, int(x.shape[1]), 1))(x)
for rnn_size in hidden_rnn:
x = Dropout(dropout)(x, noise_shape=(None, x.shape[1], 1))
x = Dropout(dropout)(x)
x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
vals.append(x)
if len(vals) > 1:
......
@@ -23,7 +23,7 @@ from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateSchedule
import feature_engineering
import DNN
from sklearn.metrics import roc_auc_score  # used for the AUC scoring below
def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=3):
def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, adam_args=None, cv=5):
'''Builds and cross-validates a DNN on X, y; adam_args, if given, override the optimizer arguments'''
new_dict = {key:val for key, val in fixed_args.items()}
new_dict.update(kwargs)
@@ -33,13 +33,13 @@ def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=3):
Xs = np.zeros((len(X),1), dtype='int8')
predictions = []
for train, test in kfold.split(Xs):
new_dict['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, clipnorm=1.)
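# rebuild the optimizer for every fold; an optimizer instance cannot safely be reused after K.clear_session()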
opt_args = adam_args if adam_args is not None else new_dict['compilation_args']['optimizer_args']
new_dict['compilation_args']['optimizer'] = new_dict['compilation_args']['optimizer_func'](**opt_args)
train_x = [X[i] for i in train]
test_x = [X[i] for i in test]
model_time = '{}_{}'.format(new_time, time.strftime("%m%d-%H%M"))
estimator = DNN.fit_model(model_time, fit_args, train_x, y[train], **new_dict)
predictions.append(estimator.predict(test_x))
scores.append(hlp.mean_log_loss(y[test], predictions[-1]))
scores.append(roc_auc_score(y[test], predictions[-1]))
K.clear_session()
score_dict = {'loss' : -np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}  # negate: fmin minimizes 'loss', but scores are AUC
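# out-of-fold predictions in fold order (presumably kept for later stacking/ensembling)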
predictions = np.vstack(predictions)
@@ -84,16 +84,20 @@ def validator(estimator, X, y, cv=3, fit_args={}, **kwargs):
#TODO: add other params
#TODO: model_func_param
def hyperopt_token_model(model_name, model_function, space, maxlen=300, max_features=500000):
def hyperopt_token_model(model_name, model_function, space, fixed_args):
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=maxlen,
max_features=max_features)
# remove keys that are in space from fixed_args
all_search_space_keys = list(space.keys()) + [key for subspace in space.values() if isinstance(subspace, dict) for key in subspace]
fixed_args = {key : val for key, val in fixed_args.iteritems() if key not in all_search_space_keys}
frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=fixed_args['maxlen'],
max_features=fixed_args['max_features'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
compilation_args = {'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}
fit_args = {'batch_size' : 256, 'epochs' : 30,
fit_args = {'batch_size' : 80, 'epochs' : 30,
'validation_split' : 0.1}
fixed_args.update({'tokenizer' : frozen_tokenizer, 'embedding' : embedding})
fixed_args['compilation_args'].update(compilation_args)
@@ -102,7 +106,7 @@ def hyperopt_token_model(model_name, model_function, space, maxlen=300, max_feat
frozen_model_func = partial(model_function, train_text, train_y, fit_args, fixed_args)
trials = Trials()
best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=5, trials=trials)
best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
hlp.dump_trials(trials, fname=model_name)
return best
@@ -122,36 +126,32 @@ def validate_feature_model(model_name, model_function, space, fixed_params_file=
def do_hyperparameter_search():
DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [2]),
'rnn_func' : hp.choice('rnn_func', [models.CuDNNLSTM, models.CuDNNGRU]),
'hidden_rnn' : hp.quniform('hidden_rnn', 32, 96, 16),
'hidden_dense' : hp.quniform('hidden_dense', 16, 64, 8)}}
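# the quniform entries sample layer sizes on discrete grids (step 16 for the RNN, step 8 for the dense layer)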
token_models_to_test = {
'DNN' : (DNN_model_validate, DNN_search_space)}
for model_name, (func, space) in token_models_to_test.iteritems():
best = hyperopt_token_model(model_name, func, space)
'DNN' : (DNN_model_validate, DNN_search_space, DNN.simple_attention())}
for model_name, (func, space, fixed_args) in token_models_to_test.iteritems():
best = hyperopt_token_model(model_name, func, space, fixed_args)
joblib.dump(best, 'best_{}.pkl'.format(model_name))
def test_models():
fit_args = {'batch_size' : 80, 'epochs' : 30,
fit_args = {'batch_size' : 80, 'epochs' : 10,
'validation_split' : 0.2}
fixed_args = DNN.simple_attention_dropout()
fixed_args = DNN.simple_attention_channel_dropout()
kwargs = {}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
adam_args = {'clipnorm' : 1., 'lr' : 0.001}
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=fixed_args['max_features'], maxlen=fixed_args['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
kwargs['embedding'] = embedding
kwargs['tokenizer'] = frozen_tokenizer
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
fixed_args = DNN.simple_attention()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
fixed_args = DNN.simple_attention_channel_dropout()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
fixed_args = DNN.simple_attention_word_dropout()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, cv=3)
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
fixed_args = DNN.conc_attention()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
if __name__=='__main__':
test_models()