Commit da30ef6a authored by mjboos

many changes

parent c16dc699
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from keras import backend as K
from sklearn.utils import compute_class_weight
import keras
from sklearn.model_selection import cross_val_score
import helpers as hlp
import models
import preprocessing as pre
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import json
import feature_engineering
from functools import partial
memory = joblib.Memory(cachedir='/home/mboos/joblib')
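# NOTE: the helpers below reference module-level `fit_args`, `early`, `train_text`, `train_y`
# and `test_text`; these mirror the definitions in the accompanying training script and must
# be set in this module before the helpers are called.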
def train_DNN(model_name, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    model = models.Embedding_Blanko_DNN(**kwargs)
    with open('../model_specs/{}.json'.format(model_name), 'w') as fl:
        json.dump(model.model.to_json(), fl)
    model.fit(*args, **fit_args)
    model.model.load_weights(best_weights_path)
    return model
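# The JSON dump above pairs with load_keras_model() below: to_json() is written with json.dump,
# so loading goes through json.load followed by keras.models.model_from_json.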
def make_callback_list(model_name, patience=5):
    '''Makes and returns a callback list for logging, saving the best model, and early stopping with patience=patience'''
    best_weights_path = "{}_best.hdf5".format(model_name)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    return [logger, checkpoint, early]
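# Illustrative sketch (not in the original code): make_callback_list can replace the manual
# callback wiring repeated in fit_model/continue_training_DNN below, e.g.:
#     fit_args['callbacks'] = make_callback_list('some_model_name', patience=5)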
def continue_training_DNN(model_name, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    model = models.Embedding_Blanko_DNN(**kwargs)
    model.model.load_weights(best_weights_path)
    logger = CSVLogger('../logs/{}_more.csv'.format(model_name), separator=',', append=True)
    best_weights_path = "{}_more_best.hdf5".format(model_name)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [logger, checkpoint, early]
    fit_args['callbacks'] = callbacks_list
    model.fit(*args, **fit_args)
    return model
def continue_training_DNN_one_output(model_name, i, weights, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    model = models.Embedding_Blanko_DNN(**kwargs)
    transfer_weights_multi_to_one(weights, model.model, i)
    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [logger, checkpoint, early]
    fit_args['callbacks'] = callbacks_list
    model.fit(*args, **fit_args)
    model.model.load_weights(best_weights_path)
    return model
def predict_for_one_category(model_name, model):
    predictions = model.predict(test_text)
    joblib.dump(predictions, '{}.pkl'.format(model_name))

def predict_for_all(model):
    test_text, _ = pre.load_data('test.csv')
    predictions = model.predict(test_text)
    hlp.write_model(predictions)
def fit_model(model_name, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [logger, checkpoint, early]
    fit_args['callbacks'] = callbacks_list
    model = train_DNN(model_name, *args, **kwargs)
    return model
def load_keras_model(model_name, **kwargs):
    from keras.models import model_from_json
    model_path = '../model_specs/{}.json'.format(model_name)
    with open(model_path, 'r') as fl:
        model = model_from_json(json.load(fl))
    return model
def load_full_model(model_name, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    model = models.Embedding_Blanko_DNN(**kwargs)
    model.model.load_weights(best_weights_path)
    return model
def hacky_load_LSTM():
    model_name = '300_fasttext_LSTM_test'
    model = load_keras_model(model_name)
    model.load_weights('300_fasttext_LSTM_best.hdf5')
    return model
def transfer_weights_multi_to_one(weights, model, i):
    for weights_old, layer in zip(weights[2:-1], model.layers[2:-1]):
        layer.set_weights(weights_old)
    # now for the last layer: keep only column i of the kernel and entry i of the bias,
    # turning the 6-way output layer into a single-output layer
    model.layers[-1].set_weights([weights[-1][0][:,i][:,None], weights[-1][1][i][None]])
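# Illustrative sketch (not part of the original code): the final-layer surgery above keeps
# column i of the 6-way kernel and entry i of the bias, yielding weights for a Dense(1) layer.
def _demo_last_layer_slicing(i=0):
    kernel = np.random.randn(32, 6)      # hypothetical kernel of a Dense(6) output layer
    bias = np.random.randn(6)            # hypothetical bias of a Dense(6) output layer
    new_kernel = kernel[:, i][:, None]   # shape (32, 1)
    new_bias = bias[i:i+1]               # shape (1,)
    return new_kernel.shape, new_bias.shape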
def fine_tune_model(model_name, old_model, train_X, train_y, **kwargs):
    '''Fine-tunes one single-output copy of old_model per label (indices 0-5) and saves its test predictions'''
    weights = [layer.get_weights() for layer in old_model.layers]
    for i in xrange(6):
        new_name = model_name + '_{}'.format(i)
        predict_for_one_category(new_name,
                continue_training_DNN_one_output(new_name, i, weights, train_X, train_y[:,i], **kwargs))
if __name__=='__main__':
    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=1)
    aux_task = train_y.sum(axis=1) > 0
    class_weights = hlp.get_class_weights(train_y)
    weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
    loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
    loss.__name__ = 'weighted_binary_crossentropy'
    # note: the weighted loss above is prepared, but the compilation below uses plain binary_crossentropy
    model_params = {
        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
        'embedding_dim' : 300, 'trainable' : False,
        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss' : {'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
    frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
    frozen_tokenizer.fit(pd.concat([train_text, test_text]))
    model_name = '300_fasttext_cuda_just_that_LSTM'
    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
    # model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    model = fit_model(model_name, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    hlp.write_model(model.predict(test_text))
    # K.clear_session()
    # model_params['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, beta_2=0.99)
    # model = continue_training_DNN(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    # hlp.write_model(model.predict(test_text))
@@ -3,12 +3,14 @@
from __future__ import division
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import joblib
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from keras import backend as K
from sklearn.utils import compute_class_weight
import keras
from sklearn.model_selection import cross_val_score
import helpers as hlp
@@ -19,12 +21,13 @@ from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateSchedule
import json
import feature_engineering
from functools import partial
from DNN import *
memory = joblib.Memory(cachedir='/home/mboos/joblib')
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=3)
early = EarlyStopping(monitor="val_loss", mode="min", patience=10)
def schedule(ind):
    # learning-rate schedule for the first five epochs; used via LearningRateScheduler(schedule)
    a = [0.002,0.002,0.002,0.001,0.001]
    return a[ind]
@@ -32,112 +35,30 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early]
fit_args = {'batch_size' : 256, 'epochs' : 20,
fit_args = {'batch_size' : 256, 'epochs' : 30,
            'validation_split' : 0.2, 'callbacks' : callbacks_list}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
def train_DNN(model_name, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    model = models.Embedding_Blanko_DNN(**kwargs)
    with open('../model_specs/{}.json'.format(model_name), 'w') as fl:
        json.dump(model.model.to_json(), fl)
    model.fit(*args, **fit_args)
    model.model.load_weights(best_weights_path)
    return model

def continue_training_DNN(model_name, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    model = models.Embedding_Blanko_DNN(**kwargs)
    model.model.load_weights(best_weights_path)
    logger = CSVLogger('../logs/{}_more.csv'.format(model_name), separator=',', append=True)
    best_weights_path = "{}_more_best.hdf5".format(model_name)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [logger, checkpoint, early]
    fit_args['callbacks'] = callbacks_list
    model.fit(*args, **fit_args)
    return model

def continue_training_DNN_one_output(model_name, i, weights, **kwargs):
    best_weights_path = "{}_{}_best.hdf5".format(model_name, i)
    model = models.Embedding_Blanko_DNN(**kwargs)
    transfer_weights_multi_to_one(weights, model.model, i)
    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [logger, checkpoint, early]
    fit_args['callbacks'] = callbacks_list
    model.fit(train_text, train_y[:,i], **fit_args)
    model.model.load_weights(best_weights_path)
    return model
def predict_for_one_category(model_name, model):
    predictions = model.predict(test_text)
    joblib.dump(predictions, '{}.pkl'.format(model_name))

def predict_for_all(model):
    test_text, _ = pre.load_data('test.csv')
    predictions = model.predict(test_text)
    hlp.write_model(predictions)

def fit_model(model_name, *args, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [logger, checkpoint, early]
    fit_args['callbacks'] = callbacks_list
    model = train_DNN(model_name, *args, **kwargs)
    return model

def load_keras_model(model_name, **kwargs):
    from keras.models import model_from_json
    model_path = '../model_specs/{}.json'.format(model_name)
    with open(model_path, 'r') as fl:
        model = model_from_json(json.load(fl))
    return model
def load_full_model(name, **kwargs):
    best_weights_path = "{}_best.hdf5".format(model_name)
    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
    model = models.Embedding_Blanko_DNN(embedding, **kwargs)
    model.model.load_weights(best_weights_path)
    return model

def hacky_load_LSTM():
    model_name = '300_fasttext_LSTM_test'
    model = load_keras_model(model_name)
    model.load_weights('300_fasttext_LSTM_best.hdf5')
    return model

def transfer_weights_multi_to_one(weights, model, i):
    for weights_old, layer in zip(weights[2:-1], model.layers[2:-1]):
        layer.set_weights(weights_old)
    # now for the last layer
    model.layers[-1].set_weights([weights[-1][0][:,i][:,None], weights[-1][1][i][None]])

def fine_tune_model(model_name, old_model, **kwargs):
    '''Fits and returns a model for one label (provided as index i)'''
    weights = [layer.get_weights() for layer in old_model.layers]
    for i in xrange(6):
        new_name = model_name + '_{}'.format(i)
        predict_for_one_category(new_name,
                continue_training_DNN_one_output(new_name, i, weights, **kwargs))
if __name__=='__main__':
    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNLSTM)
    aux_task = train_y.sum(axis=1) > 0
    class_weights = hlp.get_class_weights(train_y)
    weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
    loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
    loss.__name__ = 'weighted_binary_crossentropy'
    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1)
    model_params = {
        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
        'embedding_dim' : 300, 'trainable' : False,
        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.9), 'loss':{'main_output':'binary_crossentropy'}, 'metrics':['accuracy']}}
        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
    model_name = '300_fasttext_cuda_GRU'
    frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
    frozen_tokenizer.fit(pd.concat([train_text, test_text]))
    model_name = '300_fasttext_cudnn_LSTM'
    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
    # aux_task = train_text.apply(feature_engineering.proportion_capitals)
    model = fit_model(model_name, {'main_input':train_text}, {'main_output':train_y,'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    # model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    # model = fit_model(model_name, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    model = continue_training_DNN(model_name, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
    hlp.write_model(model.predict(test_text))
    K.clear_session()
    # model_params['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, beta_2=0.99)
......
from __future__ import division
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
@@ -7,6 +8,7 @@ import json
import joblib
import preprocessing as pre
import pandas as pd
from collections import Counter
memory = joblib.Memory('/home/mboos/joblib')
@@ -21,6 +23,23 @@ def correct_predictions(predictions, factor=0.5):
    corrected = logit(predictions)-0.5
    return np.exp(corrected)/(np.exp(corrected)+1)
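# Worked example: for p = 0.5, logit(0.5) = 0, the shifted value is -0.5, and the corrected
# probability is exp(-0.5)/(exp(-0.5)+1) ≈ 0.38, i.e. predictions are pulled towards 0.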
def get_class_weights(y_mat, smooth_factor=0.):
    """
    Returns the weights for each class based on the frequencies of the samples
    :param y_mat: binary label matrix with one column per class
    :param smooth_factor: factor that smooths extremely uneven weights
    :return: array with the weight for each class
    """
    mat_counts = y_mat.sum(axis=0)
    if smooth_factor > 0:
        p = mat_counts.max() * smooth_factor
        mat_counts += p
    return mat_counts.max() / mat_counts
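# Worked example (sketch, not in the original code): with per-class positive counts
# [100, 10, 50, 5, 40, 8] and smooth_factor=0, the returned weights are max/counts =
# [1.0, 10.0, 2.0, 20.0, 2.5, 12.5], so rarer labels get proportionally larger weights.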
def make_weight_matrix(y_mat, weights):
    return np.tile(weights[None], (y_mat.shape[0], 1))
def write_model(predictions, correct=correct_predictions,
                cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    import pandas as pd
......
@@ -18,22 +18,33 @@ from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
import tensorflow as tf
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras import optimizers
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, BatchNormalization, MaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU, GRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from functools import partial
import keras.preprocessing.text
import enchant
import string
import json
import enchant
import copy
corr_dict1 = enchant.request_dict('en_US')
maketrans = string.maketrans
memory = joblib.Memory(cachedir='/home/mboos/joblib')
@memory.cache
def get_fixed_DNN_params():
    # NOTE: model_func is not defined in this scope; callers must override 'model_function'
    model_params = {
        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
        'embedding_dim' : 300, 'trainable' : False,
        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99)}}
    return model_params
def make_default_language_dict(train_X=None, train_labels=None):
    '''Returns a defaultdict that can be used in predict_for_language to predict a prior'''
@@ -57,7 +68,6 @@ def text_to_word_sequence(text,
    return [i for i in seq if i]
text.text_to_word_sequence = text_to_word_sequence
memory = joblib.Memory(cachedir='/home/mboos/joblib')
class NBMLR(BaseEstimator):
    def __init__(self, **kwargs):
@@ -220,9 +230,12 @@ def add_oov_vector_and_prune(embedding_matrix, tokenizer):
    embedding_matrix = np.vstack([embedding_matrix, np.zeros((1, embedding_matrix.shape[1]))])
    return prune_matrix_and_tokenizer(embedding_matrix, tokenizer)

def make_model_function(**kwargs):
    return partial(RNN_general, **kwargs)
class Embedding_Blanko_DNN(BaseEstimator):
    def __init__(self, embedding=None, max_features=20000, model_function=None, tokenizer=None,
            maxlen=200, embedding_dim=100, correct_spelling=False, trainable=False, preprocess_embedding=False,
            maxlen=200, embedding_dim=300, correct_spelling=False, trainable=False, preprocess_embedding=False,
            compilation_args={'optimizer':'adam','loss':'binary_crossentropy','metrics':['accuracy']}, embedding_args={'n_components' : 100}):
        self.compilation_args = compilation_args
        self.max_features = max_features
@@ -246,9 +259,12 @@ class Embedding_Blanko_DNN(BaseEstimator):
            self.embedding = hlp.get_glove_embedding('../glove.6B.100d.txt')
        if model_function:
            self.model_function = model_function
            if callable(model_function):
                self.model_function = model_function
            else:
                self.model_function = make_model_function(**model_function)
        else:
            self.model_function = LSTM_dropout_model
            self.model_function = RNN_general
        if self.tokenizer.is_trained:
            word_index = self.tokenizer.tokenizer.word_index
@@ -256,7 +272,7 @@
            embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
            embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen,
                    trainable=self.trainable, preprocess_embedding=self.preprocess_embedding, **self.embedding_args)
            sequence_input = Input(shape=(self.maxlen,), dtype='int32')
            sequence_input = Input(shape=(self.maxlen,), dtype='int32', name='main_input')
            embedded_sequences = embedding_layer(sequence_input)
            outputs, aux_input = self.model_function(embedded_sequences)
            if aux_input:
@@ -283,14 +299,17 @@ class Embedding_Blanko_DNN(BaseEstimator):
        self.model = Model(inputs=inputs, outputs=outputs)
        self.model.compile(**self.compilation_args)
        X_t = self.tokenizer.transform(X)
        self.model.fit(X_t, y, **kwargs)
        X['main_input'] = self.tokenizer.transform(X['main_input'])
        self.model.fit(X, y, **kwargs)
        return self

    def predict(self, X):
        X_t = self.tokenizer.transform(X)
        return self.model.predict(X_t)
def weighted_binary_crossentropy(y_true, y_pred, weights):
    return tf.keras.backend.mean(tf.multiply(tf.keras.backend.binary_crossentropy(y_true, y_pred), weights), axis=-1)
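# Usage sketch (mirrors the __main__ block of DNN.py in this commit): bind the per-class
# weight tensor with functools.partial and give the partial a __name__ so Keras can log it.
#     class_weights = hlp.get_class_weights(train_y)
#     weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
#     loss = partial(weighted_binary_crossentropy, weights=weight_tensor)
#     loss.__name__ = 'weighted_binary_crossentropy'
#     compilation_args = {'optimizer': 'adam', 'loss': {'main_output': loss}}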
def transfer_model(old_model_path, new_model):
    '''Transfers all the weights of the old model to the new one except the last layer'''
    weights = old_model.model.get_weights()
@@ -340,6 +359,24 @@ def LSTM_twice_dropout_model(x):
    x = Dense(6, activation="sigmoid")(x)
    return x
def RNN_aux_loss(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None, dropout=0.5, aux_dim=1):
    if rnn_func is None:
        rnn_func = LSTM
    if not isinstance(hidden_rnn, list):
        hidden_rnn = [hidden_rnn] * no_rnn_layers
    if len(hidden_rnn) != no_rnn_layers:
        raise ValueError('length of hidden_rnn needs to be equal to no_rnn_layers')
    for rnn_size in hidden_rnn:
        x = Dropout(dropout)(x)
        x = Bidirectional(rnn_func(rnn_size, return_sequences=True))(x)
    aux_dense = Dense(aux_dim, activation='sigmoid', name='aux_output')(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout)(x)
    x = Dense(hidden_dense, activation='relu')(x)
    x = Dropout(dropout)(x)
    x = Dense(6, activation="sigmoid", name='main_output')(x)
    return [x, aux_dense], None
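# Note (sketch, not in the original code): RNN_aux_loss returns two outputs (main_output and
# aux_output), so a model built from it is compiled with per-output losses, e.g.
#     compilation_args = {'optimizer': 'adam',
#                         'loss': {'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'},
#                         'loss_weights': [1., 0.1]}
# and fit with targets {'main_output': train_y, 'aux_output': aux_task}, as in the training script.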
def RNN_general(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None, dropout=0.5):
    if rnn_func is None:
        rnn_func = LSTM
@@ -348,13 +385,14 @@ def RNN_general(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=Non
    if len(hidden_rnn) != no_rnn_layers:
        raise ValueError('length of hidden_rnn needs to be equal to no_rnn_layers')
    for rnn_size in hidden_rnn:
        x = Bidirectional(rnn_func(rnn_size, return_sequences=True, dropout=dropout))(x)
        x = Dropout(dropout)(x)
        x = Bidirectional(rnn_func(rnn_size, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout)(x)
    x = Dense(hidden_dense, activation='relu')(x)
    x = Dropout(dropout)(x)
    x = Dense(6, activation="sigmoid", name='main_output')(x)
    return x
    return x, None
def LSTM_CUDA_dropout_model(x):
    # note: CuDNNLSTM does not support a dropout argument
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True, dropout=0.5))(x)
......
@@ -85,6 +85,7 @@ class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
    def fit(self, list_of_sentences, y=None, **kwargs):
        self.tokenizer.fit_on_texts(list(list_of_sentences))
        print('fit this thing')
        self.is_trained = True
        return self
......
@@ -14,14 +14,12 @@ import helpers as hlp
import models
import preprocessing as pre
import json
from keras import optimizers
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import feature_engineering
#TODO: implement hyper parameter search
#TODO: get vocabulary on full corpus
def DNN_model(X, y, fit_args={}, **kwargs):
    '''Builds and evaluates an embedding-based DNN on X, y'''
    model = models.Embedding_Blanko_DNN(**kwargs)
@@ -41,43 +39,40 @@ def RF_model(X, y, kwargs):
    return validator(model, X, y)

#TODO: more information??
def validator(estimator, X, y, cv=5, fit_args={}, **kwargs):
def validator(estimator, X, y, cv=3, fit_args={}, **kwargs):
    '''Validate mean log loss of model'''
    kfold = KFold(n_splits=cv, shuffle=True)
    scores = []
    Xs = np.zeros((len(X),1), dtype='int8')
    for train, test in kfold.split(Xs):
        # train_x = X[train] #[X[i] for i in train]
        # test_x = X[test]# for i in test]
        estimator.fit(X[train], y[train], **fit_args)
        predictions = estimator.predict(X[test])
        train_x = [X[i] for i in train]
        test_x = [X[i] for i in test]
        estimator.fit(train_x, y[train], **fit_args)
        predictions = estimator.predict(test_x)
        scores.append(hlp.mean_log_loss(y[test], predictions))
    score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
    return score_dict
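# Note: the returned dict follows the hyperopt objective protocol ('loss' plus 'status': STATUS_OK),
# so validator results can be consumed directly by fmin/Trials below.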
#for now for ALL languages
def validate_token_model(model_name, model_function, space, fixed_params_file='../parameters/fixed.json'):
    with open(fixed_params_file, 'r') as fl:
        fixed_params_dict = json.load(fl)
    train_text, train_y = pre.load_data((
#TODO: add other params
#TODO: model_func_param
def validate_token_model(model_name, model_function, space, maxlen=300, max_features=500000):
    train_text, train_y = pre.load_data()
    test_text, _ = pre.load_data('test.csv')
    frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=fixed_params_dict['maxlen'],
            max_features=fixed_params_dict['max_features'])
    frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=maxlen,
            max_features=max_features)
    frozen_tokenizer.fit(pd.concat([train_text, test_text]))
    fit_args = {'batch_size' : 256, 'epochs' : 20,
    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
    compilation_args = {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}
    callbacks_list = [EarlyStopping(monitor="val_loss", mode="min", patience=5)]
    fit_args = {'batch_size' : 256, 'epochs' : 15,
                'validation_split' : 0.1, 'callbacks' : callbacks_list}
    # freeze all constant parameters
    frozen_model_func = partial(model_function, train_text, train_y, fit_args=fit_args,
            tokenizer=frozen_tokenizer, **fixed_params_dict)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
            tokenizer=frozen_tokenizer, embedding=embedding, compilation_args=compilation_args)
    trials = Trials()
    best = fmin(model_function, space=space, algo=tpe.suggest, max_evals=10, trials=trials)
    best = fmin(model_function, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
    hlp.dump_trials(trials, fname=model_name)
    return best
@@ -95,10 +90,13 @@ def validate_feature_model(model_name, model_function, space, fixed_params_file=
    return best
if __name__=='__main__':
    feature_models_to_test = {
        'gbc' : (GBC_model, {'n_estimators' : 80+hp.randint('n_estimators', 100), 'max_depth' : 1 + hp.randint('max_depth', 6)}),
        'rf' : (RF_model, {'n_estimators' : 5 + hp.randint('n_estimators', 30)})
        }
    for model_name, (func, space) in feature_models_to_test.iteritems():
        best = validate_feature_model(model_name, func, space)
    DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [1, 2]),
        # 'rnn_func' : hp.choice('rnn_func', [models.CuDNNLSTM, models.CuDNNGRU]),
        'hidden_rnn' : hp.quniform('hidden_rnn', 32,128,16),
        'hidden_dense' : hp.quniform('hidden_dense', 16, 64, 8),
        'dropout' : hp.uniform('dropout', 0.3, 0.9)}}
    token_models_to_test = {
        'DNN' : (DNN_model, DNN_search_space)}
    for model_name, (func, space) in token_models_to_test.iteritems():
        best = validate_token_model(model_name, func, space)
        joblib.dump(best, 'best_{}.pkl'.format(model_name))
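# Note (assumption, not in the original code): hp.quniform returns floats, so values drawn for
# 'hidden_rnn'/'hidden_dense' may need casting to int before they are passed to the Keras layers.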