Commit 7df06b25 authored by mjboos

finetuning

parent 8da088d5
@@ -26,7 +26,7 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 256, 'epochs' : 20,
fit_args = {'batch_size' : 256, 'epochs' : 10,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
train_text, train_y = pre.load_data()
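For context, the checkpoint, early and lr callbacks referenced in callbacks_list are defined above this hunk and are not shown in the diff. A rough reconstruction of what they presumably look like, consistent with the ModelCheckpoint call added further down (the patience value and the schedule body are assumptions):

from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

def schedule(epoch):
    # assumed step decay; the real schedule() is not part of this diff
    return 0.001 * (0.5 ** (epoch // 5))

early = EarlyStopping(monitor='val_loss', patience=5, mode='min')
checkpoint = ModelCheckpoint('best.hdf5', monitor='val_loss', save_best_only=True, mode='min')
lr = LearningRateScheduler(schedule)

fit_args is later unpacked straight into the model's fit call (model.fit(train_text, train_y[:, i], **fit_args) below), so the change here simply caps each fine-tuning run at 10 epochs instead of 20.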
@@ -41,6 +41,22 @@ def train_DNN(model_name, embeddings_index, **kwargs):
model.model.load_weights(best_weights_path)
return model
def continue_training_DNN_one_output(model_name, i, weights, **kwargs):
best_weights_path="{}_{}_best.hdf5".format(model_name, i)
model = models.Embedding_Blanko_DNN(**kwargs)
transfer_weights_multi_to_one(weights, model.model, i)
logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [logger, checkpoint, early] #early
fit_args['callbacks'] = callbacks_list
model.fit(train_text, train_y[:,i], **fit_args)
model.model.load_weights(best_weights_path)
return model
def predict_for_one_category(model_name, model):
predictions = model.predict(test_text)
joblib.dump(predictions, '{}.pkl'.format(model_name))
def predict_for_all(model):
test_text, _ = pre.load_data('test.csv')
predictions = model.predict(test_text)
@@ -56,12 +72,11 @@ def fit_model(name, **kwargs):
model = train_DNN(model_name, embedding, **kwargs)
return model
def load_keras_model(name, **kwargs):
def load_keras_model(model_name, **kwargs):
from keras.models import model_from_json
best_weights_path="{}_best.hdf5".format(model_name)
model_path = '../model_specs/{}.json'
model = model_from_json(model_path)
model.load_weights(best_weights_path)
model_path = '../model_specs/{}.json'.format(model_name)
with open(model_path, 'r') as fl:
model = model_from_json(json.load(fl))
return model
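A note on the JSON round trip above: json.load(fl) only hands model_from_json a parsable architecture string if the spec file was written with json.dump(model.to_json(), ...). A minimal sketch of that assumed saving counterpart (save_keras_model_spec is a hypothetical helper, not part of this commit; the json import itself also sits outside the visible hunk):

import json

def save_keras_model_spec(model_name, model):
    # model.to_json() returns the architecture as a JSON string; dumping it with
    # json.dump stores one quoted JSON string, which is exactly what json.load()
    # in load_keras_model gives back to model_from_json
    with open('../model_specs/{}.json'.format(model_name), 'w') as fl:
        json.dump(model.to_json(), fl)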
def load_full_model(name, **kwargs):
@@ -71,16 +86,39 @@ def load_full_model(name, **kwargs):
model.model.load_weights(best_weights_path)
return model
def hacky_load_LSTM():
model_name = '300_fasttext_LSTM_test'
model = load_keras_model(model_name)
model.load_weights('300_fasttext_LSTM_best.hdf5')
return model
def transfer_weights_multi_to_one(weights, model, i):
for weights_old, layer in zip(weights[2:-1], model.layers[2:-1]):
layer.set_weights(weights_old)
# last layer: keep only column i of the kernel and the matching bias entry (multi-label head -> single output)
model.layers[-1].set_weights([weights[-1][0][:,i][:,None], weights[-1][1][i][None]])
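The last set_weights call is what turns the shared six-label head into a single-output head: it keeps only column i of the final Dense kernel and the matching bias entry. A small shape sketch with toy numbers (hidden_dim and i here are illustrative, not taken from the commit):

import numpy as np

hidden_dim, i = 128, 2
kernel = np.zeros((hidden_dim, 6))   # weights[-1][0]: kernel of the Dense head for 6 labels
bias = np.zeros(6)                   # weights[-1][1]: its bias

one_kernel = kernel[:, i][:, None]   # shape (hidden_dim, 1)
one_bias = bias[i:i + 1]             # shape (1,)
# [one_kernel, one_bias] has exactly the shapes a Dense(1) output layer expects in set_weights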
def fine_tune_model(model_name, old_model, **kwargs):
'''Fine-tunes one single-output model per label (i = 0..5) and saves its test-set predictions'''
weights = [layer.get_weights() for layer in old_model.layers]
for i in xrange(6):
new_name = model_name + '_{}'.format(i)
predict_for_one_category(new_name,
continue_training_DNN_one_output(new_name, i, weights, **kwargs))
if __name__=='__main__':
model_params = {
'max_features' : 500000, 'model_function' : models.LSTM_dropout_model, 'maxlen' : 200,
'max_features' : 500000, 'model_function' : models.LSTM_one_class, 'maxlen' : 200,
'embedding_dim' : 300,
'compilation_args' : {'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}}
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
model_name = '300_fasttext_LSTM'
model_name = '300_fasttext_finetune_LSTM'
model_old = hacky_load_LSTM()
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
fine_tune_model(model_name, model_old, embeddings_index=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = load_keras_model(model_name, tokenizer=frozen_tokenizer, **model_params)
# model = fit_model(model_name, tokenizer=frozen_tokenizer, **mode_params )
# model = fit_model(model_name, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
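predict_for_one_category writes one pickle per label ('300_fasttext_finetune_LSTM_0.pkl' through ..._5.pkl with the names used above), so the six per-label predictions presumably still need to be stitched back together before a submission can be written. A minimal sketch of that step, assuming each pickle holds an (n_samples, 1) prediction array (collect_fine_tuned_predictions is a hypothetical helper, not part of this commit):

import numpy as np
import joblib

def collect_fine_tuned_predictions(model_name, n_labels=6):
    # load the per-label pickles written by predict_for_one_category and
    # stack them into a single (n_samples, n_labels) matrix
    parts = [joblib.load('{}_{}.pkl'.format(model_name, i)) for i in range(n_labels)]
    return np.hstack([np.asarray(p).reshape(-1, 1) for p in parts])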
@@ -9,6 +9,7 @@ from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
import helpers as hlp
import preprocessing as pre
@@ -199,9 +200,14 @@ def make_embedding_matrix(embeddings_index, word_index, max_features=20000, maxl
return embedding_matrix
def make_embedding_layer(embedding_matrix, maxlen=200, trainable=False):
def make_embedding_layer(embedding_matrix, maxlen=200, trainable=False, preprocess_embedding=False, **kwargs):
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
if preprocess_embedding:
pca = PCA(svd_solver='randomized', whiten=True, **kwargs)
embedding2 = pca.fit_transform(embedding_matrix[1:-1])
embedding_matrix[1:-1, :embedding2.shape[1]] = embedding2
embedding_matrix = embedding_matrix[:,: embedding2.shape[1]]
embedding_layer = Embedding(embedding_matrix.shape[0],
embedding_matrix.shape[1],
weights=[embedding_matrix],
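The new preprocess_embedding branch whitens the embedding vectors with a randomized PCA and truncates the matrix to the reduced dimensionality before it is handed to the Embedding layer. In isolation the transformation looks roughly like this (toy sizes; n_components defaults to 100 via embedding_args, and the first and last rows, presumably the padding and OOV vectors, are excluded from the fit exactly as above):

import numpy as np
from sklearn.decomposition import PCA

vocab_size, embedding_dim, n_components = 5000, 300, 100
embedding_matrix = np.random.randn(vocab_size, embedding_dim)

pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True)
reduced = pca.fit_transform(embedding_matrix[1:-1])     # (vocab_size - 2, n_components)
embedding_matrix[1:-1, :n_components] = reduced
embedding_matrix = embedding_matrix[:, :n_components]   # final shape: (vocab_size, n_components)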
@@ -215,15 +221,17 @@ def add_oov_vector_and_prune(embedding_matrix, tokenizer):
class Embedding_Blanko_DNN(BaseEstimator):
def __init__(self, embeddings_index=None, max_features=20000, model_function=None, tokenizer=None,
maxlen=200, embedding_dim=100, correct_spelling=False, trainable=False,
compilation_args={'optimizer':'adam','loss':'binary_crossentropy','metrics':['accuracy']}):
maxlen=200, embedding_dim=100, correct_spelling=False, trainable=False, preprocess_embedding=False,
compilation_args={'optimizer':'adam','loss':'binary_crossentropy','metrics':['accuracy']}, embedding_args={'n_components' : 100}):
self.compilation_args = compilation_args
self.max_features = max_features
self.trainable = trainable
self.maxlen = maxlen
self.embedding_dim = embedding_dim
self.correct_spelling = correct_spelling
# optional PCA compression/whitening of the embedding matrix (see make_embedding_layer)
self.preprocess_embedding = preprocess_embedding
self.embedding_args = embedding_args
if tokenizer:
self.tokenizer = copy.deepcopy(tokenizer)
if tokenizer.is_trained:
@@ -245,7 +253,8 @@ class Embedding_Blanko_DNN(BaseEstimator):
word_index = self.tokenizer.tokenizer.word_index
embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen,
trainable=self.trainable, preprocess_embedding=self.preprocess_embedding, **self.embedding_args)
sequence_input = Input(shape=(self.maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = self.model_function(embedded_sequences)
@@ -258,7 +267,7 @@ class Embedding_Blanko_DNN(BaseEstimator):
word_index = self.tokenizer.tokenizer.word_index
embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable, preprocess_embedding=self.preprocess_embedding, **self.embedding_args)
sequence_input = Input(shape=(self.maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = self.model_function(embedded_sequences)
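model_function is any callable that maps the embedded sequence tensor to an output tensor; the commit switches it from models.LSTM_dropout_model to models.LSTM_one_class so that the graph ends in a single sigmoid unit. The real LSTM_one_class is not part of this diff; a purely hypothetical sketch of the shape such a function would take:

from keras.layers import LSTM, GlobalMaxPooling1D, Dropout, Dense

def LSTM_one_class(embedded_sequences):
    # hypothetical stand-in for models.LSTM_one_class: one label, one sigmoid output
    x = LSTM(64, return_sequences=True)(embedded_sequences)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(0.5)(x)
    x = Dense(50, activation='relu')(x)
    return Dense(1, activation='sigmoid')(x)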