Commit 8da088d5 authored by mjboos

a lot of new stuff

parent 2aa4c002
@@ -26,47 +26,61 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 128, 'epochs' : 20,
fit_args = {'batch_size' : 256, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
# for now use only english as model
train_per_language = pre.load_data()
train_text, train_labels = train_per_language['en']
test_per_language = pre.load_data('test.csv')
test_text, _ = test_per_language['en']
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
train_y = train_labels.values
def train_DNN(embeddings_index, **kwargs):
def train_DNN(model_name, embeddings_index, **kwargs):
best_weights_path="{}_best.hdf5".format(model_name)
model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
with open('../model_specs/{}.json'.format(model_name), 'w') as fl:
json.dump(model.model.to_json(), fl)
model.fit(train_text, train_y, **fit_args)
model.model.load_weights(best_weights_path)
return model
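# Illustrative call (a sketch mirroring the parameters used in __main__ below; assumes `embedding`
# was loaded with hlp.get_fasttext_embedding and `frozen_tokenizer` was fitted on train+test text):
# model = train_DNN('300_fasttext_LSTM', embedding, trainable=False, maxlen=200,
#                   max_features=500000, embedding_dim=300, tokenizer=frozen_tokenizer,
#                   model_function=models.LSTM_dropout_model,
#                   compilation_args={'optimizer' : 'adam', 'loss' : 'binary_crossentropy', 'metrics' : ['accuracy']})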
def DNN_EN_to_language_dict(model_english, train_per_language, simple_for=['fr', 'de', 'es', 'it']):
language_dict = models.make_default_language_dict()
language_dict['en'] = model_english
if simple_for:
for simple_lan in simple_for:
language_dict[simple_lan] = models.tfidf_model().fit(*train_per_language[simple_lan])
hlp.write_model(hlp.predictions_for_language(language_dict))
if __name__=='__main__':
maxlen = 200
max_features = 500000
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding_dim = 300
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
def predict_for_all(model):
test_text, _ = pre.load_data('test.csv')
predictions = model.predict(test_text)
hlp.write_model(predictions)
def fit_model(name, **kwargs):
best_weights_path="{}_best.hdf5".format(model_name)
logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
logger = CSVLogger('../logs/300_fasttext_LSTM.csv', separator=',', append=False)
callbacks_list = [logger, checkpoint, early] #early
fit_args['callbacks'] = callbacks_list
DNN_EN_to_language_dict(
train_DNN(embedding, trainable=False, maxlen=maxlen,
max_features=max_features, model_function=models.LSTM_dropout_model,
embedding_dim=embedding_dim, tokenizer=frozen_tokenizer,
compilation_args={'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
model = train_DNN(name, embedding, **kwargs)
return model
def load_keras_model(name, **kwargs):
from keras.models import model_from_json
best_weights_path="{}_best.hdf5".format(model_name)
model_path = '../model_specs/{}.json'
model = model_from_json(model_path)
model.load_weights(best_weights_path)
return model
def load_full_model(name, **kwargs):
best_weights_path="{}_best.hdf5".format(model_name)
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
model = models.Embedding_Blanko_DNN(embedding, **kwargs)
model.model.load_weights(best_weights_path)
return model
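# Note: load_keras_model restores the architecture from the dumped JSON spec and then loads the
# best checkpoint weights, while load_full_model rebuilds the graph via Embedding_Blanko_DNN
# (and therefore needs the embedding index) before loading the same weights.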
if __name__=='__main__':
model_params = {
'max_features' : 500000, 'model_function' : models.LSTM_dropout_model, 'maxlen' : 200,
'embedding_dim' : 300,
'compilation_args' : {'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}}
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
model_name = '300_fasttext_LSTM'
# model = load_keras_model(model_name, tokenizer=frozen_tokenizer, **model_params)
# model = fit_model(model_name, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import pandas as pd, numpy as np
import helpers as hlp
from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import re, string
from sklearn.base import BaseEstimator, TransformerMixin
import string
import langid
eng_stopwords = set(stopwords.words("english"))
memory = joblib.Memory(cachedir='/home/mboos/joblib')
def count_symbol(row, symbol='!'):
return row.count(symbol)
def count_capitals(row):
return np.sum([c.isupper() for c in row])
def proportion_capitals(row):
return count_capitals(row)/np.float(len(row))
def num_unique_words(row):
return np.float(len(set(w for w in row.split(' '))))
def proportion_unique_words(row):
return num_unique_words(row) / np.float(len(row.split(' ')))
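# langid.classify returns a (language_code, score) tuple; only the ISO code is kept below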
def language_identity(row):
return langid.classify(row)[0]
feature_mapping_dict = {
'count_symbol' : count_symbol,
'count_capitals' : count_capitals,
'proportion_capitals' : proportion_capitals,
'num_unique_words' : num_unique_words,
'proportion_unique_words' : proportion_unique_words,
'language' : language_identity}
@memory.cache
def compute_features(text_df, which_features=None):
if which_features:
feature_funcs = [feature_mapping_dict[feature_name] for feature_name in which_features]
else:
feature_funcs = feature_mapping_dict.values()
feature_data = np.zeros((text_df.shape[0],len(feature_funcs)))
for i, ft_func in enumerate(feature_funcs):
features = text_df.apply(ft_func)
if features.dtype == 'object':
features = LabelEncoder().fit_transform(features)
feature_data[:,i] = features
return feature_data
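# Illustrative usage (a sketch): given a pandas Series of comment strings,
# X_ft = compute_features(comments, which_features=['count_capitals', 'language'])
# returns an array of shape (len(comments), 2); string-valued features such as 'language'
# are label-encoded before being stored.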
@@ -3,20 +3,22 @@ from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.base import BaseEstimator
from itertools import izip
import json
import joblib
import preprocessing as pre
import pandas as pd
memory = joblib.Memory('/home/mboos/joblib')
#TODO: is this really the metric??
def mean_log_loss(estimator, X, y):
def mean_log_loss(y_test, y_pred):
'''Returns the mean log loss'''
probas = [proba[:,1] for proba in estimator.predict_proba(X)]
column_loss = [log_loss(y_col, y_pred_col) for y_col, y_pred_col
in izip(y.T, probas)]
# probas = [proba[:,1] for proba in estimator.predict_proba(X)]
column_loss = [log_loss(y_test[:,i], y_pred[:,i]) for i in xrange(y_pred.shape[1])]
return np.mean(column_loss)
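# e.g. mean_log_loss(np.array([[1, 0], [0, 1]]), np.array([[0.9, 0.2], [0.1, 0.8]]))
# averages the per-column (per-label) log losses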
def correct_predictions(predictions, factor=0.5):
corrected = logit(predicions)-0.5
corrected = logit(predictions)-0.5
return np.exp(corrected)/(np.exp(corrected)+1)
def write_model(predictions, correct=correct_predictions,
@@ -64,10 +66,13 @@ def predictions_for_language(language_dict, test_data=None):
'''Expects a language_dict, where the keys correspond to languages and the values to models that implement fit'''
if test_data is None:
test_data = pre.load_data(name='test.csv')
languages_test = pd.read_csv('language_test.csv')
languages_test = pd.read_csv('language_test.csv', header=None, squeeze=True)
predictions = np.zeros((languages_test.shape[0], 6))
# iterate through languages
for language, (language_data, _) in test_data.items():
predictions[languages_test==language] = language_dict[language].predict(language_data)
predictions[languages_test==language, :] = language_dict[language].predict_proba(language_data)
return predictions
def dump_trials(trials, fname=''):
import time
joblib.dump(trials, '../validation_logs/trial_{}_{}.json'.format(fname, time.strftime("%m%d-%H%M")))
@@ -7,7 +7,7 @@ import joblib
import pandas as pd, numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingClassifier
import helpers as hlp
@@ -38,9 +38,9 @@ def make_default_language_dict(train_X=None, train_labels=None):
from collections import defaultdict
from sklearn.dummy import DummyClassifier
if not train_X or not train_labels:
_, train_labels = pre.load_data(language=False)
train_X = np.zeros_like(train_labels)[:,None]
return defaultdict(DummyClassifier().fit(train_X, train_labels))
train_X, train_labels = pre.load_data()
dummy_pipe = pipe.Pipeline(steps=[('pre',HashingVectorizer()),('model', MultiOutputClassifier(DummyClassifier()))])
return defaultdict(lambda:dummy_pipe.fit(train_X, train_labels))
def text_to_word_sequence(text,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
@@ -57,10 +57,9 @@ def text_to_word_sequence(text,
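# re-binding the customized function below monkey-patches keras.preprocessing.text, so that
# keras Tokenizer instances pick up this text_to_word_sequence instead of the built-in one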
text.text_to_word_sequence = text_to_word_sequence
memory = joblib.Memory(cachedir='/home/mboos/joblib')
class NBMLR(BaseEstimator):
def __init__(self, C=4, dual=True, **kwargs):
self.lr = LogisticRegression(C=C, dual=dual, **kwargs)
def __init__(self, **kwargs):
self.lr = LogisticRegression(**kwargs)
self.r = None
def __prior(self, y_i, y, X):
@@ -85,12 +84,12 @@ class NBMLR(BaseEstimator):
def tfidf_model(pre_args={'ngram_range' : (1,2), 'tokenizer' : None,
'min_df' : 3, 'max_df' : 0.9, 'strip_accents' : 'unicode',
'use_idf' : 1, 'smooth_idf' : 1, 'sublinear_tf' : 1},
estimator_args={}, model_func=None):
model_func=None, **kwargs):
'''Returns unfitted tfidf_NBSVM pipeline object'''
if model_func is None:
model_func = NBMLR
return pipe.Pipeline(steps=[('tfidf', TfidfVectorizer(**pre_args)),
('model', model_func(**estimator_args))])
('model', MultiOutputClassifier(model_func(**kwargs)))])
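# Illustrative usage (a sketch; assumes text Series / label arrays as returned by pre.load_data):
# clf = tfidf_model()
# clf.fit(train_text, train_labels)
# predictions = clf.predict(test_text)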
def keras_token_model(model_function=None, max_features=20000, maxlen=100, embed_size=128):
if model_function is None:
@@ -171,7 +170,7 @@ def which_words_are_zero_vectors(embedding_matrix, word_index, oov_token):
return word_list
#TODO: more flexible spelling correction
def make_embedding_matrix(embeddings_index, word_index, max_features=20000, maxlen=200, embedding_dim=50, correct_spelling=None, diagnostics=False):
def make_embedding_matrix(embeddings_index, word_index, max_features=20000, maxlen=200, embedding_dim=50, correct_spelling=None, diagnostics=False, **kwargs):
num_words = min(max_features, len(word_index))
# add one element for zero vector
embedding_matrix = np.zeros((num_words+1, embedding_dim))
@@ -227,6 +226,8 @@ class Embedding_Blanko_DNN(BaseEstimator):
if tokenizer:
self.tokenizer = copy.deepcopy(tokenizer)
if tokenizer.is_trained:
self.tokenizer.is_trained = True
else:
self.tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
@@ -240,19 +241,31 @@ class Embedding_Blanko_DNN(BaseEstimator):
else:
self.model_function = LSTM_dropout_model
if self.tokenizer.is_trained:
word_index = self.tokenizer.tokenizer.word_index
embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable)
sequence_input = Input(shape=(self.maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = self.model_function(embedded_sequences)
self.model = Model(inputs=sequence_input, outputs=x)
self.model.compile(**self.compilation_args)
def fit(self, X, y, **kwargs):
if not self.tokenizer.is_trained:
self.tokenizer.fit(X)
word_index = self.tokenizer.tokenizer.word_index
embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable)
sequence_input = Input(shape=(self.maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = self.model_function(embedded_sequences)
self.model = Model(inputs=sequence_input, outputs=x)
self.model.compile(**self.compilation_args)
X_t = self.tokenizer.transform(X)
word_index = self.tokenizer.tokenizer.word_index
embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable)
sequence_input = Input(shape=(self.maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = self.model_function(embedded_sequences)
self.model = Model(inputs=sequence_input, outputs=x)
self.model.compile(**self.compilation_args)
self.model.fit(X_t, y, **kwargs)
return self
@@ -260,6 +273,11 @@ class Embedding_Blanko_DNN(BaseEstimator):
X_t = self.tokenizer.transform(X)
return self.model.predict(X_t)
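# Illustrative end-to-end usage (a sketch mirroring the training script):
# clf = Embedding_Blanko_DNN(embeddings_index=embedding, tokenizer=frozen_tokenizer,
#                            maxlen=200, max_features=500000, embedding_dim=300,
#                            model_function=LSTM_dropout_model,
#                            compilation_args={'optimizer' : 'adam', 'loss' : 'binary_crossentropy', 'metrics' : ['accuracy']})
# clf.fit(train_text, train_y, batch_size=256, epochs=20, validation_split=0.2)
# probas = clf.predict(test_text)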
def transfer_model(old_model_path, new_model):
'''Transfers all the weights of the old model to the new one except the last layer'''
#TODO: not implemented yet -- the old model still has to be loaded from old_model_path
# weights = old_model.model.get_weights()
pass
def CNN_batchnorm_model(x):
x = Conv1D(32, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
@@ -284,6 +302,16 @@ def CNN_model(x):
x = Dense(6, activation="sigmoid")(x)
return x
def LSTM_larger_dense_dropout_model(x):
x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.5))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(40, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(6, activation="sigmoid")(x)
return x
def LSTM_twice_dropout_model(x):
x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.5))(x)
x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.5))(x)
@@ -303,3 +331,15 @@ def LSTM_dropout_model(x):
x = Dense(6, activation="sigmoid")(x)
return x
def LSTM_one_class(x, model_func=None):
if model_func is None:
model_func = LSTM_dropout_model
# not implemented for now
x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.5))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(32, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(1, activation="sigmoid")(x)
return x
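# The *_model functions above are Keras functional-API builders: each takes the embedded
# sequence tensor produced by the embedding layer and returns the output tensor that
# Embedding_Blanko_DNN wraps into a Model.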
@@ -41,23 +41,23 @@ def clean_comment(text):
def data_preprocessing(df):
COMMENT = 'comment_text'
df[COMMENT].fillna('_UNK_', inplace=True)
df[COMMENT] = df[COMMENT].apply(clean_comment)
# df[COMMENT] = df[COMMENT].apply(clean_comment)
return df
def load_data(name='train.csv', preprocess=True, language=True):
def load_data(name='train.csv', preprocess=True):
data = pd.read_csv('../input/{}'.format(name), encoding='utf-8')
if preprocess:
data = data_preprocessing(data)
if language:
languages = pd.read_csv('language_{}'.format(name), header=None).squeeze()
grouped_data = data.groupby(by=lambda x : languages[x])
data_dict = { language : [data['comment_text'], data.iloc[:, 2:].values]
for language, data in grouped_data }
else:
text = data['comment_text']
labels = data.iloc[:, 2:].values
data_dict = {'babel' : [text, labels]}
return data_dict
# if language:
# languages = pd.read_csv('language_{}'.format(name), header=None).squeeze()
# grouped_data = data.groupby(by=lambda x : languages[x])
# data_dict = { language : [data['comment_text'], data.iloc[:, 2:].values]
# for language, data in grouped_data }
# else:
text = data['comment_text']
labels = data.iloc[:, 2:].values
# data_dict = {'babel' : [text, labels]}
return text, labels
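# e.g. train_text, train_labels = load_data()      # Series of comments and the (N, 6) label array
#      test_text, _ = load_data('test.csv')        # label columns are absent/ignored for the test set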
def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
from sklearn.preprocessing import FunctionTransformer
@@ -66,7 +66,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
def __init__(self, max_features=20000, maxlen=200,
filters='!\'"#$%&()*+,-./:;<=>?@[\\]^_`{|}~1234567890\t\n', **kwargs):
filters='!\'"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', **kwargs):
self.max_features = max_features
self.maxlen = maxlen
self.is_trained = False
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
import helpers as hlp
import models
import preprocessing as pre
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import json
import copy
memory = joblib.Memory(cachedir='/home/mboos/joblib')
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
def schedule(ind):
a = [0.002,0.002,0.002,0.001,0.001]
return a[ind]
lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 128, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
# for now use only english as model
train_per_language = pre.load_data(language=False)
train_text, train_y = train_per_language['babel']
test_per_language = pre.load_data('test.csv', language=False)
test_text, _ = test_per_language['babel']
def test_pruning(sentence, tokenizer_full, tokenizer_pruned, embedding_matrix_full, embedding_matrix_pruned):
'''Tests that tokens in a tokenized sentence map to the same words in both tokenizers AND
that corresponding tokens map to the same word vector'''
index_to_word_full = { value : key for key, value in tokenizer_full.tokenizer.word_index.iteritems()}
index_to_word_pruned = { value : key for key, value in tokenizer_pruned.tokenizer.word_index.iteritems()}
tokenized_full = tokenizer_full.transform([sentence]).squeeze()
tokenized_pruned = tokenizer_pruned.transform([sentence]).squeeze()
assert len(tokenized_full) == len(tokenized_pruned)
for token_full, token_pruned in zip(tokenized_full, tokenized_pruned):
if token_pruned == 0 or index_to_word_pruned[token_pruned] == tokenizer_pruned.tokenizer.oov_token:
continue
else:
assert index_to_word_full[token_full] == index_to_word_pruned[token_pruned]
assert np.allclose(embedding_matrix_full[token_full], embedding_matrix_pruned[token_pruned])
if __name__=='__main__':
maxlen = 200
max_features = 500000
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
frozen_tokenizer.fit(train_text)
tokenizer2 = copy.deepcopy(frozen_tokenizer)
train_tokenized = frozen_tokenizer.transform(train_text)
embedding_dim = 300
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
embedding_matrix = models.make_embedding_matrix(embedding, frozen_tokenizer.tokenizer.word_index, max_features=max_features, maxlen=maxlen, embedding_dim=embedding_dim)
embedding_matrix2, tokenizer2.tokenizer = models.add_oov_vector_and_prune(embedding_matrix, frozen_tokenizer.tokenizer)
# test 500 random sentences each for train and test
rand_train = np.random.randint(0, train_text.shape[0], 500)
rand_test = np.random.randint(0, test_text.shape[0], 500)
for rand_i in rand_train:
test_pruning(train_text[rand_i], frozen_tokenizer, tokenizer2, embedding_matrix, embedding_matrix2)
for rand_i in rand_test:
test_pruning(test_text[rand_i], frozen_tokenizer, tokenizer2, embedding_matrix, embedding_matrix2)
@@ -7,6 +7,7 @@ from functools import partial
import joblib
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score, KFold
import helpers as hlp
@@ -16,6 +17,7 @@ import json
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import feature_engineering
#TODO: implement hyper parameter search
#TODO: get vocabulary on full corpus
@@ -30,8 +32,13 @@ def do_hyper_search(space, model_function, **kwargs):
trials = Trials()
best = fmin(model_function, space=space, trials=trials, **kwargs)
def GBRT_model(X, y, **kwargs):
def GBC_model(X, y, kwargs):
gbc = MultiOutputClassifier(GradientBoostingClassifier(**kwargs))
return validator(gbc, X, y)
def RF_model(X, y, kwargs):
model = MultiOutputClassifier(RandomForestClassifier(**kwargs))
return validator(model, X, y)
#TODO: more information??
def validator(estimator, X, y, cv=5, fit_args={}, **kwargs):
@@ -48,24 +55,50 @@ def validator(estimator, X, y, cv=5, fit_args={}, **kwargs):
score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
return score_dict
fixed_params_file = '../parameters/fixed.json'
#for now for ALL languages
def validate_token_model(model_name, model_function, space, fixed_params_file='../parameters/fixed.json'):
with open(fixed_params_file, 'r') as fl:
fixed_params_dict = json.load(fl)
with open(fixed_params_file, 'r') as fl:
fixed_params_dict = json.load(fl)
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
train_text, train_labels = pre.load_data()
test_text, _ = pre.load_data('test.csv')
train_y = train_labels.values
frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=fixed_params_dict['maxlen'],
max_features=fixed_params_dict['max_features'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=fixed_params_dict['maxlen'],
max_features=fixed_params_dict['max_features'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
fit_args = {'batch_size' : 256, 'epochs' : 20,
'validation_split' : 0.1, 'callbacks' : callbacks_list}
frozen_model_func = partial(DNN_model, train_text, train_y,
tokenizer=frozen_tokenizer, **fixed_params_dict)
# freeze all constant parameters
frozen_model_func = partial(model_function, train_text, train_y, fit_args=fit_args,
tokenizer=frozen_tokenizer, **fixed_params_dict)
fit_args = {'batch_size' : 256, 'epochs' : 20,
'validation_split' : 0.1, 'callbacks' : callbacks_list}
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
trials = Trials()
best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=10, trials=trials)
hlp.dump_trials(trials, fname=model_name)
return best
#TODO: better feature selection
def validate_feature_model(model_name, model_function, space, fixed_params_file='../parameters/fixed_features.json', max_evals=10):
with open(fixed_params_file, 'r') as fl:
fixed_params_dict = json.load(fl)
which_features = fixed_params_dict.pop('features')
train_text, train_y = pre.load_data()
train_ft = feature_engineering.compute_features(train_text, which_features=which_features)
frozen_model_func = partial(model_function, train_ft, train_y, **fixed_params_dict)
trials = Trials()
best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
hlp.dump_trials(trials, fname=model_name)
return best
if __name__=='__main__':
feature_models_to_test = {
'gbc' : (GBC_model, {'n_estimators' : 80+hp.randint('n_estimators', 100), 'max_depth' : 1 + hp.randint('max_depth', 6)}),
'rf' : (RF_model, {'n_estimators' : 5 + hp.randint('n_estimators', 30)})
}
for model_name, (func, space) in feature_models_to_test.iteritems():
best = validate_feature_model(model_name, func, space)
joblib.dump(best, 'best_{}.pkl'.format(model_name))