Commit a2ca524c authored by MorBoos
parents 60a35946 ba182e10
*.csv
*.hdf5
*~
*.log
*.txt
*.out
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
import helpers as hlp
import models
import preprocessing as pre
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import json
memory = joblib.Memory(cachedir='/home/mboos/joblib')
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
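# learning-rate schedule: 0.002 for the first three epochs, then 0.001; note the lr callback below is defined but not added to callbacks_list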
def schedule(ind):
a = [0.002,0.002,0.002,0.001,0.001]
return a[ind]
lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 256, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
# for now, only use the English-language model
train_per_language = pre.load_data()
train_text, train_y = train_per_language['en']
test_per_language = pre.load_data('test.csv')
test_text, _ = test_per_language['en']
#FOR NOW!!
#train_text, train_y = pre.load_data(language=False)['babel']
#test_text, _ = pre.load_data('test.csv', language=False)['babel']
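# train an Embedding_Blanko_DNN model and reload the best weights saved by the checkpoint callback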
def train_DNN(embeddings_index, **kwargs):
model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
model.fit(train_text, train_y, **fit_args)
model.model.load_weights(best_weights_path)
return model
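# run a single short fit (with 90% held out as validation), presumably just to build the model graph, then load previously saved weights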
def load_DNN_weights(embeddings_index, weights_path='weights_base.best.hdf5',**kwargs):
model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
fit_args_tmp = {'batch_size' : 128, 'epochs' : 1,
'validation_split' : 0.9}
model.fit(train_text, train_y, **fit_args_tmp)
model.model.load_weights(weights_path)
return model
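# combine the English DNN with simple tf-idf fallback models for the languages in simple_for and write predictions for all languages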
def DNN_EN_to_language_dict(model_english, train_per_language, simple_for=None):
language_dict = models.make_default_language_dict()
language_dict['en'] = model_english
if simple_for:
for simple_lan in simple_for:
language_dict[simple_lan] = models.tfidf_model().fit(*train_per_language[simple_lan])
hlp.write_model(hlp.predictions_for_language(language_dict))
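# predict on the full test set (all languages pooled as 'babel') and write a submission file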
def predict_for_all(model):
test_text, _ = pre.load_data('test.csv', language=False)['babel']
predictions = model.predict(test_text)
hlp.write_model(predictions)
if __name__=='__main__':
maxlen = 200
max_features = 500000
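# fit one tokenizer on the concatenated train and test text so word indices are consistent everywhere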
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
model_name = '300_fasttext_LSTM'
logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
callbacks_list = [logger, checkpoint, early] #early
fit_args['callbacks'] = callbacks_list
embedding_dim = 300
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
model = train_DNN(embedding, maxlen=maxlen,
max_features=max_features, model_function=models.LSTM_dropout_model,
embedding_dim=embedding_dim, tokenizer=frozen_tokenizer,
compilation_args={'optimizer' : 'nadam', 'loss':'binary_crossentropy','metrics':['accuracy']})
# joblib.pickle(model, '../models/{}.pkl'.format(model_name))
predict_for_all(model)
# DNN_EN_to_language_dict(model, train_per_language)
#
# checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# logger = CSVLogger('../logs/300_fasttext_LSTM.csv', separator=',', append=False)
# callbacks_list = [logger, checkpoint, early] #early
# fit_args['callbacks'] = callbacks_list
# DNN_EN_to_language_dict(
# train_DNN(embedding, trainable=False, maxlen=maxlen,
# max_features=max_features, model_function=models.LSTM_dropout_model,
# embedding_dim=embedding_dim, tokenizer=frozen_tokenizer,
# compilation_args={'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}))
#
......@@ -35,68 +35,40 @@ lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 80, 'epochs' : 30,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
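# the *_net functions below only assemble hyperparameter/configuration dicts; the actual architectures are defined in models.py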
def aux_net():
model_func = partial(models.RNN_aux_loss, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
'embedding_dim' : 300, 'trainable' : False,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 0.1]}}
return model_params
def simple_one_output_net():
model_func = partial(models.RNN_general_one_class, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : False,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def simple_net():
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 300, 'trainable' : False,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
def add_net():
model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, hidden_dense=48)
model_params = {
'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
'embedding_dim' : 400, 'trainable' : False,
'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
return model_params
if __name__=='__main__':
aux_task = train_y.sum(axis=1) > 0
# for toxic skip
aux_task = train_y[:,0]
# train_y = np.delete(train_y, 0, axis=1)
train_data_augmentation = pre.pad_and_extract_capitals(train_text)[..., None]
test_data_augmentation = pre.pad_and_extract_capitals(test_text)[..., None]
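# per-class weights, converted to a tensor for the (optional) weighted binary cross-entropy loss defined below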
class_weights = hlp.get_class_weights(train_y)
weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
loss.__name__ = 'weighted_binary_crossentropy'
model_params = simple_attention(trainable=False)
model_name = '300_fasttext_attention_smaller_voc_GRU'
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# SHUFFLE TRAINING SET so validation split is different every time
row_idx = np.arange(0, train_text.shape[0])
np.random.shuffle(row_idx)
train_text, train_y, aux_task = train_text[row_idx], train_y[row_idx], aux_task[row_idx]
keras_model = load_keras_model(model_name)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# row_idx = np.arange(0, train_text.shape[0])
# np.random.shuffle(row_idx)
# train_text, train_y, aux_task, train_data_augmentation = train_text[row_idx], train_y[row_idx], aux_task[row_idx], train_data_augmentation[row_idx]
# model = load_keras_model(model_name)
# model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
fine_tune_model(model_name, keras_model, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
conc_finetuned_preds(model_name)
model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output': train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
hlp.write_model(model.predict({'main_input':test_text,'aug_input':test_data_augmentation}))
hlp.make_training_set_preds(model, {'main_input':train_text, 'aug_input':train_data_augmentation}, train_y)
# model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y, 'aux_output':aux_task}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
# K.clear_session()
# model_params['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, beta_2=0.99)
# model = continue_training_DNN(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
# model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
# model_params = simple_attention_1d()
# extend_and_finetune_last_layer_model(model_name, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
......@@ -58,3 +58,6 @@ def compute_features(text_df, which_features=None):
feature_data[:,i] = features
return feature_data
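# per-word indicator: 1 if the word is written in all caps and longer than one character, else 0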
def caps_vec(input_text):
split_text = text.text_to_word_sequence(input_text, filters="\n\t", lower=False)
return np.array([1 if (word.isupper() and len(word)>1) else 0 for word in split_text])
......@@ -44,8 +44,8 @@ def write_model(predictions, correct=None,
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
import pandas as pd
import time
if isinstance(predictions, list):
predictions = np.concatenate(predictions, axis=-1)
timestr = time.strftime("%m%d-%H%M")
subm = pd.read_csv('../input/sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
......@@ -92,6 +92,27 @@ def predictions_for_language(language_dict, test_data=None):
predictions[languages_test==language, :] = language_dict[language].predict_proba(language_data)
return predictions
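# map each word in a fastText .vec file to its line number (i.e. its frequency rank); cached on disk via joblib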
@memory.cache
def get_fasttext_rank(fasttext_path):
rank_index = {}
with open(fasttext_path, 'r') as f:
for nr, line in enumerate(f):
values = line.split()
word = values[0]
rank_index[word] = nr
return rank_index
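# dump predictions, true labels and text for the last `split` fraction of the training data (the part Keras uses as validation split) to a CSV for error analysis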
def make_training_set_preds(model, train_data, train_y, split=0.2):
import time
cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
split_n = np.round(split*train_data['main_input'].shape[0]).astype('int')
predictions = model.predict({label: data[-split_n:] for label, data in train_data.iteritems()})
df_dict = {'predictions_{}'.format(lbl) : preds for lbl, preds in zip(cols, predictions.T)}
df_dict.update({lbl : lbl_col for lbl, lbl_col in zip(cols, train_y[-split_n:].T)})
df_dict['text'] = train_data['main_input'][-split_n:]
df = pd.DataFrame(df_dict)
df.to_csv('predictions_{}.csv'.format(time.strftime("%m%d-%H%M")))
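# persist a hyperopt Trials object (via joblib, despite the .json extension) under a timestamped file name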
def dump_trials(trials, fname=''):
import time
joblib.dump(trials, '../validation_logs/trial_{}_{}.json'.format(fname, time.strftime("%m%d-%H%M")))
......@@ -110,3 +131,4 @@ def join_embedding_vec(word_dict, path):
except KeyError:
word_dict[word] = np.concatenate([word_dict[word], np.zeros(n_dim)])
return word_dict
......@@ -11,8 +11,11 @@ from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
import re, string
from sklearn.base import BaseEstimator, TransformerMixin
import feature_engineering
import string
import json
from functools import partial
eng_stopwords = set(stopwords.words("english"))
memory = joblib.Memory(cachedir='/home/mboos/joblib')
......@@ -36,37 +39,95 @@ control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
control_char_re = re.compile('[%s]' % re.escape(control_chars))
#with open('bad_words_translation.json', 'r') as fl:
# bad_word_dict = json.load(fl)
bad_word_dict = joblib.load('bad_words_misspellings.pkl')
some_bad_words = joblib.load('some_bad_words.pkl')
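# helpers for matching character-repeated spellings: check_for_duplicates flags which strings in zero_words are just repeated-character variants of word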
def check_for_duplicates(word, zero_words):
regex = r'^' + ''.join('[{}]+'.format(c) for c in word) + '$'
matches = [re.search(regex, s) for s in zero_words]
is_match = np.array([m is not None for m in matches])
return is_match, np.where(is_match)[0]
def replacement_regex(word):
regex = r'\b' + ''.join('[{}]+'.format(c) for c in word) + r'\b'
return regex
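# e.g. replacement_regex('bad') gives r'\b[b]+[a]+[d]+\b', so elongated spellings like 'baaad' are matched as well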
def remove_control_chars(s):
return control_char_re.sub('', s)
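# normalize a comment: unicode-normalize, lowercase, expand contractions, replace IPs/user names/numbers/URLs/e-mails with placeholder tokens, space out punctuation and optionally substitute known bad-word misspellings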
def clean_comment(text, replace_misspellings=False):
import unicodedata as ud
text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
text = text.lower()
text = re.sub(r'[^\x00-\x7f]', r' ' , text)
text = re.sub(r'[\n\r]', r' ', text)
s = re.sub(r"what's", "what is ", text, flags=re.IGNORECASE)
s = re.sub(r"\'s", " ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ve", " have ", s, flags=re.IGNORECASE)
s = re.sub(r"can't", "cannot ", s, flags=re.IGNORECASE)
s = re.sub(r"n't", " not ", s, flags=re.IGNORECASE)
s = re.sub(r"i'm", "i am ", s, flags=re.IGNORECASE)
s = re.sub(r"\'re", " are ", s, flags=re.IGNORECASE)
s = re.sub(r"\'d", " would ", s, flags=re.IGNORECASE)
s = re.sub(r"\'ll", " will ", s, flags=re.IGNORECASE)
s = re.sub(r"\'scuse", " excuse ", s, flags=re.IGNORECASE)
s = re.sub(r'([_])', r' \1 ', s)
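# replace IP addresses, user-style tokens (letters followed by digits) and stand-alone numbers with placeholders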
s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
s = re.sub(r'\b[a-z]+\d+\b', ' _user_ ', s)
s = re.sub(r'\b[0-9]+\b', ' _number_ ', s)
#hard coded replacements
for bad_word in some_bad_words:
s = re.sub(replacement_regex(bad_word), ' ' + bad_word + ' ', s)
s = re.sub(r'\bfukc\b', ' fuck ', s)
s = re.sub(r'\bfcuk\b', ' fuck ', s)
s = re.sub(r'\bfucc\b', ' fuck ', s)
s = re.sub(r'\bfukk\b', ' fuck ', s)
s = re.sub(r'\bfukker\b', ' fuck ', s)
s = re.sub(r'\bfucka\b', ' fucker ', s)
#wikipedia specific features
# wikipedia_regex = [r'\(talk\)', r'\(utc\)', r'\(talk|email\)']
# wikipedia_matches = [re.search(regex, s) for regex in wikipedia_regex]
s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
s = re.sub(r'\(talk\)', ' _wikipedia_ ', s)
s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)
s = re.sub(r'\(talk|email\)', ' _wikipedia_ ', s)
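# replace URLs and e-mail addresses with placeholder tokens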
s = re.sub(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' _url_ ', s)
s = re.sub(ur'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
#without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
# shorten elongated words: collapse characters repeated three or more times down to two
s = re.sub(r'(\w)\1\1+', r' \1\1 ', s)
# add space around punctuation
s = re.sub(r'([.,!?():;^`<=>$%&@|{}\-+\[\]#~*\/"])', r' \1 ', s)
s = re.sub(r"(['])", r' \1 ', s)
s = re.sub('\s{2,}', ' ', s)
if replace_misspellings:
for key, val in bad_word_dict.iteritems():
s = re.sub(r'\b{}\b'.format(key.lower()), ' '+val.lower()+' ', s)
return s.encode('utf-8')
@memory.cache
def data_preprocessing(df, replace_misspellings=False):
df['comment_text'].fillna(' ', inplace=True)
clean_comment_dummy = partial(clean_comment, replace_misspellings=replace_misspellings)
df['comment_text'] = df['comment_text'].apply(clean_comment_dummy)
return df
def load_data(name='train.csv', preprocess=True, cut=False, replace_misspellings=False):
data = pd.read_csv('../input/{}'.format(name), encoding='utf-8')
if preprocess:
data = data_preprocessing(data, replace_misspellings=replace_misspellings)
if cut and name=='train.csv':
# these comments are often (or always) mis-labeled
not_toxic_but_nz = np.logical_and(data.iloc[:,2].values==0, data.iloc[:,2:].values.any(axis=1))
data = data.drop(data.index[np.where(not_toxic_but_nz)[0]])
text = data['comment_text'].reset_index(drop=True)
labels = data.iloc[:, 2:].values
# data_dict = {'babel' : [text, labels]}
return text, labels
......@@ -78,7 +139,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
def __init__(self, max_features=20000, maxlen=200,
filters="\t\n{}&%$§^°[]<>|@[]+`' ", **kwargs):
self.max_features = max_features
self.maxlen = maxlen
self.is_trained = False
......@@ -91,3 +152,7 @@ class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
def transform(self, list_of_sentences):
return sequence.pad_sequences(self.tokenizer.texts_to_sequences(list_of_sentences), maxlen=self.maxlen)
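# capitalization-indicator sequences per comment, padded to maxlen so they can be fed as the additional 'aug_input' of the augmented models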
def pad_and_extract_capitals(df, maxlen=500):
train_data_augmentation = df.apply(feature_engineering.caps_vec)
return sequence.pad_sequences([caps for caps in train_data_augmentation], maxlen=maxlen)
......@@ -27,19 +27,24 @@ def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=5):
'''Builds and cross-validates a DNN on X, y; returns per-fold ROC-AUC scores and dumps the out-of-fold predictions'''
new_dict = {key:val for key, val in fixed_args.items()}
new_dict.update(kwargs)
new_dict['compilation_args']['optimizer'] = optimizers.Adam(lr=0.001, beta_2=0.99)
new_time = 'cval_{}'.format(time.strftime("%m%d-%H%M"))
kfold = KFold(n_splits=cv, shuffle=False)
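# with shuffle=False the folds are contiguous and in order, so the stacked out-of-fold predictions below stay aligned with the original rows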
scores = []
Xs = np.zeros((len(X),1), dtype='int8')
predictions = []
for train, test in kfold.split(Xs):
new_dict['compilation_args']['optimizer'] = new_dict['compilation_args']['optimizer_func'](**new_dict['compilation_args']['optimizer_args'])
train_x = [X[i] for i in train]
test_x = [X[i] for i in test]
model_time = '{}_{}'.format(new_time, time.strftime("%m%d-%H%M"))
estimator = DNN.fit_model(model_time, fit_args, train_x, y[train], **new_dict)
predictions.append(estimator.predict(test_x))
scores.append(roc_auc_score(y[test], predictions[-1]))
K.clear_session()
score_dict = {'loss' : np.mean(scores), 'loss_fold' : scores, 'status' : STATUS_OK}
K.clear_session()
predictions = np.vstack(predictions)
joblib.dump(predictions, '../predictions/{}.pkl'.format(new_time), compress=3)
joblib.dump(score_dict, '../scores/{}.pkl'.format(new_time))
return score_dict
def do_hyper_search(space, model_function, **kwargs):
......@@ -79,16 +84,20 @@ def validator(estimator, X, y, cv=3, fit_args={}, **kwargs):
#TODO: add other params
#TODO: model_func_param
def hyperopt_token_model(model_name, model_function, space, fixed_args):
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
# remove keys that are in the search space from fixed_args
all_search_space_keys = space.keys() + [subkey for key in space for subkey in space[key].keys()]
fixed_args = {key : val for key, val in fixed_args.iteritems() if key not in all_search_space_keys}
frozen_tokenizer = pre.KerasPaddingTokenizer(maxlen=fixed_args['maxlen'],
max_features=fixed_args['max_features'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
compilation_args = {'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}
fit_args = {'batch_size' : 80, 'epochs' : 30,
'validation_split' : 0.1}
fixed_args = {'tokenizer':frozen_tokenizer, 'embedding':embedding, 'compilation_args':compilation_args}
......@@ -97,7 +106,7 @@ def hyperopt_token_model(model_name, model_function, space, maxlen=300, max_feat
frozen_model_func = partial(model_function, train_text, train_y, fit_args, fixed_args)
trials = Trials()
best = fmin(frozen_model_func, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
hlp.dump_trials(trials, fname=model_name)
return best
......@@ -115,13 +124,34 @@ def validate_feature_model(model_name, model_function, space, fixed_params_file=
hlp.dump_trials(trials, fname=model_name)
return best
def do_hyperparameter_search():
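# search over RNN cell type and hidden-layer sizes; the number of recurrent layers is effectively fixed to 2 by the one-element choice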
DNN_search_space = {'model_function' : {'no_rnn_layers' : hp.choice('no_rnn_layers', [2]),
'rnn_func' : hp.choice('rnn_func', [models.CuDNNLSTM, models.CuDNNGRU]),
'hidden_rnn' : hp.quniform('hidden_rnn', 32, 96, 16),
'hidden_dense' : hp.quniform('hidden_dense', 16, 64, 8)}}
token_models_to_test = {
'DNN' : (DNN_model_validate, DNN_search_space, DNN.simple_attention())}
for model_name, (func, space, fixed_args) in token_models_to_test.iteritems():
best = hyperopt_token_model(model_name, func, space, fixed_args)
joblib.dump(best, 'best_{}.pkl'.format(model_name))
def test_models():
fit_args = {'batch_size' : 80, 'epochs' : 10,
'validation_split' : 0.2}
fixed_args = DNN.simple_attention_channel_dropout()
kwargs = {}
train_text, train_y = pre.load_data()
test_text, _ = pre.load_data('test.csv')
adam_args = {'clipnorm' : 1., 'lr' : 0.001}
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=fixed_args['max_features'], maxlen=fixed_args['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
kwargs['embedding'] = embedding
kwargs['tokenizer'] = frozen_tokenizer
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
fixed_args = DNN.simple_attention_channel_dropout()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
fixed_args = DNN.conc_attention()
DNN_model_validate(train_text, train_y, fit_args, fixed_args, kwargs, adam_args, cv=3)
if __name__=='__main__':
test_models()