Commit 6020f288 authored by mjboos's avatar mjboos

Fine-tune per-label DNN models and dump their test predictions; add a fastText rank lookup helper; add a `cut` option to `load_data` that drops likely mis-labeled comments

parent 1a5644ce
@@ -52,7 +52,7 @@ def continue_training_DNN(model_name, fit_args, *args, **kwargs):
     best_weights_path = "{}_best.hdf5".format(model_name)
     model = models.Embedding_Blanko_DNN(**kwargs)
     model.model.load_weights(best_weights_path)
-    callbacks_list = make_callback_list(model_name+'_more', patience=5)
+    callbacks_list = make_callback_list(model_name+'_more', patience=3)
     fit_args['callbacks'] = callbacks_list
     model.fit(*args, **fit_args)
     return model
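
Note: `make_callback_list` is defined elsewhere in the repo and not shown in this diff. A minimal sketch of what it plausibly returns, assuming standard Keras callbacks with `patience` wired to early stopping (names and monitored metric are assumptions, not repo code):

```python
# Hypothetical reconstruction of make_callback_list -- the real one lives outside this diff.
from keras.callbacks import ModelCheckpoint, EarlyStopping

def make_callback_list(model_name, patience=5):
    # checkpoint the best weights under the same naming scheme used above
    checkpoint = ModelCheckpoint('{}_best.hdf5'.format(model_name),
                                 monitor='val_loss', save_best_only=True)
    # stop once validation loss fails to improve for `patience` epochs
    early = EarlyStopping(monitor='val_loss', patience=patience)
    return [checkpoint, early]
```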
@@ -110,11 +110,12 @@ def transfer_weights_multi_to_one(weights, model, i):
     # now for the last layer
     model.layers[-1].set_weights([weights[-1][0][:,i][:,None], weights[-1][1][i][None]])

-def fine_tune_model(model_name, old_model, fit_args, train_X, train_y, **kwargs):
+def fine_tune_model(model_name, old_model, fit_args, train_X, train_y, test_text, **kwargs):
     '''Fine-tunes a separate model for each label i and dumps its test-set predictions'''
     weights = [layer.get_weights() for layer in old_model.layers]
-    for i in xrange(6):
+    for i in xrange(4, 6):
         new_name = model_name + '_{}'.format(i)
         model = continue_training_DNN_one_output(new_name, i, weights, fit_args, train_X, train_y[:,i], **kwargs)
+        joblib.dump(model.predict(test_text), '{}.pkl'.format(new_name))
         K.clear_session()
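
The last-layer slicing in the context line above takes column `i` of the multi-label output kernel and entry `i` of its bias to seed a single-output head. A standalone sketch of just that shape manipulation (NumPy only, not repo code):

```python
import numpy as np

# weights[-1] of a 6-label dense output layer: kernel (hidden_dim, 6), bias (6,)
hidden_dim = 8
kernel, bias = np.random.randn(hidden_dim, 6), np.random.randn(6)

i = 4  # label index
single_kernel = kernel[:, i][:, None]  # -> shape (hidden_dim, 1)
single_bias = bias[i][None]            # -> shape (1,)
assert single_kernel.shape == (hidden_dim, 1)
assert single_bias.shape == (1,)
```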
@@ -92,8 +92,8 @@ if __name__=='__main__':
     keras_model = load_keras_model(model_name)
     # model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
     # model = continue_training_DNN(model_name, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
-    # model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
-    fine_tune_model(model_name, keras_model, fit_args, train_text, train_y, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
+    # model = load_full_model(model_name+'_0', embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
+    fine_tune_model(model_name, keras_model, fit_args, train_text, train_y, test_text, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
     conc_finetuned_preds(model_name)
     # hlp.write_model(model.predict(test_text))
     # K.clear_session()
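
`conc_finetuned_preds` is not part of this diff; given that `fine_tune_model` now dumps one `{model_name}_{i}.pkl` per label, it presumably stitches those dumps back into a single prediction matrix. A hedged sketch under that assumption:

```python
# Hypothetical sketch of conc_finetuned_preds -- the real definition is outside this diff.
import joblib
import numpy as np

def conc_finetuned_preds(model_name):
    # assumption: each per-label dump holds an (n_samples, 1) prediction array
    preds = [joblib.load('{}_{}.pkl'.format(model_name, i)) for i in xrange(6)]
    return np.concatenate(preds, axis=-1)  # -> (n_samples, 6)
```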
@@ -92,6 +92,17 @@ def predictions_for_language(language_dict, test_data=None):
         predictions[languages_test==language, :] = language_dict[language].predict_proba(language_data)
     return predictions

+@memory.cache
+def get_fasttext_rank(fasttext_path):  # maps word -> rank, i.e. its line number in the vectors file
+    rank_idx = {}
+    with open(fasttext_path, 'r') as f:
+        for nr, line in enumerate(f):
+            values = line.split()
+            word = values[0]
+            rank_idx[word] = nr
+    return rank_idx
+
 def dump_trials(trials, fname=''):
     import time
     joblib.dump(trials, '../validation_logs/trial_{}_{}.json'.format(fname, time.strftime("%m%d-%H%M")))
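
Usage sketch for `get_fasttext_rank` (the path is illustrative; note that standard `.vec` files begin with a `<count> <dim>` header line, which this function will also index, as rank 0):

```python
# Illustrative path -- substitute the fastText vectors file actually used.
rank = get_fasttext_rank('../input/crawl-300d-2M.vec')
# words earlier in the file (smaller rank) are more frequent
print(rank.get(u'the'))
```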
@@ -56,16 +56,14 @@ def data_preprocessing(df):
     df['comment_text'] = df['comment_text'].apply(clean_comment)
     return df

-def load_data(name='train.csv', preprocess=True):
+def load_data(name='train.csv', preprocess=True, cut=True):
     data = pd.read_csv('../input/{}'.format(name), encoding='utf-8')
     if preprocess:
         data = data_preprocessing(data)
-    # if language:
-    #     languages = pd.read_csv('language_{}'.format(name), header=None).squeeze()
-    #     grouped_data = data.groupby(by=lambda x : languages[x])
-    #     data_dict = { language : [data['comment_text'], data.iloc[:, 2:].values]
-    #                   for language, data in grouped_data }
-    # else:
+    if cut:
+        # drop comments that are often (or always) mis-labeled: not marked toxic, but some other label is set
+        not_toxic_but_nz = np.logical_and(data.iloc[:, 2].values == 0, data.iloc[:, 2:].values.sum(axis=1) > 0)
+        data = data.drop(data.index[np.where(not_toxic_but_nz)[0]])
     text = data['comment_text']
     labels = data.iloc[:, 2:].values
     # data_dict = {'babel' : [text, labels]}
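
Usage sketch for the new `cut` flag; the function's return statement lies outside this hunk, so the tuple shape below is an assumption:

```python
# Assumes load_data ends with `return text, labels` (not visible in this hunk).
train_text, train_y = load_data('train.csv', preprocess=True, cut=True)
# keep the suspect rows for comparison
all_text, all_y = load_data('train.csv', preprocess=True, cut=False)
```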