Commit 3bb1f34b authored by mjboos

recent

parent cb0328d3
@@ -35,30 +35,56 @@ lr = LearningRateScheduler(schedule)
 callbacks_list = [checkpoint, early] #early
-fit_args = {'batch_size' : 256, 'epochs' : 30,
+fit_args = {'batch_size' : 128, 'epochs' : 30,
             'validation_split' : 0.2, 'callbacks' : callbacks_list}
 
 train_text, train_y = pre.load_data()
 test_text, _ = pre.load_data('test.csv')
 
+def aux_net():
+    model_func = partial(models.RNN_aux_loss, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
+        'embedding_dim' : 300, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy', 'aux_output' : 'binary_crossentropy'}, 'loss_weights' : [1., 0.1]}}
+    return model_params
+
+def simple_net():
+    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=2, hidden_rnn=64, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
+        'embedding_dim' : 300, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
+def add_net():
+    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNLSTM, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48)
+    model_params = {
+        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 400,
+        'embedding_dim' : 400, 'trainable' : False,
+        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99, clipvalue=1., clipnorm=1.), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
+    return model_params
+
 if __name__=='__main__':
     aux_task = train_y.sum(axis=1) > 0
     class_weights = hlp.get_class_weights(train_y)
     weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
     loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
     loss.__name__ = 'weighted_binary_crossentropy'
-    model_func = partial(models.RNN_general, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=1)
-    model_params = {
-        'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 300,
-        'embedding_dim' : 300, 'trainable' : False,
-        'compilation_args' : {'optimizer' : optimizers.Adam(lr=0.001, beta_2=0.99), 'loss':{'main_output': 'binary_crossentropy'}, 'loss_weights' : [1.]}}
-    model_name = '300_fasttext_cuda_GRU'
+    model_params = simple_net()
+    model_name = '300_fasttext_cuda_2_layers_LSTM'
     frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
     frozen_tokenizer.fit(pd.concat([train_text, test_text]))
     embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
     # embedding = hlp.join_embedding_vec(embedding, '../crawl-300d-2M.vec')
+    # model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
+    # SHUFFLE TRAINING SET so validation split is different every time
+    row_idx = np.arange(0, train_text.shape[0])
+    np.random.shuffle(row_idx)
+    train_text, train_y, aux_task = train_text[row_idx], train_y[row_idx], aux_task[row_idx]
+    model = fit_model(model_name, fit_args, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
-    # model = load_full_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
-    # model = fit_model(model_name, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
-    model = continue_training_DNN(model_name, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
+    # model = continue_training_DNN(model_name, {'main_input':train_text}, {'main_output':train_y}, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
     hlp.write_model(model.predict(test_text))
     K.clear_session()
     # model_params['compilation_args']['optimizer'] = optimizers.Adam(lr=0.0005, beta_2=0.99)
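Note: the `__main__` block above prepares a class-weighted loss by currying the weight tensor into `models.weighted_binary_crossentropy` and then giving the `functools.partial` object a `__name__`, which Keras expects when compiling and serializing custom losses (partials do not carry one). The loss function itself is outside this diff; a minimal sketch consistent with how it is called, where the body is an assumption:

```python
import keras.backend as K

def weighted_binary_crossentropy(y_true, y_pred, weights):
    # Element-wise binary crossentropy, scaled per label by a weight tensor
    # (shape (1, n_labels)) so rarer classes contribute more to the loss.
    return K.mean(K.binary_crossentropy(y_true, y_pred) * weights, axis=-1)
```

The visible `compilation_args` still pass the plain `'binary_crossentropy'` string, so the weighted variant is prepared here but not actually wired into `simple_net()`.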
@@ -40,7 +40,7 @@ def get_class_weights(y_mat, smooth_factor=0.):
 def make_weight_matrix(y_mat, weights):
     return np.tile(weights[None], (y_mat.shape[0], 1))
 
-def write_model(predictions, correct=correct_predictions,
+def write_model(predictions, correct=None,
                 cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
     import pandas as pd
     import time
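Only the signature of `write_model` changes: defaulting `correct` to `None` removes the dependency on a `correct_predictions` global. The body lies outside the diff; a hypothetical sketch of what a submission writer with this signature could do (the file name and format below are assumptions):

```python
import time
import pandas as pd

def write_model(predictions, correct=None,
                cols=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    # One column per toxicity label; time-stamp the file so runs don't overwrite.
    df = pd.DataFrame(predictions, columns=cols)
    df.to_csv('predictions_{}.csv'.format(time.strftime('%Y%m%d-%H%M%S')), index=False)
```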
@@ -380,9 +380,9 @@ def RNN_aux_loss(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None
     x = Dense(6, activation="sigmoid", name='main_output')(x)
     return [x, aux_dense], None
 
-def RNN_general(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None, dropout=0.5):
+def RNN_general(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5):
     if rnn_func is None:
-        rnn_func = LSTM
+        rnn_func = CuDNNLSTM
     if not isinstance(hidden_rnn, list):
         hidden_rnn = [hidden_rnn] * no_rnn_layers
     if len(hidden_rnn) != no_rnn_layers:
@@ -392,6 +392,7 @@ def RNN_general(x, no_rnn_layers=1, hidden_rnn=64, hidden_dense=32, rnn_func=None
         x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
     x = GlobalMaxPool1D()(x)
     x = Dropout(dropout)(x)
+    # x = BatchNormalization(x)
     x = Dense(int(hidden_dense), activation='relu')(x)
     x = Dropout(dropout)(x)
     x = Dense(6, activation="sigmoid", name='main_output')(x)
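Taken together, the two hunks above make `RNN_general` default to a two-layer bidirectional CuDNNLSTM encoder with max-pooling over time. Assembled from the visible fragments, with the lines hidden by the collapsed context (the length check and the return value) filled in by assumption:

```python
from keras.layers import (Bidirectional, CuDNNLSTM, Dense, Dropout,
                          GlobalMaxPool1D)

def RNN_general(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48,
                rnn_func=None, dropout=0.5):
    if rnn_func is None:
        rnn_func = CuDNNLSTM
    if not isinstance(hidden_rnn, list):
        hidden_rnn = [hidden_rnn] * no_rnn_layers
    if len(hidden_rnn) != no_rnn_layers:
        raise ValueError('hidden_rnn needs one size per RNN layer')  # assumed check
    for rnn_size in hidden_rnn:
        # return_sequences=True keeps per-timestep outputs both for stacking
        # further RNN layers and for the max-pool below.
        x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout)(x)
    # x = BatchNormalization(x)
    x = Dense(int(hidden_dense), activation='relu')(x)
    x = Dropout(dropout)(x)
    x = Dense(6, activation="sigmoid", name='main_output')(x)
    return x, None  # assumed to mirror RNN_aux_loss's (outputs, aux) return shape
```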
@@ -44,16 +44,15 @@ def clean_comment(text):
     text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
     text = re.sub(r'[^\x00-\x7f]', r' ' , text)
     text = re.sub(r'[\n\r]', r' ', text)
-    text = re.sub(r'["]', r' ', text)
     #without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
     # add space between punctuation
-    s = re.sub(r"([.,!?():;_^`<=>$%&@|{}\-+#~*\/])", r' \1 ', text)
+    s = re.sub(r'([.,!?():;_^`<=>$%&@|{}\-+#~*\/"])', r' \1 ', text)
     s = re.sub('\s{2,}', ' ', s)
     return s.encode('utf-8')
 
 @memory.cache
 def data_preprocessing(df):
-    df['comment_text'].fillna(' ', inplace=True)
+    df['comment_text'].fillna('', inplace=True)
     df['comment_text'] = df['comment_text'].apply(clean_comment)
     return df
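The net effect of the two regex changes in `clean_comment`: double quotes are no longer deleted outright but padded with spaces like every other punctuation mark, so they survive as their own token. A worked example (assuming Python 3, where the final `encode` yields bytes):

```python
clean_comment('He said: "no!"')
# before this commit: b'He said : no ! '     (quotes dropped)
# after this commit:  b'He said : " no ! " ' (quotes kept as tokens)
```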
@@ -79,7 +78,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
 
 class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
     def __init__(self, max_features=20000, maxlen=200,
-                 filters='\'\"\t\n', **kwargs):
+                 filters='\t\n', **kwargs):
         self.max_features = max_features
         self.maxlen = maxlen
         self.is_trained = False
@@ -87,7 +86,6 @@ class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
     def fit(self, list_of_sentences, y=None, **kwargs):
         self.tokenizer.fit_on_texts(list(list_of_sentences))
-        print('fit this thing')
         self.is_trained = True
         return self
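With `'` and `"` dropped from `filters`, the Keras `Tokenizer` wrapped by this class no longer strips quotes itself; quote handling now lives entirely in `clean_comment`. Only `fit` is visible in the diff; the matching sklearn-style `transform` presumably integer-encodes and pads to `maxlen`, roughly like this (an assumed sketch):

```python
from keras.preprocessing.sequence import pad_sequences

def transform(self, list_of_sentences, y=None, **kwargs):
    # Counterpart to fit(): map texts to integer sequences, then pad/truncate.
    sequences = self.tokenizer.texts_to_sequences(list(list_of_sentences))
    return pad_sequences(sequences, maxlen=self.maxlen)
```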
@@ -42,7 +42,6 @@ def DNN_model_validate(X, y, fit_args, fixed_args, kwargs, cv=5):
     K.clear_session()
     return score_dict
 
 def do_hyper_search(space, model_function, **kwargs):
     '''Do a search over the space using a frozen model function'''
     trials = Trials()
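`do_hyper_search` pairs a hyperopt `Trials` object with a frozen model function. The `fmin` call itself falls outside the visible hunk; a typical wiring, where the `max_evals` parameter and the return value are assumptions, would be:

```python
from hyperopt import Trials, fmin, tpe

def do_hyper_search(space, model_function, max_evals=20, **kwargs):
    '''Do a search over the space using a frozen model function'''
    trials = Trials()
    # model_function evaluates one sampled hyperparameter configuration and
    # returns the scalar validation loss that TPE minimizes.
    best = fmin(fn=model_function, space=space, algo=tpe.suggest,
                max_evals=max_evals, trials=trials)
    return best, trials
```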