Commit c7235e90 authored by mjboos

diff preprocessing

parent 362fb574
@@ -26,21 +26,32 @@ lr = LearningRateScheduler(schedule)
 callbacks_list = [checkpoint, early] #early
-fit_args = {'batch_size' : 256, 'epochs' : 10,
+fit_args = {'batch_size' : 256, 'epochs' : 15,
             'validation_split' : 0.2, 'callbacks' : callbacks_list}
 train_text, train_y = pre.load_data()
 test_text, _ = pre.load_data('test.csv')
-def train_DNN(model_name, embeddings_index, **kwargs):
+def train_DNN(model_name, **kwargs):
     best_weights_path="{}_best.hdf5".format(model_name)
-    model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
+    model = models.Embedding_Blanko_DNN(**kwargs)
     with open('../model_specs/{}.json'.format(model_name), 'w') as fl:
         json.dump(model.model.to_json(), fl)
     model.fit(train_text, train_y, **fit_args)
     model.model.load_weights(best_weights_path)
     return model
+def continue_training_DNN(model_name, **kwargs):
+    best_weights_path="{}_best.hdf5".format(model_name)
+    model = models.Embedding_Blanko_DNN(**kwargs)
+    model.model.load_weights(best_weights_path)
+    logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=True)
+    checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
+    callbacks_list = [logger, checkpoint, early] #early
+    fit_args['callbacks'] = callbacks_list
+    model.fit(train_text, train_y, **fit_args)
+    return model
 def continue_training_DNN_one_output(model_name, i, weights, **kwargs):
     best_weights_path="{}_{}_best.hdf5".format(model_name, i)
     model = models.Embedding_Blanko_DNN(**kwargs)
@@ -62,13 +73,13 @@ def predict_for_all(model):
     predictions = model.predict(test_text)
     hlp.write_model(predictions)
-def fit_model(name, embedding, **kwargs):
+def fit_model(model_name, **kwargs):
     best_weights_path="{}_best.hdf5".format(model_name)
     logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
     checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
     callbacks_list = [logger, checkpoint, early] #early
     fit_args['callbacks'] = callbacks_list
-    model = train_DNN(model_name, embedding, **kwargs)
+    model = train_DNN(model_name, **kwargs)
     return model
 def load_keras_model(model_name, **kwargs):
@@ -108,17 +119,13 @@ def fine_tune_model(model_name, old_model, **kwargs):
 if __name__=='__main__':
     model_params = {
-        'max_features' : 500000, 'model_function' : models.LSTM_dropout_model, 'maxlen' : 200,
+        'max_features' : 500000, 'model_function' : models.LSTM_dropout_model, 'maxlen' : 500,
         'embedding_dim' : 300,
         'compilation_args' : {'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}}
     frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
     frozen_tokenizer.fit(pd.concat([train_text, test_text]))
-    model_name = '300_fasttext_oov_words_LSTM'
-    embedding = hlp.get_fasttext_embedding('../wiki.en.vec')
-    embedding = hlp.update_embedding_vec(embedding, '../unknown_words.vec')
-    # model_old = hacky_load_LSTM()
-    # fine_tune_model(model_name, model_old, embeddings_index=embedding, tokenizer=frozen_tokenizer, **model_params)
-    # model = load_keras_model(model_name, tokenizer=frozen_tokenizer, **model_params)
-    model = fit_model(model_name, embedding, tokenizer=frozen_tokenizer, **model_params)
-    hlp.write_model(model.predict(test_text))
+    model_name = '300_fasttext_CC_LSTM'
+    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
+    model = fit_model(model_name, embedding=embedding, tokenizer=frozen_tokenizer, **model_params)
+    # hlp.write_model(model.predict(test_text))
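A minimal sketch (not part of the commit) of how the embedding now reaches the model after this change: it is no longer a positional argument of train_DNN, but a keyword that fit_model forwards through **kwargs down to models.Embedding_Blanko_DNN. All names are taken from the diff above.

    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
    model = fit_model('300_fasttext_CC_LSTM', embedding=embedding,
                      tokenizer=frozen_tokenizer, **model_params)
    # fit_model(model_name, **kwargs)
    #   -> train_DNN(model_name, **kwargs)
    #       -> models.Embedding_Blanko_DNN(embedding=..., tokenizer=..., ...)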
@@ -110,10 +110,10 @@ def keras_token_model(model_fuction=None, max_features=20000, maxlen=100, embed_
     return pipe.Pipeline(steps=[('tokenizer', pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)),
                                 ('BiLSTM', model)])
-def process_word(word, i, max_features, embedding_dim, correct_spelling, corr_dict1, embeddings_index):
+def process_word(word, i, max_features, embedding_dim, correct_spelling, corr_dict1, embedding):
     if i >= max_features:
         return np.zeros((1, embedding_dim))
-    embedding_vector = embeddings_index.get(word)
+    embedding_vector = embedding.get(word)
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         return embedding_vector[None]
@@ -122,7 +122,7 @@ def process_word(word, i, max_features, embedding_dim, correct_spelling, corr_di
         suggestions = corr_dict1.suggest(word)
         if len(suggestions) > 0:
             suggested_word = suggestions[0]
-            embedding_vector = embeddings_index.get(suggested_word)
+            embedding_vector = embedding.get(suggested_word)
             if embedding_vector is not None:
                 return embedding_vector[None]
     return np.zeros((1, embedding_dim))
@@ -171,14 +171,14 @@ def which_words_are_zero_vectors(embedding_matrix, word_index, oov_token):
     return word_list
 #TODO: more flexible spelling correction
-def make_embedding_matrix(embeddings_index, word_index, max_features=20000, maxlen=200, embedding_dim=50, correct_spelling=None, diagnostics=False, **kwargs):
+def make_embedding_matrix(embedding, word_index, max_features=20000, maxlen=200, embedding_dim=50, correct_spelling=None, diagnostics=False, **kwargs):
     num_words = min(max_features, len(word_index))
     # add one element for zero vector
     embedding_matrix = np.zeros((num_words+1, embedding_dim))
     for word, i in word_index.items():
         if i >= max_features:
             continue
-        embedding_vector = embeddings_index.get(word)
+        embedding_vector = embedding.get(word)
         if embedding_vector is not None:
             # words not found in embedding index will be all-zeros.
             embedding_matrix[i] = embedding_vector
@@ -186,7 +186,7 @@ def make_embedding_matrix(embeddings_index, word_index, max_features=20000, maxl
         if correct_spelling:
             # replace with autocorrected word IF this word is in embeddings
             suggested_word = correct_spelling(word)
-            embedding_vector = embeddings_index.get(suggested_word)
+            embedding_vector = embedding.get(suggested_word)
             if embedding_vector is not None:
                 embedding_matrix[i] = embedding_vector
@@ -220,7 +220,7 @@ def add_oov_vector_and_prune(embedding_matrix, tokenizer):
     return prune_matrix_and_tokenizer(embedding_matrix, tokenizer)
 class Embedding_Blanko_DNN(BaseEstimator):
-    def __init__(self, embeddings_index=None, max_features=20000, model_function=None, tokenizer=None,
+    def __init__(self, embedding=None, max_features=20000, model_function=None, tokenizer=None,
                  maxlen=200, embedding_dim=100, correct_spelling=False, trainable=False, preprocess_embedding=False,
                  compilation_args={'optimizer':'adam','loss':'binary_crossentropy','metrics':['accuracy']}, embedding_args={'n_components' : 100}):
         self.compilation_args = compilation_args
@@ -239,10 +239,10 @@ class Embedding_Blanko_DNN(BaseEstimator):
         else:
             self.tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
-        if embeddings_index:
-            self.embeddings_index = embeddings_index
+        if embedding:
+            self.embedding = embedding
         else:
-            self.embeddings_index = hlp.get_glove_embedding('../glove.6B.100d.txt')
+            self.embedding = hlp.get_glove_embedding('../glove.6B.100d.txt')
         if model_function:
             self.model_function = model_function
@@ -251,7 +251,7 @@ class Embedding_Blanko_DNN(BaseEstimator):
         if self.tokenizer.is_trained:
             word_index = self.tokenizer.tokenizer.word_index
-            embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
+            embedding_matrix = make_embedding_matrix(self.embedding, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
             embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
             embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen,
                                                    trainable=self.trainable, preprocess_embedding=self.preprocess_embedding, **self.embedding_args)
@@ -265,7 +265,7 @@ class Embedding_Blanko_DNN(BaseEstimator):
         if not self.tokenizer.is_trained:
             self.tokenizer.fit(X)
             word_index = self.tokenizer.tokenizer.word_index
-            embedding_matrix = make_embedding_matrix(self.embeddings_index, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
+            embedding_matrix = make_embedding_matrix(self.embedding, word_index, max_features=self.max_features, maxlen=self.maxlen, embedding_dim=self.embedding_dim, correct_spelling=self.correct_spelling)
             embedding_matrix, self.tokenizer.tokenizer = add_oov_vector_and_prune(embedding_matrix, self.tokenizer.tokenizer)
             embedding_layer = make_embedding_layer(embedding_matrix, maxlen=self.maxlen, trainable=self.trainable, preprocess_embedding=self.preprocess_embedding, **self.embedding_args)
             sequence_input = Input(shape=(self.maxlen,), dtype='int32')
@@ -40,13 +40,19 @@ def remove_control_chars(s):
     return control_char_re.sub('', s)
 def clean_comment(text):
-    return ' '.join(control_char_re.sub(' ', text).split(' '))
+    import unicodedata as ud
+    text = ud.normalize('NFD', text)
+    text = re.sub(r'[^\x00-\x7f]', r' ' , text)
+    #without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
+    # add space between punctuation
+    s = re.sub(r"([.,!?():;_^`<=>$%&@|{}\-+#~*\/])", r' \1 ', text)
+    s = re.sub('\s{2,}', ' ', s)
+    return s.encode('utf-8')
 @memory.cache
 def data_preprocessing(df):
-    COMMENT = 'comment_text'
-    df[COMMENT].fillna('_UNK_', inplace=True)
-    df[COMMENT] = df[COMMENT].apply(clean_comment)
+    df['comment_text'].fillna(' ', inplace=True)
+    df['comment_text'] = df['comment_text'].apply(clean_comment)
     return df
 def load_data(name='train.csv', preprocess=True):
@@ -71,7 +77,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
 class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
     def __init__(self, max_features=20000, maxlen=200,
-                 filters='!\'"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', **kwargs):
+                 filters='\'\"\t\n', **kwargs):
         self.max_features = max_features
         self.maxlen = maxlen
         self.is_trained = False
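A self-contained sketch (not part of the commit) of what the new preprocessing does to a raw comment; the function body mirrors clean_comment as added above, and the sample string is made up. The Keras tokenizer filters were relaxed to '\'\"\t\n', presumably so that the punctuation clean_comment pads with spaces survives tokenization as separate tokens.

    import re
    import unicodedata as ud

    def clean_comment(text):
        # decompose accented characters, then replace any non-ASCII character with a space
        text = ud.normalize('NFD', text)
        text = re.sub(r'[^\x00-\x7f]', r' ', text)
        # pad punctuation with spaces so it becomes separate tokens, then collapse whitespace;
        # note the result is returned as a UTF-8 encoded bytes object, as in the commit
        s = re.sub(r"([.,!?():;_^`<=>$%&@|{}\-+#~*\/])", r' \1 ', text)
        s = re.sub(r'\s{2,}', ' ', s)
        return s.encode('utf-8')

    print(clean_comment(u"Don't be so naïve... it works (mostly)!"))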