Commit 362fb574 authored by mjboos

oov words wikipedia: use Wikipedia fastText embeddings plus separately trained OOV word vectors

parent df6ed11e
@@ -62,13 +62,12 @@ def predict_for_all(model):
     predictions = model.predict(test_text)
     hlp.write_model(predictions)
-def fit_model(name, **kwargs):
+def fit_model(name, embedding, **kwargs):
     best_weights_path="{}_best.hdf5".format(model_name)
     logger = CSVLogger('../logs/{}.csv'.format(model_name), separator=',', append=False)
     checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
     callbacks_list = [logger, checkpoint, early] #early
     fit_args['callbacks'] = callbacks_list
-    embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
     model = train_DNN(model_name, embedding, **kwargs)
     return model
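A minimal sketch of the pattern this hunk introduces (not the repo's exact code): the fastText embedding is now supplied by the caller instead of being loaded from a hard-coded path inside fit_model, so different embeddings can be swapped without touching the training helper. Here train_fn is a hypothetical stand-in for the repo's train_DNN, the patience value is an assumption, and, unlike the diff body above (which still formats paths with the module-level model_name), the paths are derived from the name argument:

    from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping

    def fit_model_sketch(name, embedding, train_fn, **kwargs):
        # train_fn is a hypothetical stand-in for the repo's train_DNN
        best_weights_path = '{}_best.hdf5'.format(name)
        callbacks = [
            CSVLogger('../logs/{}.csv'.format(name), separator=',', append=False),
            ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1,
                            save_best_only=True, mode='min'),
            EarlyStopping(monitor='val_loss', patience=5, mode='min'),  # assumed patience
        ]
        # the embedding is an argument rather than a file loaded in here
        return train_fn(name, embedding, callbacks=callbacks, **kwargs)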
@@ -109,17 +108,17 @@ def fine_tune_model(model_name, old_model, **kwargs):
 if __name__=='__main__':
     model_params = {
-        'max_features' : 500000, 'model_function' : models.LSTM_one_class, 'maxlen' : 200,
+        'max_features' : 500000, 'model_function' : models.LSTM_dropout_model, 'maxlen' : 200,
         'embedding_dim' : 300,
         'compilation_args' : {'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}}
     frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
     frozen_tokenizer.fit(pd.concat([train_text, test_text]))
-    model_name = '300_fasttext_finetune_LSTM'
-    embeddings_index = hlp.get_fasttext_embedding('../yt_comments.vec')
+    model_name = '300_fasttext_oov_words_LSTM'
+    embedding = hlp.get_fasttext_embedding('../wiki.en.vec')
+    embedding = hlp.update_embedding_vec(embedding, '../unknown_words.vec')
 # model_old = hacky_load_LSTM()
 # embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
 # fine_tune_model(model_name, model_old, embeddings_index=embedding, tokenizer=frozen_tokenizer, **model_params)
 # model = load_keras_model(model_name, tokenizer=frozen_tokenizer, **model_params)
 # model = fit_model(model_name, tokenizer=frozen_tokenizer, **model_params)
 # hlp.write_model(model.predict(test_text))
+    model = fit_model(model_name, embedding, tokenizer=frozen_tokenizer, **model_params)
+    hlp.write_model(model.predict(test_text))
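The driver above builds the lookup table in two steps: load the Wikipedia fastText vectors, then overlay vectors trained separately for out-of-vocabulary words. A tiny self-contained illustration of that overlay semantics, with made-up words and values (plain dict and NumPy only):

    import numpy as np

    wiki = {'toxic': np.zeros(300), 'comment': np.ones(300)}           # base vectors
    unknown = {'l33t': np.full(300, 0.5), 'toxic': np.full(300, 2.0)}  # OOV vectors
    wiki.update(unknown)   # same effect as hlp.update_embedding_vec above

    assert set(wiki) == {'toxic', 'comment', 'l33t'}   # new words are added
    assert wiki['toxic'][0] == 2.0                     # later vectors win on overlap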
@@ -76,3 +76,9 @@ def predictions_for_language(language_dict, test_data=None):
 def dump_trials(trials, fname=''):
     import time
     joblib.dump(trials, '../validation_logs/trial_{}_{}.json'.format(fname, time.strftime("%m%d-%H%M")))
+def update_embedding_vec(word_dict, path):
+    other_words = get_fasttext_embedding(path)
+    word_dict.update(other_words)
+    return word_dict
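For context, a .vec file in fastText's text format starts with a "<vocab_size> <dim>" header line followed by one "word v1 ... v300" row per word. A rough, self-contained sketch of a loader in that spirit (the repo's get_fasttext_embedding may differ):

    import io
    import numpy as np

    def load_vec_file(path):
        """Read a fastText .vec text file into a {word: vector} dict (sketch)."""
        embedding = {}
        with io.open(path, encoding='utf-8') as f:
            header = f.readline().split()
            if len(header) != 2:  # some files omit the "<vocab> <dim>" header row
                embedding[header[0]] = np.asarray(header[1:], dtype='float32')
            for line in f:
                parts = line.rstrip().split(' ')
                if len(parts) < 2:
                    continue
                embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
        return embedding

update_embedding_vec above then merges a second file of this form into the base dictionary, so the Wikipedia vectors can be extended with vectors trained for previously unknown words.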
@@ -32,16 +32,21 @@ def text_to_word_sequence(text,
 text.text_to_word_sequence = text_to_word_sequence
 control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
 control_char_re = re.compile('[%s]' % re.escape(control_chars))
 def remove_control_chars(s):
     return control_char_re.sub('', s)
+def clean_comment(text):
+    control_chars = u'0123456789!\'"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
+    control_char_re = re.compile('[%s]' % re.escape(control_chars))
+    return ' '.join(control_char_re.sub(' ', text).split(' '))
 @memory.cache
 def data_preprocessing(df):
     COMMENT = 'comment_text'
     df[COMMENT].fillna('_UNK_', inplace=True)
-    # df[COMMENT] = df[COMMENT].apply(clean_comment)
+    df[COMMENT] = df[COMMENT].apply(clean_comment)
     return df
 def load_data(name='train.csv', preprocess=True):
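The newly enabled clean_comment replaces digits and punctuation with spaces before tokenisation. A self-contained rerun of the same substitution on a made-up comment (note that the join/split on single spaces leaves runs of spaces intact; the whitespace-collapsing split() below is only for display):

    import re

    strip_chars = u'0123456789!\'"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    strip_re = re.compile(u'[%s]' % re.escape(strip_chars))

    def clean_comment_demo(text):
        # identical substitution to clean_comment above
        return u' '.join(strip_re.sub(u' ', text).split(u' '))

    cleaned = clean_comment_demo(u"You're *really* annoying!!! 111")
    print(cleaned.split())   # ['You', 're', 'really', 'annoying']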