Commit df6ed11e authored by MorBoos

edit

parent 7df06b25
......@@ -116,9 +116,10 @@ if __name__=='__main__':
frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
frozen_tokenizer.fit(pd.concat([train_text, test_text]))
model_name = '300_fasttext_finetune_LSTM'
model_old = hacky_load_LSTM()
embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
fine_tune_model(model_name, model_old, embeddings_index=embedding, tokenizer=frozen_tokenizer, **model_params)
embeddings_index = hlp.get_fasttext_embedding('../yt_comments.vec')
# model_old = hacky_load_LSTM()
# embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
# fine_tune_model(model_name, model_old, embeddings_index=embedding, tokenizer=frozen_tokenizer, **model_params)
# model = load_keras_model(model_name, tokenizer=frozen_tokenizer, **model_params)
# model = fit_model(model_name, tokenizer=frozen_tokenizer, **model_params)
# hlp.write_model(model.predict(test_text))
......@@ -33,9 +33,9 @@ def text_to_word_sequence(text,
text.text_to_word_sequence = text_to_word_sequence
def clean_comment(text):
    """Strip digits and punctuation from *text* and normalize whitespace.

    Every ASCII digit, every punctuation character listed below, and every
    tab or newline is replaced by a space. Runs of whitespace are then
    collapsed to a single space and leading/trailing whitespace is dropped.

    Args:
        text: The comment string to clean.

    Returns:
        The cleaned string, with words separated by single spaces.
    """
    filtered_chars = u'0123456789!\'"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # re.escape keeps ']', '\\', '^' and '-' literal inside the character class.
    filtered_char_re = re.compile('[%s]' % re.escape(filtered_chars))
    # Substitute a space (not '') so words joined by punctuation stay separate.
    spaced = filtered_char_re.sub(' ', text)
    # split() without an argument collapses whitespace runs; split(' ')
    # followed by ' '.join() would return the string unchanged.
    return ' '.join(spaced.split())
@memory.cache
def data_preprocessing(df):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment