Commit fca05728 authored by mjboos

most recent

parent 66ff81ba
*.csv
*.hdf5
*~
*.log
*.txt
*.out
@@ -90,7 +90,7 @@ def simple_attention_1d(trainable=False, prune=True):
     return model_params
 def simple_attention(trainable=False, prune=True):
-    model_func = partial(models.RNN_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5)
+    model_func = partial(models.RNN_attention, rnn_func=keras.layers.CuDNNGRU, no_rnn_layers=2, hidden_rnn=96, dropout_dense=0.5, dropout=0.5, train_embedding=False)
     model_params = {
         'max_features' : 500000, 'model_function' : model_func, 'maxlen' : 500,
         'embedding_dim' : 300, 'trainable' : trainable, 'prune' : prune,
@@ -131,8 +131,8 @@ if __name__=='__main__':
     weight_tensor = tf.convert_to_tensor(class_weights, dtype=tf.float32)
     loss = partial(models.weighted_binary_crossentropy, weights=weight_tensor)
     loss.__name__ = 'weighted_binary_crossentropy'
-    model_params = simple_attention()
-    model_name = '300_fasttext_attention_diffpre2_GRU'
+    model_params = simple_attention(trainable=False)
+    model_name = '300_fasttext_attention_diffpre3_GRU'
     frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=model_params['max_features'], maxlen=model_params['maxlen'])
     frozen_tokenizer.fit(pd.concat([train_text, test_text]))
     embedding = hlp.get_fasttext_embedding('../crawl-300d-2M.vec')
......
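Note on the loss setup above: the class-weight tensor is bound into models.weighted_binary_crossentropy with functools.partial, and __name__ is set by hand because partial objects have none (Keras uses it when logging/serializing the loss). The actual function body lives outside this diff; a minimal sketch of what a per-label weighted binary cross-entropy of this shape could look like (the exact weighting scheme in models.py is an assumption here):

```python
import keras.backend as K

def weighted_binary_crossentropy(y_true, y_pred, weights):
    # element-wise binary cross-entropy, scaled by a per-label weight vector
    # (weights broadcasts over the six toxicity labels)
    bce = K.binary_crossentropy(y_true, y_pred)
    return K.mean(bce * weights, axis=-1)
```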
@@ -209,12 +209,15 @@ def make_embedding_matrix(embedding, word_index, max_features=20000, maxlen=200,
     return embedding_matrix
-def make_embedding_layer(embedding_matrix, maxlen=200, trainable=False, **kwargs):
+def make_embedding_layer(embedding_matrix, maxlen=200, l2=1e-6, trainable=False, **kwargs):
     # load pre-trained word embeddings into an Embedding layer
     # note that we set trainable = False so as to keep the embeddings fixed
+    from keras.regularizers import L1L2
+    embed_reg = L1L2(l2=l2) if l2 != 0 and trainable else None
     embedding_layer = Embedding(embedding_matrix.shape[0],
                                 embedding_matrix.shape[1],
                                 weights=[embedding_matrix],
+                                embeddings_regularizer=embed_reg,
                                 input_length=maxlen,
                                 trainable=trainable)
     return embedding_layer
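The new l2 argument only takes effect when the embedding is trainable: with trainable=False (the default used elsewhere in this commit) embed_reg stays None and the layer behaves exactly as before. A small, self-contained illustration of that interplay, with a hypothetical random matrix standing in for the fastText vectors:

```python
import numpy as np
from keras.layers import Embedding
from keras.regularizers import L1L2

embedding_matrix = np.random.rand(1000, 300).astype('float32')  # placeholder shape

l2, trainable = 1e-6, True
embed_reg = L1L2(l2=l2) if l2 != 0 and trainable else None  # stays None when frozen
layer = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  embeddings_regularizer=embed_reg,
                  input_length=200,
                  trainable=trainable)
```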
@@ -483,14 +486,17 @@ def RNN_attention_1d(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_fun
     x = Dense(1, activation="sigmoid", name='main_output')(x)
     return x, None
-def RNN_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, dropout_dense=0.5, input_len=500):
+def RNN_attention(x, no_rnn_layers=2, hidden_rnn=48, hidden_dense=48, rnn_func=None, dropout=0.5, dropout_dense=0.5, input_len=500, train_embedding=False):
     if rnn_func is None:
         rnn_func = CuDNNLSTM
     if not isinstance(hidden_rnn, list):
         hidden_rnn = [hidden_rnn] * no_rnn_layers
     if len(hidden_rnn) != no_rnn_layers:
         raise ValueError('list of recurrent units needs to be equal to no_rnn_layers')
-    vals = []
+    if train_embedding:
+        vals = [x]
+    else:
+        vals = []
     for rnn_size in hidden_rnn:
         x = Dropout(dropout)(x)
         x = Bidirectional(rnn_func(int(rnn_size), return_sequences=True))(x)
......
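The new train_embedding flag seeds vals with the raw embedding tensor before the RNN stack runs. The attention step itself is outside this hunk, so the following is only a hypothetical sketch of the pattern this suggests: collect the embedding plus each bidirectional layer's sequence output and merge them before attention/pooling (the append and concatenate steps are assumptions, not the author's code):

```python
from keras.layers import Bidirectional, CuDNNGRU, Dropout, concatenate

def stacked_rnn_features(x, hidden_rnn=(96, 96), dropout=0.5, train_embedding=False):
    # optionally include the embedding itself among the tensors fed downstream
    vals = [x] if train_embedding else []
    for rnn_size in hidden_rnn:
        x = Dropout(dropout)(x)
        x = Bidirectional(CuDNNGRU(int(rnn_size), return_sequences=True))(x)
        vals.append(x)
    # hypothetical: merge all collected sequence outputs along the feature axis
    return concatenate(vals) if len(vals) > 1 else vals[0]
```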
@@ -89,8 +89,13 @@ def clean_comment(text, replace_misspellings=False):
     s = re.sub(r'\bfucka\b', ' fucker ', s)
     #wikipedia specific features
-    s = re.sub(r'(?<=\(talk\)).*?(?=$)', ' _date_ ', s)
-    s = re.sub(r'\b\(talk\)', ' _wikipedia_ ', s)
+    # wikipedia_regex = [r'\(talk\)', r'\(utc\)', r'\(talk|email\)']
+    # wikipedia_matches = [re.search(regex, s) for regex in wikipedia_regex]
+    s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)
+    s = re.sub(r'\(talk\)', ' _wikipedia_ ', s)
+    s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)
+    s = re.sub(r'\(talk|email\)', ' _wikipedia_ ', s)
     s = re.sub(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' _url_ ', s)
     s = re.sub(ur'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b', ' _mail_ ', s)
     #without_controls = ' '.join(control_char_re.sub(' ', text).split(' '))
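The reworked Wikipedia rules now anchor the timestamp between (talk) and (utc) instead of running to the end of the string, and map the signature markers themselves to _wikipedia_. A quick standalone rerun of just those three substitutions on a made-up talk-page signature (written with plain r'' literals; the repository itself uses Python 2 ur'' strings elsewhere):

```python
import re

s = "please stop reverting my edits. someuser (talk) 21:54, 3 june 2017 (utc)"
s = re.sub(r'(?<=\(talk\)).*?(?=\(utc\))', ' _date_ ', s)  # timestamp -> _date_
s = re.sub(r'\(talk\)', ' _wikipedia_ ', s)                # talk marker -> _wikipedia_
s = re.sub(r'\(utc\)', ' _wikipedia_ ', s)                 # utc marker -> _wikipedia_
print(s)  # "please stop reverting my edits. someuser  _wikipedia_  _date_  _wikipedia_ "
```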
@@ -130,7 +135,7 @@ def keras_pad_sequence_to_sklearn_transformer(maxlen=100):
 class KerasPaddingTokenizer(BaseEstimator, TransformerMixin):
     def __init__(self, max_features=20000, maxlen=200,
-                 filters="\t\n", **kwargs):
+                 filters="\t\n{}&%$§^°[]<>|@[]+`' ", **kwargs):
         self.max_features = max_features
         self.maxlen = maxlen
         self.is_trained = False
......
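The widened filters string strips considerably more punctuation before indexing. The fit/transform bodies are outside this hunk; below is a minimal sketch of the plain Keras Tokenizer + pad_sequences pairing that a wrapper like this typically delegates to (the texts and sizes are made up):

```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

texts = ["you are a very helpful editor", "stop vandalizing this talk page"]
tokenizer = Tokenizer(num_words=20000, filters="\t\n{}&%$§^°[]<>|@[]+`' ")
tokenizer.fit_on_texts(texts)
padded = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=200)
print(padded.shape)  # (2, 200)
```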