Commit f2c6ce00 authored by mjboos

skopt

parent 400e25e3
@@ -109,16 +109,44 @@ def len_comment(row):
 def avg_word_length(row):
     return np.mean([len(word) for word in row.split(' ')])
+
+def sentiment(row):
+    from textblob.en.sentiments import PatternAnalyzer
+    return PatternAnalyzer().analyze(row)[0]*10.
+
+def has_ip(row):
+    return '_ip_' in row
+
+def has_username(row):
+    return '_user_' in row
+
+def has_date(row):
+    return '_date_' in row
+
+def has_mail(row):
+    return '_mail_' in row
+
+def has_url(row):
+    return '_url_' in row
+
+def is_wikipedia(row):
+    return '( talk )' in row or '( utc )' in row or '( talk | email )' in row or 'talk page' in row
+
 feature_mapping_dict = {
     'length' : len_comment,
     'word_length' : avg_word_length,
     'count_exclamation' : count_symbol,
     'count_question' : partial(count_symbol, symbol='?'),
-    'bad_word' : count_bad_word,
-    'bad_word2' : count_bad_word2,
-    'count_capitals' : count_capitals,
+#    'bad_word' : count_bad_word,
+    'bad_word' : count_bad_word2,
+#    'count_capitals' : count_capitals,
+#    'proportion_capitals' : proportion_capitals,
-    'num_unique_words' : num_unique_words}
+    'num_unique_words' : num_unique_words,
+    'mail' : has_mail,
+    'url' : has_url,
+    'wiki' : is_wikipedia,
+    'ip' : has_ip,
+    'date' : has_date,
+    'sentiment' : sentiment}
 #    'proportion_unique_words' : proportion_unique_words}

 def compute_features(text_df, which_features=None):
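For orientation, here is a minimal sketch (not part of the commit) of how the new flag features can be used: feature_mapping_dict maps feature names to per-comment functions, so a feature table can be built by applying each function row-wise. The sample comments and the small local mapping below are hypothetical.

import numpy as np
import pandas as pd

def has_ip(row):
    return '_ip_' in row

def has_url(row):
    return '_url_' in row

def avg_word_length(row):
    return np.mean([len(word) for word in row.split(' ')])

# a trimmed-down stand-in for feature_mapping_dict
feature_mapping = {'ip': has_ip, 'url': has_url, 'word_length': avg_word_length}

comments = pd.Series(['please stop _ip_ vandalism', 'see _url_ for details'])
features = pd.DataFrame({name: comments.apply(func)
                         for name, func in feature_mapping.items()})
print(features.astype(float))  # boolean flags become 0/1 columns next to the numeric ones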
@@ -102,6 +102,34 @@ def sparse_to_dense(X):
 def predict_proba_conc(estimator, X):
     return np.concatenate([preds[:, 1][:, None] for preds in estimator.predict_proba(X)], axis=-1)
+
+def fasttext_binary_labels_to_preds(labels, predictions):
+    cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
+    y = np.zeros((len(labels), 6))
+    for i, (lbl, pred) in enumerate(zip(labels, predictions)):
+        lbl, pred = np.array(lbl), np.array(pred)
+        pred_probs = []
+        for col in cols:
+            # fastText returns labels without the trailing space used in the training files
+            wh_pos = np.where(lbl == '__label__{}'.format(col))[0]
+            wh_neg = np.where(lbl == '__label__not_{}'.format(col))[0]
+            # look up the probability at the label's rank, 0 if the label was not predicted
+            prob_pos = pred[wh_pos[0]] if len(wh_pos) > 0 else 0.
+            prob_neg = pred[wh_neg[0]] if len(wh_neg) > 0 else 0.
+            # renormalize the positive/negative pair into one probability per class
+            final_prob = prob_pos / (prob_pos + prob_neg) if prob_pos > 0. and prob_neg > 0. else prob_pos
+            pred_probs.append(final_prob)
+        y[i] = np.array(pred_probs)
+    return y
+
+def fasttext_labels_to_preds(labels, predictions):
+    cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
+    col_labels = ['__label__{}'.format(col) for col in cols]
+    y = np.zeros((len(labels), 6))
+    for i, (lbl, pred) in enumerate(zip(labels, predictions)):
+        lbl, pred = np.array(lbl), np.array(pred)
+        pred_probs = []
+        for col in col_labels:
+            wh = np.where(lbl == col)[0]
+            # use the predicted probability at the label's rank, not the rank itself
+            pred_probs.append(pred[wh[0]] if len(wh) > 0 else 0.)
+        y[i] = np.array(pred_probs)
+    return y
 @memory.cache
 def get_glove_embedding(glove_path):
     embeddings_index = {}
@@ -114,6 +142,24 @@ def get_glove_embedding(glove_path):
     f.close()
     return embeddings_index
+
+def make_fasttext_txt(train_text, train_y):
+    from sklearn.model_selection import KFold
+    kf = KFold(n_splits=6)
+    train_text = train_text.str.replace('\n', ' ')
+    cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
+    # prepend one '__label__<col>' or '__label__not_<col>' tag per class to each comment
+    indicator = [''.join(['__label__{} '.format(col) if y_i == 1 else '__label__not_{} '.format(col)
+                          for col, y_i in zip(cols, y)]) for y in train_y]
+#    indicator = [''.join(['__label__{} '.format(col) for col, y_i in zip(cols, y) if y_i == 1]) for y in train_y]
+#    indicator = [ind if len(ind) > 0 else '__label__fine ' for ind in indicator]
+    indicator = np.array(indicator)
+    for i, (train, test) in enumerate(kf.split(train_y)):
+        # index the text Series positionally so the fold indices line up with the labels
+        all_train = [ind + txt + '\n' for ind, txt in zip(indicator[train], train_text.iloc[train])]
+        all_test = [ind + txt + '\n' for ind, txt in zip(indicator[test], train_text.iloc[test])]
+        with open('../supervised_text_for_ft_cv_{}.txt'.format(i), 'w+') as fl:
+            fl.writelines(all_train)
+        with open('../supervised_test_text_for_ft_cv_{}.txt'.format(i), 'w+') as fl:
+            fl.writelines(all_test)
+
 @memory.cache
 def get_fasttext_embedding(fasttext_path):
     embeddings_index = {}
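A usage sketch (not part of the commit) for the label-conversion helper: fastText's multilabel prediction returns, per comment, a ranked list of labels with matching probabilities, and fasttext_binary_labels_to_preds renormalizes each positive/negative label pair into a single per-class probability. The toy labels and probabilities below are made up, and the helper from this commit is assumed to be in scope.

import numpy as np

# fake fastText output for two comments: ranked labels plus their probabilities
labels = [['__label__not_toxic', '__label__toxic', '__label__not_obscene'],
          ['__label__toxic', '__label__not_severe_toxic']]
predictions = [[0.7, 0.2, 0.05],
               [0.9, 0.6]]

probs = fasttext_binary_labels_to_preds(labels, predictions)
print(probs.shape)  # (2, 6): one column per class in cols
print(probs[0, 0])  # 0.2 / (0.2 + 0.7) for 'toxic' on the first comment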
@@ -134,7 +134,7 @@ import re, string
 re_tok = re.compile('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation))
 def tokenize(s): return re_tok.sub(r' \1 ', s).split()
-def get_tfidf_model(ngram_range=(1,2), tokenizer=None, min_df=0.005, max_df=0.9, strip_accents='unicode',
+def get_tfidf_model(ngram_range=(1,2), tokenizer=None, min_df=5, max_df=0.9, strip_accents='unicode',
                     use_idf=1, smooth_idf=1, sublinear_tf=1, **kwargs):
     if tokenizer is None:
         tokenizer = tokenize
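The min_df change swaps a proportion for an absolute count: min_df=0.005 keeps only terms appearing in at least 0.5% of all comments, while min_df=5 keeps any term seen in at least 5 documents, a much looser cutoff on a large corpus. A minimal sketch (not part of the commit), assuming get_tfidf_model forwards these parameters to sklearn's TfidfVectorizer; min_df is shrunk to 2 so the toy corpus is not filtered to nothing:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['you are great', 'you are not great',
        'totally fine comment', 'rare words vanish']
# integer min_df = absolute document count, float min_df = fraction of documents
vec = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9,
                      strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1)
X = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))  # ['are', 'great', 'you', 'you are']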
@@ -109,7 +109,7 @@ def clean_comment(text, replace_misspellings=True):
     text = ud.normalize('NFD', text.encode('utf-8').decode('utf-8'))
     text = text.lower()
     text = re.sub(r'[^\x00-\x7f]', r' ', text)
-#    text = re.sub(r'[\n\r]', r' ', text)
+    text = re.sub(r'[\n\r]', r' ', text)
     s = re.sub(r"what's", "what is ", text, flags=re.IGNORECASE)
     s = re.sub(r"\'ve", " have ", s, flags=re.IGNORECASE)
     s = re.sub(r"can't", "cannot ", s, flags=re.IGNORECASE)
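A quick check (not part of the commit) of what the re-enabled newline substitution and the contraction rules shown above do to a sample string; only the three rules visible in this hunk are replayed here:

import re

text = "What's done?\nYou can't say you've won."
text = text.lower()
text = re.sub(r'[\n\r]', r' ', text)  # re-enabled: newlines become spaces
s = re.sub(r"what's", "what is ", text, flags=re.IGNORECASE)
s = re.sub(r"\'ve", " have ", s, flags=re.IGNORECASE)
s = re.sub(r"can't", "cannot ", s, flags=re.IGNORECASE)
print(s)  # what is  done? you cannot  say you have  won.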