Commit 2aa4c002 authored by mjboos

language-specific model

parent f4551d6a
@@ -16,16 +16,6 @@ from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
import json
memory = joblib.Memory(cachedir='/home/mboos/joblib')
def fit_model_and_predict(model_name, pipeline, train_X, train_y, test_X, **fit_params):
    pipeline.fit(train_X, train_y, **fit_params)
    # stack the positive-class probabilities for each label into one array
    probas = np.concatenate([proba[:,1][:,None] for proba in pipeline.predict_proba(test_X)], axis=-1)
    return probas
def fit_keras_model(train_X, train_y, model_args={}, pre_args={}, fit_args={}):
    model = models.keras_token_BiLSTM(pre_args=pre_args, **model_args)
    model.fit(train_X, train_y, **fit_args)
    return model
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
@@ -39,39 +29,27 @@ callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 128, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
train_text, train_labels = pre.load_data()
test_text, _ = pre.load_data('test.csv')
train_y, test_y = train_labels.values, test_labels.values
## token BiLSTM
def train_token_BiLSTM():
    model = models.keras_token_BiLSTM()
    model.fit(train_text, train_y, **fit_args)
    model.named_steps['BiLSTM'].load_weights(best_weights_path)
    predictions = model.predict(test_text)
    hlp.write_model(predictions)
# for now use only english as model
train_per_language = pre.load_data()
train_text, train_labels = train_per_language['en']
test_per_language = pre.load_data('test.csv')
test_text, _ = test_per_language['en']
## Glove BiLSTM
def train_glove_DNN(glove_path, **kwargs):
    # with open('../parameters/glove_bilstm.json','r') as params_file:
    #     model_args = json.load(params_file)
    # with open('../parameters/glove_bilstm_fit.json', 'r') as fit_file:
    #     fit_args = json.load(fit_file)
    # model = models.keras_glove_BiLSTM(train_text, **kwargs)
    embeddings_index = hlp.get_glove_embedding(glove_path)
    model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
    model.fit(train_text, train_y, **fit_args)
    model.model.load_weights(best_weights_path)
    predictions = model.predict(test_text)
    hlp.write_model(predictions)
train_y = train_labels.values
def train_DNN(embeddings_index, **kwargs):
    model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
    model.fit(train_text, train_y, **fit_args)
    model.model.load_weights(best_weights_path)
    predictions = model.predict(test_text)
    hlp.write_model(predictions)
    return model
def DNN_EN_to_language_dict(model_english, train_per_language, simple_for=['fr', 'de', 'es', 'it']):
    language_dict = models.make_default_language_dict()
    language_dict['en'] = model_english
    if simple_for:
        for simple_lan in simple_for:
            language_dict[simple_lan] = models.tfidf_model().fit(*train_per_language[simple_lan])
    hlp.write_model(hlp.predictions_for_language(language_dict))
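The actual per-language dispatch happens in hlp.predictions_for_language, which is not shown in this diff. As a rough sketch of the intended flow (the function name, the grouping of the test data, and the common predict interface are assumptions, not the helper's real implementation), it amounts to looking each language up in the dict and letting missing keys fall back to the default prior model:

def predictions_for_language_sketch(language_dict, test_per_language):
    # Hypothetical dispatcher: one prediction array per language group.
    # Assumes every model in the dict exposes a common predict interface.
    predictions = {}
    for language, (text, _) in test_per_language.items():
        model = language_dict[language]   # unseen languages hit the defaultdict fallback
        predictions[language] = model.predict(text)
    return predictions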
if __name__=='__main__':
    maxlen = 200
@@ -86,8 +64,9 @@ if __name__=='__main__':
    logger = CSVLogger('../logs/300_fasttext_LSTM.csv', separator=',', append=False)
    callbacks_list = [logger, checkpoint, early] #early
    fit_args['callbacks'] = callbacks_list
    train_DNN(embedding, trainable=False, maxlen=maxlen,
    DNN_EN_to_language_dict(
        train_DNN(embedding, trainable=False, maxlen=maxlen,
                  max_features=max_features, model_function=models.LSTM_dropout_model,
                  embedding_dim=embedding_dim, tokenizer=frozen_tokenizer,
                  compilation_args={'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']})
                  compilation_args={'optimizer' : 'adam', 'loss':'binary_crossentropy','metrics':['accuracy']}),
        train_per_language)
@@ -14,6 +14,7 @@ import helpers as hlp
import preprocessing as pre
import sklearn.pipeline as pipe
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
@@ -32,10 +33,13 @@ import copy
corr_dict1 = enchant.request_dict('en_US')
maketrans = string.maketrans
def make_default_language_dict(train_X, train_labels):
def make_default_language_dict(train_X=None, train_labels=None):
    '''Returns a defaultdict that can be used in predict_for_language to predict a prior'''
    from collections import defaultdict
    from sklearn.dummy import DummyClassifier
    if train_X is None or train_labels is None:
        # load_data(language=False) returns {'babel' : [text, labels]}
        _, train_labels = pre.load_data(language=False)['babel']
        train_X = np.zeros_like(train_labels)[:, :1]
    # defaultdict needs a factory callable, not a fitted estimator
    prior = DummyClassifier().fit(train_X, train_labels)
    return defaultdict(lambda: prior)
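A quick illustration of the fallback behaviour (a sketch only; it assumes the DummyClassifier prior copes with the multi-label targets that pre.load_data returns):

# Any language missing from the dict is served by the fitted prior model.
lang_dict = make_default_language_dict()
prior_model = lang_dict['some_unseen_language']        # produced by the defaultdict factory
print(prior_model.predict(np.zeros((3, 1))))           # prior labels for three dummy samples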
def text_to_word_sequence(text,
@@ -81,12 +85,12 @@ class NBMLR(BaseEstimator):
def tfidf_model(pre_args={'ngram_range' : (1,2), 'tokenizer' : None,
                          'min_df' : 3, 'max_df' : 0.9, 'strip_accents' : 'unicode',
                          'use_idf' : 1, 'smooth_idf' : 1, 'sublinear_tf' : 1},
                estimator_args={'n_estimators' : 150}, model_func=None):
                estimator_args={}, model_func=None):
    '''Returns unfitted tfidf_NBSVM pipeline object'''
    if model_func is None:
        model_func = GradientBoostingClassifier
    return pipe.Pipeline(memory=memory, steps=[('tfidf', TfidfVectorizer(**pre_args)),
                                               ('model', MultiOutputClassifier(model_func(**estimator_args)))])
        model_func = NBMLR
    return pipe.Pipeline(steps=[('tfidf', TfidfVectorizer(**pre_args)),
                                ('model', model_func(**estimator_args))])
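For reference, the returned pipeline is used elsewhere in this commit roughly as follows (a sketch; it assumes NBMLR implements the usual fit/predict_proba estimator interface):

# Illustrative usage, mirroring the calls in the training scripts.
simple_model = tfidf_model()                    # TfidfVectorizer -> NBMLR pipeline
simple_model.fit(train_text, train_y)           # raw comment strings plus the label matrix
probabilities = simple_model.predict_proba(test_text)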
def keras_token_model(model_function=None, max_features=20000, maxlen=100, embed_size=128):
    if model_function is None:
......
@@ -49,13 +49,13 @@ def load_data(name='train.csv', preprocess=True, language=True):
    if preprocess:
        data = data_preprocessing(data)
    if language:
        languages = pd.read_csv('language_{}'.format(name))
        languages = pd.read_csv('language_{}'.format(name), header=None).squeeze()
        grouped_data = data.groupby(by=lambda x : languages[x])
        data_dict = { language : [data_language['comment_text'], data_language.iloc[:, 2:]]
        data_dict = { language : [data['comment_text'], data.iloc[:, 2:].values]
                      for language, data in grouped_data }
    else:
        text = data['comment_text']
        labels = data.iloc[:, 2:]
        labels = data.iloc[:, 2:].values
        data_dict = {'babel' : [text, labels]}
    return data_dict
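The returned dict is keyed by language code when language=True and by the single key 'babel' otherwise, so callers unpack it per language. A minimal sketch of the consuming side (variable names are illustrative):

# Per-language loading, as the training scripts do for English.
data_per_language = load_data('train.csv')
en_text, en_labels = data_per_language['en']            # comment strings and label matrix

# Language-agnostic loading: everything sits under the 'babel' key.
all_text, all_labels = load_data(language=False)['babel']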
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
import helpers as hlp
import models
import preprocessing as pre
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
import json
memory = joblib.Memory(cachedir='/home/mboos/joblib')
best_weights_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
def schedule(ind):
    a = [0.002,0.002,0.002,0.001,0.001]
    return a[ind]
lr = LearningRateScheduler(schedule)
callbacks_list = [checkpoint, early] #early
fit_args = {'batch_size' : 128, 'epochs' : 20,
'validation_split' : 0.2, 'callbacks' : callbacks_list}
# for now use only english as model
train_per_language = pre.load_data()
train_text, train_y = train_per_language['en']
test_per_language = pre.load_data('test.csv')
test_text, _ = test_per_language['en']
def train_DNN(embeddings_index, **kwargs):
    model = models.Embedding_Blanko_DNN(embeddings_index=embeddings_index, **kwargs)
    model.fit(train_text, train_y, **fit_args)
    model.model.load_weights(best_weights_path)
    return model
def DNN_EN_to_language_dict(model_english, train_per_language, simple_for=['fr', 'de', 'es', 'it']):
    language_dict = models.make_default_language_dict()
    language_dict['en'] = model_english
    if simple_for:
        for simple_lan in simple_for:
            language_dict[simple_lan] = models.tfidf_model().fit(*train_per_language[simple_lan])
    hlp.write_model(hlp.predictions_for_language(language_dict))
if __name__=='__main__':
    maxlen = 200
    max_features = 500000
    frozen_tokenizer = pre.KerasPaddingTokenizer(max_features=max_features, maxlen=maxlen)
    frozen_tokenizer.fit(pd.concat([train_text, test_text]))
    english_model = models.tfidf_model().fit(train_text, train_y)
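The __main__ block is cut off at this point; judging from the companion script above, the remaining steps would train the English DNN and wrap it into the per-language dict, roughly like this (the embedding path and dimension are assumptions, not part of this diff):

    # Sketch of the remaining steps, mirroring the first script's __main__ block.
    embedding = hlp.get_glove_embedding('../glove.840B.300d.txt')   # path is illustrative
    DNN_EN_to_language_dict(
            train_DNN(embedding, trainable=False, maxlen=maxlen,
                      max_features=max_features, model_function=models.LSTM_dropout_model,
                      embedding_dim=300, tokenizer=frozen_tokenizer,   # 300-d vectors assumed
                      compilation_args={'optimizer' : 'adam', 'loss' : 'binary_crossentropy', 'metrics' : ['accuracy']}),
            train_per_language)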