import os
import pickle
import pandas as pd
import numpy as np
from sklearn import svm, neighbors, ensemble, preprocessing, model_selection, metrics
from copro import conflict, data
def define_scaling(config):
    """Select the feature scaler named in the model configuration.

    The scaler name is read from option ``scaler`` in section
    ``machine_learning`` of the configuration.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Raises:
        ValueError: raised if a non-supported scaling method is specified.

    Returns:
        scaler: a newly created instance of the requested scaling method.
    """
    requested = config.get('machine_learning', 'scaler')

    # Each entry is a zero-argument factory, so only the requested scaler
    # is ever instantiated.
    factories = {
        'MinMaxScaler': lambda: preprocessing.MinMaxScaler(),
        'StandardScaler': lambda: preprocessing.StandardScaler(),
        'RobustScaler': lambda: preprocessing.RobustScaler(),
        # fixed seed keeps the quantile estimation reproducible between runs
        'QuantileTransformer': lambda: preprocessing.QuantileTransformer(random_state=42),
    }

    if requested not in factories:
        raise ValueError('no supported scaling-algorithm selected - choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer')

    scaler = factories[requested]()

    if config.getboolean('general', 'verbose'):
        print('DEBUG: chosen scaling method is {}'.format(scaler))

    return scaler
def define_model(config):
    """Select the classifier named in the model configuration.

    The model name is read from option ``model`` in section
    ``machine_learning``. The hyper-parameters below are hard-coded; they were
    optimized beforehand using GridSearchCV.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Raises:
        ValueError: raised if a non-supported model is specified.

    Returns:
        classifier: a newly created instance of the requested model.
    """
    requested = config.get('machine_learning', 'model')

    # Each entry is a zero-argument factory, so only the requested model
    # is ever instantiated.
    factories = {
        'NuSVC': lambda: svm.NuSVC(
            nu=0.1,
            kernel='rbf',
            class_weight={1: 100},
            probability=True,
            degree=10,
            gamma=10,
            random_state=42,
        ),
        'KNeighborsClassifier': lambda: neighbors.KNeighborsClassifier(
            n_neighbors=10,
            weights='distance',
        ),
        'RFClassifier': lambda: ensemble.RandomForestClassifier(
            n_estimators=1000,
            class_weight={1: 100},
            random_state=42,
        ),
    }

    if requested not in factories:
        raise ValueError('no supported ML model selected - choose between NuSVC, KNeighborsClassifier or RFClassifier')

    clf = factories[requested]()

    if config.getboolean('general', 'verbose'):
        print('DEBUG: chosen ML model is {}'.format(clf))

    return clf
def split_scale_train_test_split(X, Y, config, scaler):
    """Scale the sample values and split samples and targets into train and test sets.

    The variable values in X are scaled first; the scaled values are then
    re-joined with their identifier and geometry columns so that all three stay
    aligned through the random split. Afterwards the train and test parts are
    each separated again into identifier, geometry and values.
    The share of data used for training is read from option ``train_fraction``
    in section ``machine_learning``.

    Args:
        X (array): array containing the variable values plus unique identifer and geometry information.
        Y (array): array containing merely the binary conflict classifier data.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        scaler (scaler): the specified scaling method instance; it is fitted in place by this function.

    Returns:
        arrays: in this order - training values, test values, training targets, test targets,
        training geometries, test geometries, training IDs, test IDs.
    """
    # pull identifier and geometry apart from the actual variable values
    ids, geoms, values = conflict.split_conflict_geom_data(X)

    # NOTE(review): the scaler is fitted on ALL samples before the split, so
    # test samples influence the scaling parameters - confirm this is intended.
    if config.getboolean('general', 'verbose'):
        print('DEBUG: fitting and transforming X')
    values_scaled = scaler.fit_transform(values)

    # glue ID, geometry and scaled values back together per sample
    samples = np.column_stack((ids, geoms, values_scaled))

    if config.getboolean('general', 'verbose'):
        print('DEBUG: splitting both X and Y in train and test data')
    held_out_fraction = 1 - config.getfloat('machine_learning', 'train_fraction')
    train_part, test_part, y_train, y_test = model_selection.train_test_split(
        samples,
        Y,
        test_size=held_out_fraction,
    )

    # separate ID, geometry and values again for both subsets
    X_train_ID, X_train_geom, X_train = conflict.split_conflict_geom_data(train_part)
    X_test_ID, X_test_geom, X_test = conflict.split_conflict_geom_data(test_part)

    return X_train, X_test, y_train, y_test, X_train_geom, X_test_geom, X_train_ID, X_test_ID
def fit_predict(X_train, y_train, X_test, clf, config, out_dir, run_nr):
    """Fits classifier based on training-data and makes predictions.

    The fitted classifier is dumped with pickle to ``<out_dir>/clfs/clf_<run_nr>.pkl``
    so it can be used again during projections.
    Makes prediction with test-data including probabilities of those predictions.

    Args:
        X_train (array): training-data of variable values.
        y_train (array): training-data of conflict data.
        X_test (array): test-data of variable values.
        clf (classifier): the specified model instance; it is fitted in place.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        out_dir (path): path to output folder.
        run_nr (int): number of fit/predict repetition and created classifier.

    Returns:
        arrays: the predictions made for X_test (``clf.predict``) and their
        probabilities (``clf.predict_proba``), in that order.
    """
    # fit the classifier with training data
    clf.fit(X_train, y_train)

    # folder collecting the pickled classifiers of all runs;
    # exist_ok=True avoids the check-then-create race of isdir() + makedirs()
    clf_pickle_rep = os.path.join(out_dir, 'clfs')
    os.makedirs(clf_pickle_rep, exist_ok=True)

    # save the fitted classifier to file via pickle.dump()
    if config.getboolean('general', 'verbose'):
        print('DEBUG: dumping classifier to {}'.format(clf_pickle_rep))
    with open(os.path.join(clf_pickle_rep, 'clf_{}.pkl'.format(run_nr)), 'wb') as f:
        pickle.dump(clf, f)

    # class predictions and the associated probabilities for the test samples
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)

    return y_pred, y_prob
def pickle_clf(scaler, clf, config, root_dir):
    """(Re)fits a classifier with all available data.

    The XY-data is loaded from the path given by option ``XY`` in section
    ``pre_calc`` (relative to root_dir). If that option is empty, the file
    ``XY.npy`` in folder ``_REF`` of the output directory is used instead.
    Despite its name, this function does not write anything to disk itself;
    it only returns the fitted classifier.

    Args:
        scaler (scaler): the specified scaling method instance; it is (re)fitted in place.
        clf (classifier): the specified model instance; it is (re)fitted in place.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        root_dir (str): path to location of cfg-file.

    Returns:
        classifier: classifier fitted with all available data.
    """
    print('INFO: fitting the classifier with all data from reference period')

    # Compare by value: the previous identity test (``is ''``) only worked by
    # accident of string interning and raises a SyntaxWarning on Python >= 3.8.
    xy_setting = config.get('pre_calc', 'XY')
    if xy_setting == '':
        # nothing specified in cfg-file, load from output directory
        xy_path = os.path.join(root_dir, config.get('general', 'output_dir'), '_REF', 'XY.npy')
    else:
        # a path is specified, load from there
        xy_path = os.path.join(root_dir, xy_setting)

    if config.getboolean('general', 'verbose'):
        print('DEBUG: loading XY data from {}'.format(xy_path))
    # NOTE: allow_pickle=True can execute arbitrary code while loading - only
    # point this at XY files from a trusted source.
    XY_fit = np.load(xy_path, allow_pickle=True)

    # split in X and Y data
    X_fit, Y_fit = data.split_XY_data(XY_fit, config)

    # split X in ID, geometry, and values; only the values are needed here
    X_ID_fit, X_geom_fit, X_data_fit = conflict.split_conflict_geom_data(X_fit)

    # scale values
    X_ft_fit = scaler.fit_transform(X_data_fit)

    # fit classifier with values
    clf.fit(X_ft_fit, Y_fit)

    return clf
def load_clfs(config, out_dir):
    """Loads the file names of all previously fitted classifiers to a list.

    Classifiers were saved to file in fit_predict().
    With this list, the classifiers can be loaded again during projections.
    The number of entries found in ``<out_dir>/clfs`` must equal option
    ``n_runs`` in section ``machine_learning``.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        out_dir (path): path to output folder.

    Raises:
        AssertionError: raised if the number of classifiers found differs from the specified number of runs.

    Returns:
        list: list with file names of classifiers.
    """
    clfs = os.listdir(os.path.join(out_dir, 'clfs'))

    # The former ``assert (a, b), msg`` tested a non-empty tuple and therefore
    # never failed. Compare the two numbers for real, and raise explicitly so
    # the check also survives ``python -O``.
    if len(clfs) != config.getint('machine_learning', 'n_runs'):
        raise AssertionError('ERROR: number of loaded classifiers does not match the specified number of runs in cfg-file!')

    return clfs