Source code for copro.machine_learning

import os
import pickle
import pandas as pd
import numpy as np
from sklearn import svm, neighbors, ensemble, preprocessing, model_selection, metrics
from copro import conflict, data

def define_scaling(config):
    """Defines scaling method based on model configurations.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Raises:
        ValueError: raised if a non-supported scaling method is specified.

    Returns:
        scaler: the specified scaling method instance.
    """

    if config.get('machine_learning', 'scaler') == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()
    elif config.get('machine_learning', 'scaler') == 'StandardScaler':
        scaler = preprocessing.StandardScaler()
    elif config.get('machine_learning', 'scaler') == 'RobustScaler':
        scaler = preprocessing.RobustScaler()
    elif config.get('machine_learning', 'scaler') == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer(random_state=42)
    else:
        raise ValueError('no supported scaling-algorithm selected - choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer')

    if config.getboolean('general', 'verbose'):
        print('DEBUG: chosen scaling method is {}'.format(scaler))

    return scaler
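# Usage sketch (illustrative addition, not part of the original module):
# builds a minimal in-memory configuration instead of reading a cfg-file
# from disk; the section and option values below are assumptions chosen
# to exercise the functions in this module.
from configparser import RawConfigParser

example_cfg = RawConfigParser()
example_cfg.read_dict({
    'general': {'verbose': 'True'},
    'machine_learning': {'scaler': 'MinMaxScaler',
                         'model': 'RFClassifier',
                         'train_fraction': '0.7',
                         'n_runs': '1'},
})
example_scaler = define_scaling(example_cfg)  # -> preprocessing.MinMaxScaler()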
def define_model(config):
    """Defines model based on model configurations. Model parameters were optimized beforehand using GridSearchCV.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Raises:
        ValueError: raised if a non-supported model is specified.

    Returns:
        classifier: the specified model instance.
    """

    if config.get('machine_learning', 'model') == 'NuSVC':
        clf = svm.NuSVC(nu=0.1, kernel='rbf', class_weight={1: 100}, probability=True, degree=10, gamma=10, random_state=42)
    elif config.get('machine_learning', 'model') == 'KNeighborsClassifier':
        clf = neighbors.KNeighborsClassifier(n_neighbors=10, weights='distance')
    elif config.get('machine_learning', 'model') == 'RFClassifier':
        clf = ensemble.RandomForestClassifier(n_estimators=1000, class_weight={1: 100}, random_state=42)
    else:
        raise ValueError('no supported ML model selected - choose between NuSVC, KNeighborsClassifier or RFClassifier')

    if config.getboolean('general', 'verbose'):
        print('DEBUG: chosen ML model is {}'.format(clf))

    return clf
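# Usage sketch (continuing the example configuration above): with
# 'model = RFClassifier' set in the cfg, the pre-tuned random forest
# is returned.
example_clf = define_model(example_cfg)  # -> ensemble.RandomForestClassifier(...)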
def split_scale_train_test_split(X, Y, config, scaler):
    """Splits and transforms the X-array (or sample data) and Y-array (or target data) into test-data and training-data. The fraction of data used for the split is specified in the configuration file. Additionally, the unique identifier and geometry of each data point in both test-data and training-data are retrieved in separate arrays.

    Args:
        X (array): array containing the variable values plus unique identifier and geometry information.
        Y (array): array containing merely the binary conflict classifier data.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        scaler (scaler): the specified scaling method instance.

    Returns:
        arrays: arrays containing training-set and test-set for X-data and Y-data as well as IDs and geometry.
    """

    # separate arrays for ID, geometry, and variable values
    X_ID, X_geom, X_data = conflict.split_conflict_geom_data(X)

    # scale only the variable values
    if config.getboolean('general', 'verbose'):
        print('DEBUG: fitting and transforming X')
    X_ft = scaler.fit_transform(X_data)

    # combine ID, geometry and scaled sample values per polygon
    X_cs = np.column_stack((X_ID, X_geom, X_ft))

    # split in train and test samples based on user-specified fraction
    if config.getboolean('general', 'verbose'):
        print('DEBUG: splitting both X and Y in train and test data')
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X_cs, Y, test_size=1 - config.getfloat('machine_learning', 'train_fraction'))

    # for training-set and test-set, split in ID, geometry, and values
    X_train_ID, X_train_geom, X_train = conflict.split_conflict_geom_data(X_train)
    X_test_ID, X_test_geom, X_test = conflict.split_conflict_geom_data(X_test)

    return X_train, X_test, y_train, y_test, X_train_geom, X_test_geom, X_train_ID, X_test_ID
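# Usage sketch (illustrative, with synthetic data): the X-array is assumed
# to carry the unique polygon ID in its first column and the geometry
# object in its second, followed by the variable values (see
# conflict.split_conflict_geom_data()).
n_polygons = 100
X_example = np.column_stack((
    np.arange(n_polygons),                    # unique identifiers
    np.full(n_polygons, None, dtype=object),  # placeholder geometries
    np.random.rand(n_polygons, 3),            # three synthetic variable values
))
Y_example = np.tile([0, 1], n_polygons // 2)  # synthetic binary conflict data

X_train, X_test, y_train, y_test, X_train_geom, X_test_geom, X_train_ID, X_test_ID = \
    split_scale_train_test_split(X_example, Y_example, example_cfg, example_scaler)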
def fit_predict(X_train, y_train, X_test, clf, config, out_dir, run_nr):
    """Fits the classifier based on training-data and makes predictions. The fitted classifier is dumped to file with pickle to be used again during projections. Predictions are made with test-data, including the probabilities of those predictions.

    Args:
        X_train (array): training-data of variable values.
        y_train (array): training-data of conflict data.
        X_test (array): test-data of variable values.
        clf (classifier): the specified model instance.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        out_dir (path): path to output folder.
        run_nr (int): number of fit/predict repetition and created classifier.

    Returns:
        arrays: arrays including the predictions made and their probabilities.
    """

    # fit the classifier with training data
    clf.fit(X_train, y_train)

    # create folder to store all classifiers with pickle
    clf_pickle_rep = os.path.join(out_dir, 'clfs')
    if not os.path.isdir(clf_pickle_rep):
        os.makedirs(clf_pickle_rep)

    # save the fitted classifier to file via pickle.dump()
    if config.getboolean('general', 'verbose'):
        print('DEBUG: dumping classifier to {}'.format(clf_pickle_rep))
    with open(os.path.join(clf_pickle_rep, 'clf_{}.pkl'.format(run_nr)), 'wb') as f:
        pickle.dump(clf, f)

    # make prediction
    y_pred = clf.predict(X_test)
    # make prediction of probability
    y_prob = clf.predict_proba(X_test)

    return y_pred, y_prob
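# Usage sketch (continuing the synthetic example above): fits the example
# classifier, pickles it to the hypothetical output folder 'example_out'
# as 'clfs/clf_1.pkl', and predicts on the held-out test samples.
y_pred, y_prob = fit_predict(X_train, y_train, X_test,
                             example_clf, example_cfg, 'example_out', run_nr=1)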
def pickle_clf(scaler, clf, config, root_dir):
    """(Re)fits a classifier with all available data and pickles it.

    Args:
        scaler (scaler): the specified scaling method instance.
        clf (classifier): the specified model instance.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        root_dir (str): path to location of cfg-file.

    Returns:
        classifier: classifier fitted with all available data.
    """

    print('INFO: fitting the classifier with all data from reference period')

    # read XY-data
    # if nothing is specified in the cfg-file, load from the output directory
    if config.get('pre_calc', 'XY') == '':
        if config.getboolean('general', 'verbose'):
            print('DEBUG: loading XY data from {}'.format(os.path.join(root_dir, config.get('general', 'output_dir'), '_REF', 'XY.npy')))
        XY_fit = np.load(os.path.join(root_dir, config.get('general', 'output_dir'), '_REF', 'XY.npy'), allow_pickle=True)
    # if a path is specified, load from there
    else:
        if config.getboolean('general', 'verbose'):
            print('DEBUG: loading XY data from {}'.format(os.path.join(root_dir, config.get('pre_calc', 'XY'))))
        XY_fit = np.load(os.path.join(root_dir, config.get('pre_calc', 'XY')), allow_pickle=True)

    # split in X and Y data
    X_fit, Y_fit = data.split_XY_data(XY_fit, config)
    # split X in ID, geometry, and values
    X_ID_fit, X_geom_fit, X_data_fit = conflict.split_conflict_geom_data(X_fit)
    # scale values
    X_ft_fit = scaler.fit_transform(X_data_fit)

    # fit classifier with values
    clf.fit(X_ft_fit, Y_fit)

    return clf
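# Usage sketch (commented out, since it requires a previously saved XY.npy
# on disk): 'path/to/project' stands in for the hypothetical location of
# the cfg-file, and 'OUT' for the output directory configured there.
#
#   example_cfg.add_section('pre_calc')
#   example_cfg.set('pre_calc', 'XY', '')
#   example_cfg.set('general', 'output_dir', 'OUT')
#   clf_fitted = pickle_clf(example_scaler, example_clf, example_cfg,
#                           root_dir='path/to/project')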
def load_clfs(config, out_dir):
    """Loads the file names of all previously fitted classifiers to a list. Classifiers were saved to file in fit_predict(). With this list, the classifiers can be loaded again during projections.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        out_dir (path): path to output folder.

    Returns:
        list: list with file names of classifiers.
    """

    clfs = os.listdir(os.path.join(out_dir, 'clfs'))

    assert len(clfs) == config.getint('machine_learning', 'n_runs'), \
        'ERROR: number of loaded classifiers does not match the specified number of runs in cfg-file!'

    return clfs
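# Usage sketch (continuing the example above, and assuming 'example_out/clfs'
# contains only the single classifier pickled by the fit_predict() sketch):
# collect the file names and re-load one classifier for a projection run.
clfs = load_clfs(example_cfg, 'example_out')
with open(os.path.join('example_out', 'clfs', clfs[0]), 'rb') as f:
    clf_reloaded = pickle.load(f)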