Source code for copro.data

from copro import conflict, variables, evaluation
import click
import numpy as np
import xarray as xr
import pandas as pd
import os, sys


def initiate_XY_data(config):
    """Initiates an empty dictionary to contain the XY-data for each polygon, ie. both sample data and target data.
    This is needed for the reference run.
    By default, the first column is for the polygon ID, the second for polygon geometry.
    The antepenultimate column is for boolean information about conflict at t-1
    while the penultimate column is for boolean information about conflict at t-1 in neighboring polygons.
    The last column is for binary conflict data at t (i.e. the target data).
    Every column in between corresponds to the variables provided in the cfg-file.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Returns:
        dict: empty dictionary to be filled, containing keys for each variable (X), binary conflict data (Y) plus meta-data.
    """

    # Initialize dictionary
    # some entries are set by default, besides the ones corresponding to input data variables
    XY = {}

    # explicit dtypes avoid the pandas FutureWarning raised by dtype-less empty Series;
    # IDs and geometries are non-numeric, hence object
    XY['poly_ID'] = pd.Series(dtype=object)
    XY['poly_geometry'] = pd.Series(dtype=object)

    # one key per variable listed in the [data] section of the cfg-file
    for key in config.items('data'):
        XY[str(key[0])] = pd.Series(dtype=float)

    XY['conflict_t_min_1'] = pd.Series(dtype=bool)
    XY['conflict_t_min_1_nb'] = pd.Series(dtype=float)
    XY['conflict'] = pd.Series(dtype=bool)

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: the columns in the sample matrix used are:')
        for key in XY:
            click.echo('...{}'.format(key))

    return XY
def initiate_X_data(config):
    """Initiates an empty dictionary to contain the X-data for each polygon, ie. only sample data.
    This is needed for each time step of each projection run.
    By default, the first column is for the polygon ID and the second for polygon geometry.
    The penultimate column is for boolean information about conflict at t-1
    while the last column is for boolean information about conflict at t-1 in neighboring polygons.
    All remaining columns correspond to the variables provided in the cfg-file.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Returns:
        dict: empty dictionary to be filled, containing keys for each variable (X) plus meta-data.
    """

    # Initialize dictionary
    # some entries are set by default, besides the ones corresponding to input data variables
    X = {}

    # explicit dtypes avoid the pandas FutureWarning raised by dtype-less empty Series;
    # IDs and geometries are non-numeric, hence object
    X['poly_ID'] = pd.Series(dtype=object)
    X['poly_geometry'] = pd.Series(dtype=object)

    # one key per variable listed in the [data] section of the cfg-file
    for key in config.items('data'):
        X[str(key[0])] = pd.Series(dtype=float)

    X['conflict_t_min_1'] = pd.Series(dtype=bool)
    X['conflict_t_min_1_nb'] = pd.Series(dtype=float)

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: the columns in the sample matrix used are:')
        for key in X:
            click.echo('...{}'.format(key))

    return X
def fill_XY(XY, config, root_dir, conflict_data, polygon_gdf, out_dir):
    """Fills the (XY-)dictionary with data for each variable and conflict for each polygon for each simulation year.
    The number of rows should therefore equal to number simulation years times number of polygons.
    At end of last simulation year, the dictionary is converted to a numpy-array.

    Args:
        XY (dict): initiated, i.e. empty, XY-dictionary
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        root_dir (str): path to location of cfg-file.
        conflict_data (geo-dataframe): geo-dataframe containing the selected conflicts.
        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
        out_dir (path): path to output folder.

    Raises:
        Warning: raised if the datetime-format of the netCDF-file does not match conventions and/or supported formats.

    Returns:
        array: filled array containing the variable values (X) and binary conflict data (Y) plus meta-data.
    """

    def _extend(series, values):
        # Series.append() was deprecated and removed in pandas 2.0; pd.concat is the supported equivalent
        return pd.concat([series, pd.Series(values)], ignore_index=True)

    # go through all simulation years as specified in config-file
    model_period = np.arange(config.getint('settings', 'y_start'), config.getint('settings', 'y_end') + 1, 1)
    click.echo('INFO: reading data for period from {} to {}'.format(model_period[0], model_period[-1]))

    # look-up matrix stating which polygons neighbor each other
    neighboring_matrix = neighboring_polys(config, polygon_gdf)

    for i, sim_year in enumerate(model_period):

        if i == 0:
            # the first year only serves to provide t-1 information for the following year
            click.echo('INFO: skipping first year {} to start up model'.format(sim_year))
        else:
            click.echo('INFO: entering year {}'.format(sim_year))

            # go through all keys in dictionary
            for key, value in XY.items():

                if key == 'conflict':
                    data_list = conflict.conflict_in_year_bool(config, conflict_data, polygon_gdf, sim_year, out_dir)
                    XY[key] = _extend(value, data_list)

                elif key == 'conflict_t_min_1':
                    data_list = conflict.conflict_in_previous_year(config, conflict_data, polygon_gdf, sim_year)
                    XY[key] = _extend(value, data_list)

                elif key == 'conflict_t_min_1_nb':
                    data_list = conflict.conflict_in_previous_year(config, conflict_data, polygon_gdf, sim_year,
                                                                   check_neighbors=True, neighboring_matrix=neighboring_matrix)
                    XY[key] = _extend(value, data_list)

                elif key == 'poly_ID':
                    data_list = conflict.get_poly_ID(polygon_gdf)
                    XY[key] = _extend(value, data_list)

                elif key == 'poly_geometry':
                    data_list = conflict.get_poly_geometry(polygon_gdf, config)
                    XY[key] = _extend(value, data_list)

                else:
                    # remaining keys correspond to variables read from netCDF-files;
                    # the cfg-entry may carry extra comma-separated settings, the file name comes first
                    nc_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key)).rsplit(',')[0]
                    nc_ds = xr.open_dataset(nc_fo)

                    # the time axis can either be numeric (plain years) or a proper datetime axis
                    if (np.dtype(nc_ds.time) == np.float32) or (np.dtype(nc_ds.time) == np.float64):
                        data_list = variables.nc_with_float_timestamp(polygon_gdf, config, root_dir, key, sim_year)
                        XY[key] = _extend(value, data_list)
                    elif np.dtype(nc_ds.time) == 'datetime64[ns]':
                        data_list = variables.nc_with_continous_datetime_timestamp(polygon_gdf, config, root_dir, key, sim_year)
                        XY[key] = _extend(value, data_list)
                    else:
                        raise Warning('WARNING: this nc-file does have a different dtype for the time variable than currently supported: {}'.format(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key))))

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: all data read')

    # convert the dictionary of series to one dataframe and return its underlying numpy-array
    df_out = pd.DataFrame.from_dict(XY)

    return df_out.to_numpy()
def fill_X_sample(X, config, root_dir, polygon_gdf, proj_year):
    """Fills the X-dictionary with the sample data besides any conflict-related data for each polygon and each year.
    Used during the projection runs as the sample and conflict data need to be treated separately there.

    Args:
        X (dict): dictionary containing keys to be sampled.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        root_dir (str): path to location of cfg-file of reference run.
        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
        proj_year (int): year for which projection is made.

    Raises:
        Warning: raised if the datetime-format of the netCDF-file does not match conventions and/or supported formats.

    Returns:
        dict: dictionary containing sample values.
    """

    def _extend(series, values):
        # Series.append() was deprecated and removed in pandas 2.0; pd.concat is the supported equivalent
        return pd.concat([series, pd.Series(values)], ignore_index=True)

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: reading sample data from files')

    # go through all keys in dictionary
    for key, value in X.items():

        if key == 'poly_ID':
            data_list = conflict.get_poly_ID(polygon_gdf)
            X[key] = _extend(value, data_list)

        elif key == 'poly_geometry':
            data_list = conflict.get_poly_geometry(polygon_gdf, config)
            X[key] = _extend(value, data_list)

        # conflict-related keys are filled separately in fill_X_conflict()
        elif (key != 'conflict_t_min_1') and (key != 'conflict_t_min_1_nb'):

            # the cfg-entry may carry extra comma-separated settings, the file name comes first
            nc_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key)).rsplit(',')[0]
            nc_ds = xr.open_dataset(nc_fo)

            # the time axis can either be numeric (plain years) or a proper datetime axis
            if (np.dtype(nc_ds.time) == np.float32) or (np.dtype(nc_ds.time) == np.float64):
                data_list = variables.nc_with_float_timestamp(polygon_gdf, config, root_dir, key, proj_year)
                X[key] = _extend(value, data_list)
            elif np.dtype(nc_ds.time) == 'datetime64[ns]':
                data_list = variables.nc_with_continous_datetime_timestamp(polygon_gdf, config, root_dir, key, proj_year)
                X[key] = _extend(value, data_list)
            else:
                # BUGFIX: the original formatted an undefined name here, raising NameError instead of the intended Warning
                raise Warning('WARNING: this nc-file does have a different dtype for the time variable than currently supported: {}'.format(nc_fo))

    return X
def fill_X_conflict(X, config, conflict_data, polygon_gdf):
    """Fills the X-dictionary with the conflict data for each polygon and each year.
    Used during the projection runs as the sample and conflict data need to be treated separately there.

    Args:
        X (dict): dictionary containing keys to be sampled.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        conflict_data (dataframe): dataframe containing all polygons with conflict.
        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.

    Returns:
        dict: dictionary containing sample and conflict values.
    """

    # determine all neighbours for each polygon
    neighboring_matrix = neighboring_polys(config, polygon_gdf)

    # go through all keys in dictionary; only the conflict-related keys are filled here,
    # all other keys are left untouched (they are filled in fill_X_sample())
    for key, value in X.items():

        if key == 'conflict_t_min_1':
            data_list = conflict.read_projected_conflict(polygon_gdf, conflict_data)
            # Series.append() was deprecated and removed in pandas 2.0; pd.concat is the supported equivalent
            X[key] = pd.concat([value, pd.Series(data_list)], ignore_index=True)

        elif key == 'conflict_t_min_1_nb':
            data_list = conflict.read_projected_conflict(polygon_gdf, conflict_data,
                                                         check_neighbors=True, neighboring_matrix=neighboring_matrix)
            X[key] = pd.concat([value, pd.Series(data_list)], ignore_index=True)

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: all data read')

    return X
def split_XY_data(XY, config):
    """Separates the XY-array into array containing information about variable values (X-array or sample data)
    and conflict data (Y-array or target data).
    Thereby, the X-array also contains the information about unique identifier and polygon geometry.

    Args:
        XY (array): array containing variable values and conflict data.
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Returns:
        arrays: two separate arrays, the X-array and Y-array.
    """

    # a dataframe makes the NaN-handling below straightforward
    XY_df = pd.DataFrame(XY)

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: number of data points including missing values: {}'.format(len(XY_df)))

    # replace all missing values with 0 and go back to a plain numpy-array
    data = XY_df.fillna(0).to_numpy()

    # every column except the last one holds sample data (X);
    # the conflict column is known to come last
    X = data[:, :-1]

    # the last column holds the target data (Y), cast to integer values
    Y = data[:, -1].astype(int)

    if config.getboolean('general', 'verbose'):
        fraction_Y_1 = 100 * len(np.where(Y != 0)[0]) / len(Y)
        click.echo('DEBUG: a fraction of {} percent in the data corresponds to conflicts.'.format(round(fraction_Y_1, 2)))

    return X, Y
def neighboring_polys(config, extent_gdf, identifier='watprovID'):
    """For each polygon, determines its neighboring polygons.
    As result, a (n x n) look-up dataframe is obtained containing, where n is number of polygons in extent_gdf.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
        extent_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
        identifier (str, optional): column name in extent_gdf to be used to identify neighbors. Defaults to 'watprovID'.

    Returns:
        dataframe: look-up dataframe containing True/False statement per polygon for all other polygons.
    """

    if config.getboolean('general', 'verbose'):
        click.echo('DEBUG: determining matrix with neighboring polygons')

    # collect one single-row dataframe per polygon and concatenate once at the end;
    # DataFrame.append() was deprecated and removed in pandas 2.0, and a single
    # concat also avoids re-copying the growing frame in every iteration
    rows = []

    # go through each polygon aka water province
    for i in range(len(extent_gdf)):

        # get geometry of current polygon
        wp = extent_gdf.geometry.iloc[i]

        # check which polygons in geodataframe (i.e. all water provinces) touch the current polygon
        # also create a dataframe from result (boolean)
        # the transpose is needed so the current polygon's ID ends up as the row index
        rows.append(pd.DataFrame(extent_gdf.geometry.touches(wp), columns=[extent_gdf[identifier].iloc[i]]).T)

    # pd.concat raises on an empty list, hence the guard
    df = pd.concat(rows) if rows else pd.DataFrame()

    # replace generic indices with actual water province IDs
    df.set_index(extent_gdf[identifier], inplace=True)

    # replace generic columns with actual water province IDs
    df.columns = extent_gdf[identifier].values

    return df
def find_neighbors(ID, neighboring_matrix):
    """Filters all polygons which are actually neighbors to given polygon.

    Args:
        ID (int): ID of specific polygon under consideration.
        neighboring_matrix (dataframe): output from neighboring_polys().

    Returns:
        dataframe: dataframe containing IDs of all polygons that are actual neighbors.
    """

    # select the row belonging to the polygon under consideration and flip it,
    # so all other polygon IDs end up in the index and ID becomes the only column
    candidates = neighboring_matrix.loc[neighboring_matrix.index == ID].T

    # keep only those entries flagged True, i.e. the polygons that actually touch
    mask = candidates[ID] == True

    return candidates.loc[mask].index.values