# Source code for netsim.generate

"""
This module contains all functions needed to set up a network simulation.

"""

import numpy as np
import pandas as pd
import geopandas as gpd
from collections import OrderedDict
from itertools import permutations, product
from math import factorial


# constants used in generate

#: number of randomized samples drawn for a group that is iterated by sampling.
#: Also the threshold below which the number of permutations of a group is
#: small enough for all of them to be generated instead.
NSAMPLES = 100

#: maximum number of points up to which permutations are considered; groups
#: with more points than this are always sampled.
MAX_PERMUTATION_NUM = 7

#: groups with more points than this may be sampled; groups with this many
#: points or fewer are always permuted.
MIN_NUM_SAMPLE = 3

#: maximum total number of iterations allowed in the simulation
#: (NOTE(review): not referenced by the functions visible in this module - confirm where it is enforced).
MAX_ITERATIONS = 5000



def check(df):
    '''
    Checks and corrects input geo/pandas dataframe.

    Parameters
    ----------

    df: geo/dataframe
        contains locations used in the simulation

    Returns
    -------
    c_df: dataframe
        corrected geo/dataframe ready for netsim. Column names are lower-cased and
        missing *id*, *group* and *seq* columns are added. The input is not modified.

    Raises
    ------
    ValueError
        if the *id* values are not unique, or if the *seq* values of any group are
        neither all equal to 1 nor all distinct.

    Notes
    -----

    This function checks for the existence of the columns needed in the *netsim* simulation. Columns must have the appropriate header (as shown below).
    Columns that do not exist will be generated and populated with default values. The validity of values in existing columns is also checked. Minor
    errors and corrections are notified. Major errors raise an exception.

    The following columns need to be present:

    - *id*: exclusive identifier for each location. Created as a running number starting at 0 when missing.
    - *group*: identifies a location as being part of a specific group. Groups can be of any size. A column with a single group
      affiliation will be created in case this column does not exist (default value is 1).
    - *seq*: identifies rank/ordering of a location within a group. There are two possible scenarios:

      - *No ordering/ ranking (default)*. Within each group the value 1 is used for all sites in that group. Depending on the number of locations in the
        group the simulation will either generate all possible permutations or a number of randomized samples (with repetition).
      - *Ordering /ranking*. Identify all sites in a group with distinct numbers (no repetitions).

    '''

    # rename() returns a new dataframe, so the caller's dataframe stays untouched.
    # Non-string labels are kept as they are instead of crashing str.lower().
    c_df = df.rename(lambda name: name.lower() if isinstance(name, str) else name,
                     axis='columns')

    # collect basic information from dataframe
    nrows = len(c_df)
    colnames = list(c_df)

    # initialize message
    msg = []
    error_flag = False

    # id
    if 'id' not in colnames:
        # create an id column; int32 keeps ids unique beyond 32767 rows (int16 would not)
        c_df['id'] = pd.Series(np.arange(nrows, dtype=np.int32), index=c_df.index)
        msg.append('id column - created id column with sequential ids !')
    elif nrows != len(c_df['id'].unique()):
        # any row with the same id?
        msg.append('\n ERROR: id column - ids are not unique !!!')
        error_flag = True

    # group
    if 'group' not in colnames:
        # create group column with default (1 for single group)
        c_df['group'] = pd.Series(np.full(nrows, 1), index=c_df.index)
        msg.append('group column - created group column with single group !')

    # seq
    if 'seq' not in colnames:
        # create a seq column with default value
        c_df['seq'] = pd.Series(np.full(nrows, 1), index=c_df.index)
        msg.append('seq column - created sequence with no sequence (default 1.) !')
    else:
        # every group must be either unordered (all 1) or fully ranked (all distinct)
        for g in c_df['group'].unique():
            grp_seq = c_df.loc[c_df['group'] == g, 'seq']
            all_one = (grp_seq == 1).all()
            unique_sequence = len(grp_seq) == len(grp_seq.unique())
            if not (all_one or unique_sequence):
                error_flag = True
                msg.append('\n ERROR: seq column - sequence for group '+str(g)+' is not 1 or sequential!')

    # print messages
    if msg:
        for m in msg:
            print('\n'+m)

        # a bare string cannot be raised in Python; use a real exception type
        if error_flag:
            raise ValueError('\nCheck errors !!! Network simulation ABORTED!! ')
    else:
        print('\n No corrections or errors !! ')

    return c_df



def __shuffle(arr, nsamples):
    '''
    Generator function used to create randomized samples.

    Parameters
    ----------
    arr: sequence
        values to be randomized (list, tuple or 1D numpy array). The values are
        copied first, so the caller's sequence is never reordered.

    nsamples: int
        number of randomized samples we want the generator to produce

    Yields
    ------

    sample: tuple
        the values of *arr* in a random order

    Notes
    -----
    Internal function called by create_network_generator().

    Each sample is obtained by shuffling the previous one with numpy's global
    random state (``np.random.shuffle``), so the same ordering may be produced
    more than once.

    '''

    # private working copy: shuffling happens in place on this list only
    pool = list(arr)

    for _ in range(nsamples):
        np.random.shuffle(pool)
        yield tuple(pool)


def _unordered_group_plan(ids):
    '''
    Chooses how a group without internal ordering is iterated.

    Parameters
    ----------
    ids: list
        identifiers of the locations in the group

    Returns
    -------
    iter_type: string
        'permutation' or 'sample'

    num_iter: int
        number of orderings the returned iterable produces

    orderings: iterable
        iterable of tuples, each one an ordering of *ids*
    '''
    num_pts = len(ids)

    # groups that are too small to sample, or whose permutations are fewer than
    # the number of samples that would be drawn, are enumerated exhaustively
    too_small_to_sample = num_pts <= MIN_NUM_SAMPLE
    few_permutations = num_pts <= MAX_PERMUTATION_NUM and factorial(num_pts) < NSAMPLES

    if too_small_to_sample or few_permutations:
        return 'permutation', factorial(num_pts), permutations(ids)

    return 'sample', NSAMPLES, __shuffle(ids, NSAMPLES)


def create_network_generator(df):
    '''
    Generates network generator.

    Parameters
    ----------
    df: geo/dataframe
        contains locations, group membership and parameters used in netsim.
        The columns *id*, *group* and *seq* are read.

    Returns
    -------

    netgentor: iterator
        network generator. Each item is a tuple holding one tuple of location ids per group
        (groups taken in increasing order of their *group* value)

    df_net_info: dataframe
        contains information about each network (see *Notes* below)

    total_num_iter: int
        total number of iterations

    Notes
    -----

    This function returns three different outputs:

       1. It primarily returns a network generator that results from the cartesian product of separate generators
          (one per group of locations as identified thru the *group* column). Each, *group*, generator may be of one of these
          three types:

          - *single*: returns always the same combination of locations. Used when every location in the group has a
            distinct *seq* value; the locations are listed in increasing *seq* order.
          - *sample*: returns a shuffled version of the locations. *n.b.* repetition can occur.
          - *permutation*: returns permutation of the locations (no repetition).
       2. Generates a dataframe, *df_net_info*, containing generator information for each group (number of locations, total
          number of iterations, generator type).
       3. Total number of iterations that results from the combination of all group generators.

    The per-group summary and the total number of iterations are also printed.

    '''

    groups = df['group'].sort_values().unique()                          # distinct groups, in increasing order

    netgentor = []                                                        # one iterable of orderings per group
    net_info = {'group': [], 'num_loc': [], 'num_iter': [], 'iter_type': []}
    total_num_iter = 1

    for grp in groups:

        df_grp = df.loc[df['group'] == grp]                               # select pts with the same group id
        num_pts_in_grp = len(df_grp)                                      # number of points in group
        num_distinct_seq = len(df_grp['seq'].unique())                    # number of distinct seq values in group

        if num_pts_in_grp == num_distinct_seq:
            # fully ranked group: a single ordering, given by the seq column
            # (not by the order in which rows happen to appear in df)
            indxs = list(df_grp.sort_values('seq')['id'])
            iter_type, num_iter_grp, orderings = 'single', 1, [tuple(indxs)]
        else:
            # unordered group: permute or sample its locations
            indxs = list(df_grp['id'])
            iter_type, num_iter_grp, orderings = _unordered_group_plan(indxs)

        net_info['group'].append(grp)
        net_info['num_loc'].append(num_pts_in_grp)
        net_info['num_iter'].append(num_iter_grp)
        net_info['iter_type'].append(iter_type)
        netgentor.append(orderings)

        total_num_iter = total_num_iter * num_iter_grp                    # update total number of iterations

    df_net_info = pd.DataFrame(net_info)
    print('\n iteration broken per group....\n')
    print(df_net_info)
    print('\n total number of iterations....',total_num_iter)

    # generator for network iteration, iteration info and total number of iterations
    return product(*netgentor), df_net_info, total_num_iter

def _ring_pairs(ids, twoway):
    '''
    Returns the (origin, destination) pairs that connect *ids* in a closed loop:
    first to second, second to third, ... and last back to first.
    '''
    # a loop over two locations already holds both directions, so adding the
    # reverse paths there would only duplicate them
    add_reverse = bool(twoway) and len(ids) > 2

    pairs = []
    for o, d in zip(ids, ids[1:] + ids[:1]):
        pairs.append((o, d))
        if add_reverse:
            pairs.append((d, o))
    return pairs


def _cross_pairs(origins, destinations, twoway):
    '''
    Returns the (origin, destination) pairs from every origin to every destination.
    '''
    pairs = []
    for o in origins:
        for d in destinations:
            pairs.append((o, d))
            if twoway:
                pairs.append((d, o))
    return pairs


def _all_pairs(ids, twoway):
    '''
    Returns one (origin, destination) pair for every unordered pair of *ids*.
    '''
    pairs = []
    for pos, o in enumerate(ids[:-1]):
        for d in ids[pos + 1:]:
            pairs.append((o, d))
            if twoway:
                pairs.append((d, o))
    return pairs


def network_layout(df, iteration, iter_num, df_net= None, twoway= False, opt= 'close'):
    '''
    Creates a dataframe with information for each path in network.

    Parameters
    ----------

    df: dataframe
         contains list of locations, group membership used to generate a network.
         Only the *group* column is read, to count the groups.

    iteration: tuple of tuples
        a tuple containing one or several tuples, one per group, representing a single network iteration

    iter_num: int
        iteration identifier

    df_net: dataframe, optional
        contains the identifiers of the origin and destination of each path plus the iteration identifier.
        When given, new paths are appended to it in place; when omitted, an empty one is created.

    twoway: boolean
        if **True** two-way paths are generated for each pair of locations in a network. *Default*: **False**.

    opt: string
        type of network to generate. The options are as follows:

        - *close*: This option defines an independent close network of paths for each group. In this network, the locations are connected
          in order so that the first location is connected to the second one and so on until the last location is connected to the first.
          No path is defined (and a warning is printed) for a group made of a single location.
        - *central*: This option defines a network that consists of a centralized set of paths from the locations of the
          remaining groups to all of the locations in the first group.
        - *decentral*: This option defines a network of paths so that the locations of each group are connected to the locations of the
          previous (higher level) group.
        - *distributed*: This option defines a network of paths similar to *decentral*, where the locations of each group are connected to
          the locations of the previous (higher level) group, and in addition, locations within each group are interconnected.
        - *all*: This option defines a network of paths from amongst all locations.

        In *central*, *decentral* and *distributed* the locations of the first group are also connected to each other in a closed loop
        when that group has more than one location. When there is a single group, every option other than *close* behaves as *all*.

    Returns
    -------

    df_net: dataframe
      contains the identifiers of the origin and destination of each path plus the iteration identifier

    Raises
    ------
    Exception
        if *opt* is not one of the options listed above.

    Notes
    -----

    This function takes a dataframe with locations, a tuple of tuples containing an ordering of these locations obtained after running
    ``create_network_generator()`` function and an iteration identifier number. It generates, or updates, the *df_net* dataframe with the
    identifiers of the origin and destination of each path that make up the path network for this specific iteration.

    With *twoway*, the reverse of a closed-loop path is only added for groups of more than two locations, because the loop
    over two locations already contains both directions.

    '''

    # option valid?
    options = ['close', 'central', 'decentral','distributed', 'all']
    if opt not in options:
        raise Exception("'opt' not valid!!! Choose from {}".format(options))

    # create df_net?
    if df_net is None:
        net = {'origin': 'int32', 'destination': 'int32', 'iteration': 'int32'}
        df_net = pd.DataFrame(columns=list(net.keys())).astype(net)

    # distinct groups
    n_groups = len(df['group'].unique())
    iteration = [list(g) for g in iteration]

    pairs = []

    if opt == 'close':

        # independent closed loop inside every group
        for i in range(n_groups):
            members = iteration[i]
            if len(members) > 1:
                pairs.extend(_ring_pairs(members, twoway))
            else:
                print('\nWARNING: No path calculated for group number {} (size = {})'.format(i + 1, len(members)))

    elif n_groups == 1:

        # with a single group all other options are treated as 'all'
        pairs.extend(_all_pairs(iteration[0], twoway))

    elif opt == 'all':

        # define paths from all locations to all other locations
        everyone = [loc for members in iteration for loc in members]
        pairs.extend(_all_pairs(everyone, twoway))

    else:  # 'central', 'decentral' or 'distributed'

        # first group: closed loop if it has more than one location
        first = iteration[0]
        if len(first) > 1:
            pairs.extend(_ring_pairs(first, twoway))

        # destinations start as all locations in the first group
        destinations_up = first

        # loop thru lower levels
        for level in range(1, n_groups):
            origins = iteration[level]

            # paths from this level up to the current destinations
            pairs.extend(_cross_pairs(origins, destinations_up, twoway))

            # 'distributed' also links locations of the same level
            if opt == 'distributed':
                pairs.extend(_all_pairs(origins, twoway))

            # 'central' keeps pointing at the first group; the others point
            # at the level just processed
            if opt != 'central':
                destinations_up = origins

    # append the paths to df_net (in place, one new row per path)
    for o, d in pairs:
        df_net.loc[len(df_net.index)] = [o, d, iter_num]

    return df_net