|
1 | 1 | # ActivitySim |
2 | 2 | # See full license in LICENSE.txt. |
3 | 3 | import logging |
4 | | -import sys |
| 4 | +import os |
| 5 | + |
| 6 | +import numpy as np |
5 | 7 | import pandas as pd |
| 8 | +from activitysim.abm.models.trip_matrices import annotate_trips |
| 9 | +from activitysim.core import config, expressions, inject, pipeline |
| 10 | + |
| 11 | +logger = logging.getLogger(__name__) |
6 | 12 |
|
7 | | -from activitysim.core import pipeline |
8 | | -from activitysim.core import inject |
9 | | -from activitysim.core import config |
10 | 13 |
|
11 | | -from activitysim.core.config import setting |
def wrap_skims(network_los, trips_merged: pd.DataFrame) -> dict:
    """
    Retrieve skim wrappers for merged trips.

    For each record in `trips_merged`, retrieve skim wrappers for the appropriate
    time of day. Adds 'start_tour_period', 'end_tour_period', and 'trip_period'
    columns to `trips_merged` IN PLACE.

    Parameters
    ----------
    network_los :
        Network level-of-service object; must provide `get_default_skim_dict()`
        and `skim_time_period_label()`. (Presumably a `los.Network_LOS` — the
        previous `pipeline.Pipeline` annotation did not match how it is used.)
    trips_merged : pd.DataFrame
        Trips merged with tours; must contain 'start', 'end', and 'depart'
        columns, plus the tour/trip O-D columns referenced by the wrappers.

    Returns
    -------
    dict
        Skim wrappers available for use in expressions defined in
        `summarize_preprocessor.csv`.
    """
    skim_dict = network_los.get_default_skim_dict()

    # Label each trip with the skim time period of its tour start, tour end,
    # and trip departure (columns added to trips_merged in place).
    trips_merged['start_tour_period'] = network_los.skim_time_period_label(
        trips_merged['start']
    )
    trips_merged['end_tour_period'] = network_los.skim_time_period_label(
        trips_merged['end']
    )
    trips_merged['trip_period'] = network_los.skim_time_period_label(
        trips_merged['depart']
    )

    # Time-of-day-dependent (3D) wrappers for tour and trip legs
    tour_odt_skim_stack_wrapper = skim_dict.wrap_3d(
        orig_key='origin_tour',
        dest_key='destination_tour',
        dim3_key='start_tour_period',
    )
    tour_dot_skim_stack_wrapper = skim_dict.wrap_3d(
        orig_key='destination_tour', dest_key='origin_tour', dim3_key='end_tour_period'
    )
    trip_odt_skim_stack_wrapper = skim_dict.wrap_3d(
        orig_key='origin_trip', dest_key='destination_trip', dim3_key='trip_period'
    )

    # Time-independent (2D) O-D wrappers
    tour_od_skim_stack_wrapper = skim_dict.wrap('origin_tour', 'destination_tour')
    trip_od_skim_stack_wrapper = skim_dict.wrap('origin_trip', 'destination_trip')

    return {
        "tour_odt_skims": tour_odt_skim_stack_wrapper,
        "tour_dot_skims": tour_dot_skim_stack_wrapper,
        "trip_odt_skims": trip_odt_skim_stack_wrapper,
        "tour_od_skims": tour_od_skim_stack_wrapper,
        "trip_od_skims": trip_od_skim_stack_wrapper,
    }
| 60 | + |
| 61 | + |
DEFAULT_BIN_LABEL_FORMAT = "{left:,.2f} - {right:,.2f}"


def construct_bin_labels(bins: pd.Series, label_format: str) -> pd.Series:
    """
    Construct bin label strings based on intervals (pd.Interval) in `bins`.

    `label_format` is an F-string format that can reference the following variables:
    - 'left': Bin minimum
    - 'right': Bin maximum
    - 'mid': Bin center
    - 'rank': Bin rank (lowest to highest)

    For example: '{left:,.2f} - {right:,.2f}' might yield '0.00 - 1.00'

    Returns a Series of labels aligned to the index of `bins`; labels are
    converted to numeric dtype when every label parses as a number.
    """
    left = bins.apply(lambda interval: interval.left)
    mid = bins.apply(lambda interval: interval.mid)
    right = bins.apply(lambda interval: interval.right)

    # Integer ranks of bins (1st, 2nd ... nth), ordered by bin midpoint.
    # NaN midpoints (values that fell outside every bin) are left unranked.
    rank_of_mid = {
        m: i + 1
        for i, m in enumerate(sorted(m for m in mid.unique() if pd.notnull(m)))
    }
    rank = mid.map(rank_of_mid, na_action='ignore')

    def construct_label(fmt, bounds):
        # Only pass the fields actually referenced by the format string
        referenced = {name: bound for name, bound in bounds.items() if name in fmt}
        return fmt.format(**referenced)

    labels = pd.Series(
        [
            construct_label(label_format, {'left': lt, 'mid': md, 'right': rt, 'rank': rk})
            for lt, md, rt, rk in zip(left, mid, right, rank)
        ],
        index=bins.index,
    )
    # Convert to numeric if possible; explicit try/except because
    # pd.to_numeric(..., errors='ignore') is deprecated in pandas 2.x
    try:
        labels = pd.to_numeric(labels)
    except (ValueError, TypeError):
        pass
    return labels
| 105 | + |
| 106 | + |
def quantiles(
    data: pd.Series,
    bins: int,
    label_format: str = DEFAULT_BIN_LABEL_FORMAT
) -> pd.Series:
    """
    Construct quantiles from a Series given a number of bins.

    For example: set bins = 5 to construct quintiles.

    data: Input Series
    bins: Number of bins (an integer count — the previous `pd.Series`
        annotation was incorrect)
    label_format: F-string format for bin labels
        Bins are labeled with 'min - max' ranges by default.

    Returns a Series of bin labels, indexed like `data` but in sorted-value order
    """
    vals = data.sort_values()
    # qcut a ranking instead of raw values to deal with high frequencies of the
    # same value (e.g., many 0 values) that may span multiple bins
    ranks = vals.rank(method='first')
    binned = pd.qcut(ranks, bins, duplicates='drop')
    return construct_bin_labels(binned, label_format)
| 131 | + |
| 132 | + |
def spaced_intervals(
    data: pd.Series,
    lower_bound: float,
    interval: float,
    label_format: str = DEFAULT_BIN_LABEL_FORMAT,
) -> pd.Series:
    """
    Construct evenly-spaced intervals from a Series given a starting value and bin size.

    data: Input Series
    lower_bound: Minimum value of lowest bin, or the string 'min' to use
        the minimum of `data`
    interval: Bin spacing above the `lower_bound`
    label_format: F-string format for bin labels
        Bins are labeled with 'min - max' ranges by default.

    Returns a Series indexed by labels
    """
    if lower_bound == 'min':
        lower_bound = data.min()
    # Break points run from the lower bound up past the data maximum in
    # `interval`-sized steps
    edges = np.arange(lower_bound, data.max() + interval, interval)
    binned = pd.cut(data, edges, include_lowest=True)
    return construct_bin_labels(binned, label_format)
| 156 | + |
| 157 | + |
def equal_intervals(
    data: pd.Series,
    bins: int,
    label_format: str = DEFAULT_BIN_LABEL_FORMAT
) -> pd.Series:
    """
    Construct equally-spaced intervals across the entire range of a Series.

    data: Input Series
    bins: Number of bins
    label_format: F-string format for bin labels
        Bins are labeled with 'min - max' ranges by default.

    Returns a Series indexed by labels
    """
    intervals = pd.cut(data, bins, include_lowest=True)
    return construct_bin_labels(intervals, label_format)
| 176 | + |
| 177 | + |
def manual_breaks(
    data: pd.Series,
    bin_breaks: list,
    labels: list = None,
    label_format: str = DEFAULT_BIN_LABEL_FORMAT
) -> pd.Series:
    """
    Classify numeric data in a Pandas Series into manually-defined bins.

    data: Input Series
    bin_breaks: Break points between bins; `len(bin_breaks)` edges define
        `len(bin_breaks) - 1` bins
    labels: Manually-defined labels for each bin
        (`len(labels)` must equal `len(bin_breaks) - 1`, one label per bin —
        not `+ 1` as previously documented)
    label_format: F-string format for bin labels if not defined by `labels`
        Bins are labeled with 'min - max' ranges by default.

    Returns a Series indexed by labels
    """
    if isinstance(labels, list):
        # Explicit labels: delegate labeling entirely to pd.cut
        return pd.cut(data, bin_breaks, labels=labels, include_lowest=True)
    else:
        bins = pd.cut(data, bin_breaks, include_lowest=True)
        bins = construct_bin_labels(bins, label_format)
        return bins
14 | 201 |
|
15 | 202 |
|
@inject.step()
def summarize(
    network_los: pipeline.Pipeline,
    persons: pd.DataFrame,
    persons_merged: pd.DataFrame,
    households: pd.DataFrame,
    households_merged: pd.DataFrame,
    trips: pd.DataFrame,
    tours: pd.DataFrame,
    tours_merged: pd.DataFrame,
    land_use: pd.DataFrame,
):
    """
    A standard model that uses expression files to summarize pipeline tables for visualization.

    Summaries are configured in `summarize.yaml`, including specification of the
    expression file (`summarize.csv` by default).

    Columns in pipeline tables can also be sliced and aggregated prior to summarization.
    This preprocessing is configured in `summarize.yaml`.

    Outputs a separate csv summary file for each expression;
    outputs starting with '_' are saved as temporary local variables.
    """
    trace_label = 'summarize'
    model_settings_file_name = 'summarize.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    # Summary csv files land under <output_dir>/<OUTPUT> ('summaries' by default)
    output_location = model_settings.get('OUTPUT', 'summaries')
    os.makedirs(config.output_file_path(output_location), exist_ok=True)

    spec = pd.read_csv(
        config.config_file_path(model_settings['SPECIFICATION']), comment='#'
    )

    # Load dataframes from pipeline
    persons = persons.to_frame()
    persons_merged = persons_merged.to_frame()
    households = households.to_frame()
    households_merged = households_merged.to_frame()
    trips = trips.to_frame()
    # NOTE(review): both `tours` and `tours_merged` below are materialized from
    # `tours_merged`; the injected `tours` table is never used. Looks deliberate
    # (expressions referencing 'tours' get the merged columns) — confirm intent.
    tours = tours_merged.to_frame()
    tours_merged = tours_merged.to_frame()
    land_use = land_use.to_frame()

    # - trips_merged - merge trips and tours_merged
    trips_merged = pd.merge(
        trips,
        tours_merged.drop(columns=['person_id', 'household_id']),
        left_on='tour_id',
        right_index=True,
        suffixes=['_trip', '_tour'],
        how="left",
    )

    # Add dataframes as local variables for use in expressions
    locals_d = {
        'persons': persons,
        'persons_merged': persons_merged,
        'households': households,
        'households_merged': households_merged,
        'trips': trips,
        'trips_merged': trips_merged,
        'tours': tours_merged,
        'tours_merged': tours_merged,
        'land_use': land_use,
    }

    skims = wrap_skims(network_los, trips_merged)

    # Annotate trips_merged
    expressions.annotate_preprocessors(
        trips_merged, locals_d, skims, model_settings, 'summarize'
    )

    # Per-table preprocessing: remap (AGGREGATE) and bin (BIN) columns
    # as configured in summarize.yaml
    for table_name, df in locals_d.items():
        if table_name in model_settings:

            meta = model_settings[table_name]
            # NOTE(review): eval() re-fetches the function-local table by name;
            # for 'tours' this may be a different object than locals_d['tours'].
            # Kept as-is to preserve which frame the new columns land on.
            df = eval(table_name)

            if 'AGGREGATE' in meta and meta['AGGREGATE']:
                for agg in meta['AGGREGATE']:
                    assert set(('column', 'label', 'map')) <= agg.keys()
                    # Remap column values; anything not in the map passes through
                    df[agg['label']] = (
                        df[agg['column']].map(agg['map']).fillna(df[agg['column']])
                    )

            if 'BIN' in meta and meta['BIN']:
                for slicer in meta['BIN']:
                    if slicer['type'] == 'manual_breaks':
                        df[slicer['label']] = manual_breaks(
                            df[slicer['column']], slicer['bin_breaks'], slicer['bin_labels']
                        )

                    elif slicer['type'] == 'quantiles':
                        df[slicer['label']] = quantiles(
                            df[slicer['column']], slicer['bins'], slicer['label_format']
                        )

                    elif slicer['type'] == 'spaced_intervals':
                        df[slicer['label']] = spaced_intervals(
                            df[slicer['column']],
                            slicer['lower_bound'],
                            slicer['interval'],
                            slicer['label_format'],
                        )

                    elif slicer['type'] == 'equal_intervals':
                        df[slicer['label']] = equal_intervals(
                            df[slicer['column']], slicer['bins'], slicer['label_format']
                        )

    # Output pipeline tables for expression development
    # (.get: setting is optional — previously a missing key raised KeyError)
    if model_settings.get('EXPORT_PIPELINE_TABLES') is True:
        pipeline_table_dir = os.path.join(output_location, 'pipeline_tables')
        os.makedirs(config.output_file_path(pipeline_table_dir), exist_ok=True)
        for name, df in locals_d.items():
            df.to_csv(config.output_file_path(os.path.join(pipeline_table_dir, f'{name}.csv')))

    # Add classification functions to locals so expressions can bin on the fly
    locals_d.update(
        {
            'quantiles': quantiles,
            'spaced_intervals': spaced_intervals,
            'equal_intervals': equal_intervals,
            'manual_breaks': manual_breaks,
        }
    )

    for i, row in spec.iterrows():

        out_file = row['Output']
        expr = row['Expression']

        # Save temporary variables starting with underscores in locals_d
        if out_file.startswith('_'):

            logger.debug(f'Temp Variable: {expr} -> {out_file}')

            # expressions come from the model's own spec csv (trusted config)
            locals_d[out_file] = eval(expr, globals(), locals_d)
            continue

        logger.debug(f'Summary: {expr} -> {out_file}.csv')

        resultset = eval(expr, globals(), locals_d)
        resultset.to_csv(
            config.output_file_path(os.path.join(output_location, f'{out_file}.csv')),
            index=False,
        )
0 commit comments