diff --git a/flow/benchmarks/__init__.py b/flow/benchmarks/__init__.py
index 80615d329..a75f60a13 100644
--- a/flow/benchmarks/__init__.py
+++ b/flow/benchmarks/__init__.py
@@ -1 +1 @@
-# empty init file to ensure documentation for benchmarks is created
+"""Empty init file to ensure documentation for benchmarks is created."""
diff --git a/flow/benchmarks/baselines/bottleneck0.py b/flow/benchmarks/baselines/bottleneck0.py
index 425b7b536..06b299b4a 100644
--- a/flow/benchmarks/baselines/bottleneck0.py
+++ b/flow/benchmarks/baselines/bottleneck0.py
@@ -1,15 +1,6 @@
-"""
-This script is used to quickly evaluate a baseline for bottleneck0.
-Baseline is no AVs.
-
-Bottleneck in which the actions are specifying a desired velocity in a segment
-of space. The autonomous penetration rate in this example is 10%.
-
-Action Dimension: (?, )
+"""Evaluates the baseline performance of bottleneck0 without RL control.
 
-Observation Dimension: (?, )
-
-Horizon: 1000 steps
+Baseline is no AVs.
 """
 
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
@@ -32,91 +23,112 @@
 DISABLE_RAMP_METER = True
 AV_FRAC = 0.10
 
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    speed_mode=9,
-    routing_controller=(ContinuousRouter, {}),
-    lane_change_mode=0,
-    num_vehicles=1 * SCALING)
-
-controlled_segments = [("1", 1, False), ("2", 2, True), ("3", 2, True),
-                       ("4", 2, True), ("5", 1, False)]
-num_observed_segments = [("1", 1), ("2", 3), ("3", 3), ("4", 3), ("5", 1)]
-additional_env_params = {
-    "target_velocity": 40,
-    "disable_tb": True,
-    "disable_ramp_metering": True,
-    "controlled_segments": controlled_segments,
-    "symmetric": False,
-    "observed_segments": num_observed_segments,
-    "reset_inflow": False,
-    "lane_change_duration": 5,
-    "max_accel": 3,
-    "max_decel": 3,
-    "inflow_range": [1000, 2000]
-}
-
-# flow rate
-flow_rate = 1900 * SCALING
-
-# percentage of flow coming out of each lane
-inflow = InFlows()
-inflow.add(
-    veh_type="human",
-    edge="1",
-    vehs_per_hour=flow_rate,
-    departLane="random",
-    departSpeed=10)
-
-traffic_lights = TrafficLights()
-if not DISABLE_TB:
-    traffic_lights.add(node_id="2")
-if not DISABLE_RAMP_METER:
-    traffic_lights.add(node_id="3")
-
-additional_net_params = {"scaling": SCALING}
-net_params = NetParams(
-    in_flows=inflow,
-    no_internal_links=False,
-    additional_params=additional_net_params)
-
-sumo_params = SumoParams(
-    sim_step=0.5,
-    sumo_binary="sumo-gui",
-    print_warnings=False,
-    restart_instance=False,
-)
-
-env_params = EnvParams(
-    evaluate=True,  # Set to True to evaluate traffic metrics
-    warmup_steps=40,
-    sims_per_step=1,
-    horizon=HORIZON,
-    additional_params=additional_env_params,
-)
-
-initial_config = InitialConfig(
-    spacing="uniform",
-    min_gap=5,
-    lanes_distribution=float("inf"),
-    edges_distribution=["2", "3", "4", "5"],
-)
-
-scenario = BottleneckScenario(
-    name="bay_bridge_toll",
-    generator_class=BottleneckGenerator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config,
-    traffic_lights=traffic_lights)
-
-env = DesiredVelocityEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 2
-results = exp.run(num_runs, HORIZON)
-avg_outflow = np.mean([outflow[-1] for outflow in results["per_step_returns"]])
-print('The average outflow over 500 seconds '
-      'across {} runs is {}'.format(num_runs, avg_outflow))
+
+def bottleneck0_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for the bottleneck0 baseline.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary : str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean over all rollouts of the final per-step return (outflow)
+    """
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 speed_mode=9,
+                 routing_controller=(ContinuousRouter, {}),
+                 lane_change_mode=0,
+                 num_vehicles=1 * SCALING)
+
+    controlled_segments = [("1", 1, False), ("2", 2, True), ("3", 2, True),
+                           ("4", 2, True), ("5", 1, False)]
+    num_observed_segments = [("1", 1), ("2", 3), ("3", 3),
+                             ("4", 3), ("5", 1)]
+    additional_env_params = {
+        "target_velocity": 40,
+        "disable_tb": True,
+        "disable_ramp_metering": True,
+        "controlled_segments": controlled_segments,
+        "symmetric": False,
+        "observed_segments": num_observed_segments,
+        "reset_inflow": False,
+        "lane_change_duration": 5,
+        "max_accel": 3,
+        "max_decel": 3,
+        "inflow_range": [1000, 2000]
+    }
+
+    # flow rate
+    flow_rate = 1900 * SCALING
+
+    # percentage of flow coming out of each lane
+    inflow = InFlows()
+    inflow.add(veh_type="human", edge="1",
+               vehs_per_hour=flow_rate,
+               departLane="random", departSpeed=10)
+
+    traffic_lights = TrafficLights()
+    if not DISABLE_TB:
+        traffic_lights.add(node_id="2")
+    if not DISABLE_RAMP_METER:
+        traffic_lights.add(node_id="3")
+
+    additional_net_params = {"scaling": SCALING}
+    net_params = NetParams(in_flows=inflow,
+                           no_internal_links=False,
+                           additional_params=additional_net_params)
+
+    sumo_params = SumoParams(
+        sim_step=0.5,
+        sumo_binary=sumo_binary,
+        print_warnings=False,
+        restart_instance=False,
+    )
+
+    env_params = EnvParams(
+        evaluate=True,  # Set to True to evaluate traffic metrics
+        warmup_steps=40,
+        sims_per_step=1,
+        horizon=HORIZON,
+        additional_params=additional_env_params,
+    )
+
+    initial_config = InitialConfig(
+        spacing="uniform",
+        min_gap=5,
+        lanes_distribution=float("inf"),
+        edges_distribution=["2", "3", "4", "5"],
+    )
+
+    scenario = BottleneckScenario(name="bay_bridge_toll",
+                                  generator_class=BottleneckGenerator,
+                                  vehicles=vehicles,
+                                  net_params=net_params,
+                                  initial_config=initial_config,
+                                  traffic_lights=traffic_lights)
+
+    env = DesiredVelocityEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    avg_outflow = np.mean([outflow[-1]
+                           for outflow in results["per_step_returns"]])
+
+    return avg_outflow
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = bottleneck0_baseline(num_runs=runs)
+
+    print('---------')
+    print('The average outflow over 500 seconds '
+          'across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/bottleneck1.py b/flow/benchmarks/baselines/bottleneck1.py
index 8cb259674..011308f34 100644
--- a/flow/benchmarks/baselines/bottleneck1.py
+++ b/flow/benchmarks/baselines/bottleneck1.py
@@ -1,16 +1,6 @@
-"""
-This script is used to quickly evaluate a baseline for bottleneck0.
-Baseline is no AVs.
-
-Bottleneck in which the actions are specifying a desired velocity in a segment
-of space. The autonomous penetration rate in this example is 25%.
-Humans are allowed to lane change
-
-Action Dimension: (?, )
+"""Evaluates the baseline performance of bottleneck1 without RL control.
 
-Observation Dimension: (?, )
-
-Horizon: 1000 steps
+Baseline is no AVs.
 """
 
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
@@ -33,91 +23,112 @@
 DISABLE_RAMP_METER = True
 AV_FRAC = 0.25
 
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    speed_mode=9,
-    routing_controller=(ContinuousRouter, {}),
-    lane_change_mode=1621,
-    num_vehicles=1 * SCALING)
-
-controlled_segments = [("1", 1, False), ("2", 2, True), ("3", 2, True),
-                       ("4", 2, True), ("5", 1, False)]
-num_observed_segments = [("1", 1), ("2", 3), ("3", 3), ("4", 3), ("5", 1)]
-additional_env_params = {
-    "target_velocity": 40,
-    "disable_tb": True,
-    "disable_ramp_metering": True,
-    "controlled_segments": controlled_segments,
-    "symmetric": False,
-    "observed_segments": num_observed_segments,
-    "reset_inflow": False,
-    "lane_change_duration": 5,
-    "max_accel": 3,
-    "max_decel": 3,
-    "inflow_range": [1000, 2000]
-}
-
-# flow rate
-flow_rate = 1900 * SCALING
-
-# percentage of flow coming out of each lane
-inflow = InFlows()
-inflow.add(
-    veh_type="human",
-    edge="1",
-    vehs_per_hour=flow_rate,
-    departLane="random",
-    departSpeed=10)
-
-traffic_lights = TrafficLights()
-if not DISABLE_TB:
-    traffic_lights.add(node_id="2")
-if not DISABLE_RAMP_METER:
-    traffic_lights.add(node_id="3")
-
-additional_net_params = {"scaling": SCALING}
-net_params = NetParams(
-    in_flows=inflow,
-    no_internal_links=False,
-    additional_params=additional_net_params)
-
-sumo_params = SumoParams(
-    sim_step=0.5,
-    sumo_binary="sumo-gui",
-    print_warnings=False,
-    restart_instance=False,
-)
-
-env_params = EnvParams(
-    evaluate=True,  # Set to True to evaluate traffic metrics
-    warmup_steps=40,
-    sims_per_step=1,
-    horizon=HORIZON,
-    additional_params=additional_env_params,
-)
-
-initial_config = InitialConfig(
-    spacing="uniform",
-    min_gap=5,
-    lanes_distribution=float("inf"),
-    edges_distribution=["2", "3", "4", "5"],
-)
-
-scenario = BottleneckScenario(
-    name="bay_bridge_toll",
-    generator_class=BottleneckGenerator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config,
-    traffic_lights=traffic_lights)
-
-env = DesiredVelocityEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 2
-results = exp.run(num_runs, HORIZON)
-avg_outflow = np.mean([outflow[-1] for outflow in results["per_step_returns"]])
-print('The average outflow over 500 seconds '
-      'across {} runs is {}'.format(num_runs, avg_outflow))
+
+def bottleneck1_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for the bottleneck1 baseline.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary : str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean over all rollouts of the final per-step return (outflow)
+    """
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 speed_mode=9,
+                 routing_controller=(ContinuousRouter, {}),
+                 lane_change_mode=1621,
+                 num_vehicles=1 * SCALING)
+
+    controlled_segments = [("1", 1, False), ("2", 2, True), ("3", 2, True),
+                           ("4", 2, True), ("5", 1, False)]
+    num_observed_segments = [("1", 1), ("2", 3), ("3", 3),
+                             ("4", 3), ("5", 1)]
+    additional_env_params = {
+        "target_velocity": 40,
+        "disable_tb": True,
+        "disable_ramp_metering": True,
+        "controlled_segments": controlled_segments,
+        "symmetric": False,
+        "observed_segments": num_observed_segments,
+        "reset_inflow": False,
+        "lane_change_duration": 5,
+        "max_accel": 3,
+        "max_decel": 3,
+        "inflow_range": [1000, 2000]
+    }
+
+    # flow rate
+    flow_rate = 1900 * SCALING
+
+    # percentage of flow coming out of each lane
+    inflow = InFlows()
+    inflow.add(veh_type="human", edge="1",
+               vehs_per_hour=flow_rate,
+               departLane="random", departSpeed=10)
+
+    traffic_lights = TrafficLights()
+    if not DISABLE_TB:
+        traffic_lights.add(node_id="2")
+    if not DISABLE_RAMP_METER:
+        traffic_lights.add(node_id="3")
+
+    additional_net_params = {"scaling": SCALING}
+    net_params = NetParams(in_flows=inflow,
+                           no_internal_links=False,
+                           additional_params=additional_net_params)
+
+    sumo_params = SumoParams(
+        sim_step=0.5,
+        sumo_binary=sumo_binary,
+        print_warnings=False,
+        restart_instance=False,
+    )
+
+    env_params = EnvParams(
+        evaluate=True,  # Set to True to evaluate traffic metrics
+        warmup_steps=40,
+        sims_per_step=1,
+        horizon=HORIZON,
+        additional_params=additional_env_params,
+    )
+
+    initial_config = InitialConfig(
+        spacing="uniform",
+        min_gap=5,
+        lanes_distribution=float("inf"),
+        edges_distribution=["2", "3", "4", "5"],
+    )
+
+    scenario = BottleneckScenario(name="bay_bridge_toll",
+                                  generator_class=BottleneckGenerator,
+                                  vehicles=vehicles,
+                                  net_params=net_params,
+                                  initial_config=initial_config,
+                                  traffic_lights=traffic_lights)
+
+    env = DesiredVelocityEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    avg_outflow = np.mean([outflow[-1]
+                           for outflow in results["per_step_returns"]])
+
+    return avg_outflow
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = bottleneck1_baseline(num_runs=runs)
+
+    print('---------')
+    print('The average outflow over 500 seconds '
+          'across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/bottleneck2.py b/flow/benchmarks/baselines/bottleneck2.py
index 532c89668..26140ed2f 100644
--- a/flow/benchmarks/baselines/bottleneck2.py
+++ b/flow/benchmarks/baselines/bottleneck2.py
@@ -1,15 +1,6 @@
-"""
-Script used to evaluate uncontrolled outflow performance of bottleneck2.
-Baseline is no AV control.
-
-Bottleneck in which the actions are specifying a desired velocity in a segment
-of space. The autonomous penetration rate in this example is 10%.
-
-Action Dimension: (40, )
+"""Evaluates the baseline performance of bottleneck2 without RL control.
 
-Observation Dimension: (281, )
-
-Horizon: 1000 steps
+Baseline is no AVs.
 """
 
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
@@ -32,91 +23,112 @@
 DISABLE_RAMP_METER = True
 AV_FRAC = .10
 
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    speed_mode=9,
-    routing_controller=(ContinuousRouter, {}),
-    lane_change_mode=0,
-    num_vehicles=1 * SCALING)
-
-controlled_segments = [("1", 1, False), ("2", 2, True), ("3", 2, True),
-                       ("4", 2, True), ("5", 1, False)]
-num_observed_segments = [("1", 1), ("2", 3), ("3", 3), ("4", 3), ("5", 1)]
-additional_env_params = {
-    "target_velocity": 40,
-    "disable_tb": True,
-    "disable_ramp_metering": True,
-    "controlled_segments": controlled_segments,
-    "symmetric": False,
-    "observed_segments": num_observed_segments,
-    "reset_inflow": False,
-    "lane_change_duration": 5,
-    "max_accel": 3,
-    "max_decel": 3,
-    "inflow_range": [1000, 2000]
-}
-
-# flow rate
-flow_rate = 1900 * SCALING
-
-# percentage of flow coming out of each lane
-inflow = InFlows()
-inflow.add(
-    veh_type="human",
-    edge="1",
-    vehs_per_hour=flow_rate,
-    departLane="random",
-    departSpeed=10)
-
-traffic_lights = TrafficLights()
-if not DISABLE_TB:
-    traffic_lights.add(node_id="2")
-if not DISABLE_RAMP_METER:
-    traffic_lights.add(node_id="3")
-
-additional_net_params = {"scaling": SCALING}
-net_params = NetParams(
-    in_flows=inflow,
-    no_internal_links=False,
-    additional_params=additional_net_params)
-
-sumo_params = SumoParams(
-    sim_step=0.5,
-    sumo_binary="sumo-gui",
-    print_warnings=False,
-    restart_instance=False,
-)
-
-env_params = EnvParams(
-    evaluate=True,  # Set to True to evaluate traffic metrics
-    warmup_steps=40,
-    sims_per_step=1,
-    horizon=HORIZON,
-    additional_params=additional_env_params,
-)
-
-initial_config = InitialConfig(
-    spacing="uniform",
-    min_gap=5,
-    lanes_distribution=float("inf"),
-    edges_distribution=["2", "3", "4", "5"],
-)
-
-scenario = BottleneckScenario(
-    name="bay_bridge_toll",
-    generator_class=BottleneckGenerator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config,
-    traffic_lights=traffic_lights)
-
-env = DesiredVelocityEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 2
-results = exp.run(num_runs, HORIZON)
-avg_outflow = np.mean([outflow[-1] for outflow in results["per_step_returns"]])
-print('The average outflow over 500 seconds '
-      'across {} runs is {}'.format(num_runs, avg_outflow))
+
+def bottleneck2_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for the bottleneck2 baseline.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary : str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean over all rollouts of the final per-step return (outflow)
+    """
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 speed_mode=9,
+                 routing_controller=(ContinuousRouter, {}),
+                 lane_change_mode=0,
+                 num_vehicles=1 * SCALING)
+
+    controlled_segments = [("1", 1, False), ("2", 2, True), ("3", 2, True),
+                           ("4", 2, True), ("5", 1, False)]
+    num_observed_segments = [("1", 1), ("2", 3), ("3", 3),
+                             ("4", 3), ("5", 1)]
+    additional_env_params = {
+        "target_velocity": 40,
+        "disable_tb": True,
+        "disable_ramp_metering": True,
+        "controlled_segments": controlled_segments,
+        "symmetric": False,
+        "observed_segments": num_observed_segments,
+        "reset_inflow": False,
+        "lane_change_duration": 5,
+        "max_accel": 3,
+        "max_decel": 3,
+        "inflow_range": [1000, 2000]
+    }
+
+    # flow rate
+    flow_rate = 1900 * SCALING
+
+    # percentage of flow coming out of each lane
+    inflow = InFlows()
+    inflow.add(veh_type="human", edge="1",
+               vehs_per_hour=flow_rate,
+               departLane="random", departSpeed=10)
+
+    traffic_lights = TrafficLights()
+    if not DISABLE_TB:
+        traffic_lights.add(node_id="2")
+    if not DISABLE_RAMP_METER:
+        traffic_lights.add(node_id="3")
+
+    additional_net_params = {"scaling": SCALING}
+    net_params = NetParams(in_flows=inflow,
+                           no_internal_links=False,
+                           additional_params=additional_net_params)
+
+    sumo_params = SumoParams(
+        sim_step=0.5,
+        sumo_binary=sumo_binary,
+        print_warnings=False,
+        restart_instance=False,
+    )
+
+    env_params = EnvParams(
+        evaluate=True,  # Set to True to evaluate traffic metrics
+        warmup_steps=40,
+        sims_per_step=1,
+        horizon=HORIZON,
+        additional_params=additional_env_params,
+    )
+
+    initial_config = InitialConfig(
+        spacing="uniform",
+        min_gap=5,
+        lanes_distribution=float("inf"),
+        edges_distribution=["2", "3", "4", "5"],
+    )
+
+    scenario = BottleneckScenario(name="bay_bridge_toll",
+                                  generator_class=BottleneckGenerator,
+                                  vehicles=vehicles,
+                                  net_params=net_params,
+                                  initial_config=initial_config,
+                                  traffic_lights=traffic_lights)
+
+    env = DesiredVelocityEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    avg_outflow = np.mean([outflow[-1]
+                           for outflow in results["per_step_returns"]])
+
+    return avg_outflow
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = bottleneck2_baseline(num_runs=runs)
+
+    print('---------')
+    print('The average outflow over 500 seconds '
+          'across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/figureeight012.py b/flow/benchmarks/baselines/figureeight012.py
new file mode 100644
index 000000000..2707c5e77
--- /dev/null
+++ b/flow/benchmarks/baselines/figureeight012.py
@@ -0,0 +1,87 @@
+"""Evaluates the baseline performance of figureeight without RL control.
+
+Baseline is human acceleration and intersection behavior.
+"""
+
+from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams
+from flow.core.vehicles import Vehicles
+from flow.controllers import IDMController, ContinuousRouter
+from flow.scenarios.figure8.figure8_scenario import Figure8Scenario
+from flow.scenarios.figure8.gen import Figure8Generator
+from flow.scenarios.figure8.figure8_scenario import ADDITIONAL_NET_PARAMS
+from flow.envs.loop.loop_accel import AccelEnv
+from flow.core.experiment import SumoExperiment
+import numpy as np
+
+# time horizon of a single rollout
+HORIZON = 1500
+
+
+def figure_eight_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for all figure eight baselines.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary : str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean of the per-rollout mean returns (average speed) across runs
+    """
+    # The baseline places 14 human-driven vehicles (no AVs) in the network
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 acceleration_controller=(IDMController, {"noise": 0.2}),
+                 routing_controller=(ContinuousRouter, {}),
+                 speed_mode="no_collide",
+                 num_vehicles=14)
+
+    sumo_params = SumoParams(
+        sim_step=0.1,
+        sumo_binary=sumo_binary,
+    )
+
+    env_params = EnvParams(
+        horizon=HORIZON,
+        evaluate=True,  # Set to True to evaluate traffic metrics
+        additional_params={
+            "target_velocity": 20,
+            "max_accel": 3,
+            "max_decel": 3,
+        },
+    )
+
+    initial_config = InitialConfig()
+
+    net_params = NetParams(
+        no_internal_links=False,
+        additional_params=ADDITIONAL_NET_PARAMS,
+    )
+
+    scenario = Figure8Scenario(name="figure_eight",
+                               generator_class=Figure8Generator,
+                               vehicles=vehicles,
+                               net_params=net_params,
+                               initial_config=initial_config)
+
+    env = AccelEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    avg_speed = np.mean(results["mean_returns"])
+
+    return avg_speed
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = figure_eight_baseline(num_runs=runs)
+
+    print('---------')
+    print('The average speed across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/figureeight{0,1,2}.py b/flow/benchmarks/baselines/figureeight{0,1,2}.py
deleted file mode 100644
index 40a8969ca..000000000
--- a/flow/benchmarks/baselines/figureeight{0,1,2}.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""
-Script to evaluate the baseline performance of figureeight without RL control
-Baseline is human intersection behavior
-
-Trains a fraction of vehicles in a ring road structure to regulate the flow of
-vehicles through an intersection. In this example, the last vehicle in the
-network is an autonomous vehicle.
-
-Action Dimension: (1, )
-
-Observation Dimension: (28, )
-
-Horizon: 1500 steps
-"""
-
-from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams
-from flow.core.vehicles import Vehicles
-from flow.controllers import IDMController, ContinuousRouter
-from flow.scenarios.figure8.figure8_scenario import Figure8Scenario
-from flow.scenarios.figure8.gen import Figure8Generator
-from flow.scenarios.figure8.figure8_scenario import ADDITIONAL_NET_PARAMS
-from flow.envs.loop.loop_accel import AccelEnv
-from flow.core.experiment import SumoExperiment
-import numpy as np
-
-# time horizon of a single rollout
-HORIZON = 1500
-
-# We place 1 autonomous vehicle and 13 human-driven vehicles in the network
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    acceleration_controller=(IDMController, {
-        "noise": 0.2
-    }),
-    routing_controller=(ContinuousRouter, {}),
-    speed_mode="no_collide",
-    num_vehicles=14)
-
-sumo_params = SumoParams(
-    sim_step=0.1,
-    sumo_binary="sumo-gui",
-)
-
-env_params = EnvParams(
-    horizon=HORIZON,
-    evaluate=True,  # Set to True to evaluate traffic metrics
-    additional_params={
-        "target_velocity": 20,
-        "max_accel": 3,
-        "max_decel": 3,
-    },
-)
-
-initial_config = InitialConfig()
-
-net_params = NetParams(
-    no_internal_links=False,
-    additional_params=ADDITIONAL_NET_PARAMS,
-)
-
-scenario = Figure8Scenario(
-    name="figure_eight",
-    generator_class=Figure8Generator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config)
-
-env = AccelEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 2
-results = exp.run(num_runs, HORIZON)
-avg_speed = np.mean(results["mean_returns"])
-print('The average speed across {} runs is {}'.format(num_runs, avg_speed))
diff --git a/flow/benchmarks/baselines/grid0.py b/flow/benchmarks/baselines/grid0.py
index 5e5e595f5..4147d7061 100644
--- a/flow/benchmarks/baselines/grid0.py
+++ b/flow/benchmarks/baselines/grid0.py
@@ -1,14 +1,6 @@
-"""
-Script to evaluate the baseline performance of grid1 without RL control
-Baseline is an actuated traffic light provided by SUMO
-
-Grid/green wave example
-
-Action Dimension: (9, )
+"""Evaluates the baseline performance of grid0 without RL control.
 
-Observation Dimension: (339, )
-
-Horizon: 400 steps
+Baseline is an actuated traffic light provided by SUMO.
 """
 
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
@@ -41,117 +33,124 @@
 # number of vehicles originating in the left, right, top, and bottom edges
 N_LEFT, N_RIGHT, N_TOP, N_BOTTOM = 1, 1, 1, 1
 
-# we place a sufficient number of vehicles to ensure they confirm with the
-# total number specified above. We also use a "right_of_way" speed mode to
-# support traffic light compliance
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    acceleration_controller=(SumoCarFollowingController, {}),
-    sumo_car_following_params=SumoCarFollowingParams(
-        min_gap=2.5,
-        max_speed=V_ENTER,
-    ),
-    routing_controller=(GridRouter, {}),
-    num_vehicles=(N_LEFT + N_RIGHT) * N_COLUMNS + (N_BOTTOM + N_TOP) * N_ROWS,
-    speed_mode="right_of_way")
-
-# inflows of vehicles are place on all outer edges (listed here)
-outer_edges = []
-outer_edges += ["left{}_{}".format(N_ROWS, i) for i in range(N_COLUMNS)]
-outer_edges += ["right0_{}".format(i) for i in range(N_ROWS)]
-outer_edges += ["bot{}_0".format(i) for i in range(N_ROWS)]
-outer_edges += ["top{}_{}".format(i, N_COLUMNS) for i in range(N_ROWS)]
-
-# equal inflows for each edge (as dictate by the EDGE_INFLOW constant)
-inflow = InFlows()
-for edge in outer_edges:
-    inflow.add(
-        veh_type="human",
-        edge=edge,
-        vehs_per_hour=EDGE_INFLOW,
-        departLane="free",
-        departSpeed="max")
-
-# define the traffic light logic
-tl_logic = TrafficLights(baseline=False)
-phases = [{
-    "duration": "31",
-    "minDur": "8",
-    "maxDur": "45",
-    "state": "GGGrrrGGGrrr"
-}, {
-    "duration": "6",
-    "minDur": "3",
-    "maxDur": "6",
-    "state": "yyyrrryyyrrr"
-}, {
-    "duration": "31",
-    "minDur": "8",
-    "maxDur": "45",
-    "state": "rrrGGGrrrGGG"
-}, {
-    "duration": "6",
-    "minDur": "3",
-    "maxDur": "6",
-    "state": "rrryyyrrryyy"
-}]
-for i in range(N_ROWS * N_COLUMNS):
-    tl_logic.add(
-        "center" + str(i), tls_type="actuated", phases=phases, programID=1)
-
-net_params = NetParams(
-    in_flows=inflow,
-    no_internal_links=False,
-    additional_params={
-        "speed_limit": V_ENTER + 5,
-        "grid_array": {
-            "short_length": SHORT_LENGTH,
-            "inner_length": INNER_LENGTH,
-            "long_length": LONG_LENGTH,
-            "row_num": N_ROWS,
-            "col_num": N_COLUMNS,
-            "cars_left": N_LEFT,
-            "cars_right": N_RIGHT,
-            "cars_top": N_TOP,
-            "cars_bot": N_BOTTOM,
-        },
-        "horizontal_lanes": 1,
-        "vertical_lanes": 1,
-    },
-)
-
-sumo_params = SumoParams(
-    restart_instance=False,
-    sim_step=1,
-    sumo_binary="sumo-gui",
-)
-
-env_params = EnvParams(
-    evaluate=True,  # Set to True to evaluate traffic metrics
-    horizon=HORIZON,
-    additional_params={
-        "switch_time": 2.0,
-        "num_observed": 2,
-        "tl_type": "actuated",
-    },
-)
-
-initial_config = InitialConfig(shuffle=True)
-
-scenario = SimpleGridScenario(
-    name="grid",
-    generator_class=SimpleGridGenerator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config,
-    traffic_lights=tl_logic)
-
-env = PO_TrafficLightGridEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 2
-results = exp.run(num_runs, HORIZON)
-total_delay = np.mean(results["returns"])
-print('The total delay across {} runs is {}'.format(num_runs, total_delay))
+
+def grid0_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for the grid0 baseline.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary : str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean of the per-rollout returns (reported as the total delay)
+    """
+    # we place a sufficient number of vehicles to ensure they conform with the
+    # total number specified above. We also use a "right_of_way" speed mode to
+    # support traffic light compliance
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 acceleration_controller=(SumoCarFollowingController, {}),
+                 sumo_car_following_params=SumoCarFollowingParams(
+                     min_gap=2.5,
+                     max_speed=V_ENTER,
+                 ),
+                 routing_controller=(GridRouter, {}),
+                 num_vehicles=(N_LEFT+N_RIGHT)*N_COLUMNS +
+                              (N_BOTTOM+N_TOP)*N_ROWS,
+                 speed_mode="right_of_way")
+
+    # inflows of vehicles are placed on all outer edges (listed here)
+    outer_edges = []
+    outer_edges += ["left{}_{}".format(N_ROWS, i) for i in range(N_COLUMNS)]
+    outer_edges += ["right0_{}".format(i) for i in range(N_ROWS)]
+    outer_edges += ["bot{}_0".format(i) for i in range(N_ROWS)]
+    outer_edges += ["top{}_{}".format(i, N_COLUMNS) for i in range(N_ROWS)]
+
+    # equal inflows for each edge (as dictated by the EDGE_INFLOW constant)
+    inflow = InFlows()
+    for edge in outer_edges:
+        inflow.add(veh_type="human", edge=edge, vehs_per_hour=EDGE_INFLOW,
+                   departLane="free", departSpeed="max")
+
+    # define the traffic light logic
+    tl_logic = TrafficLights(baseline=False)
+    phases = [{"duration": "31", "minDur": "8", "maxDur": "45",
+               "state": "GGGrrrGGGrrr"},
+              {"duration": "6", "minDur": "3", "maxDur": "6",
+               "state": "yyyrrryyyrrr"},
+              {"duration": "31", "minDur": "8", "maxDur": "45",
+               "state": "rrrGGGrrrGGG"},
+              {"duration": "6", "minDur": "3", "maxDur": "6",
+               "state": "rrryyyrrryyy"}]
+    for i in range(N_ROWS*N_COLUMNS):
+        tl_logic.add("center"+str(i), tls_type="actuated", phases=phases,
+                     programID=1)
+
+    net_params = NetParams(
+            in_flows=inflow,
+            no_internal_links=False,
+            additional_params={
+                "speed_limit": V_ENTER + 5,
+                "grid_array": {
+                    "short_length": SHORT_LENGTH,
+                    "inner_length": INNER_LENGTH,
+                    "long_length": LONG_LENGTH,
+                    "row_num": N_ROWS,
+                    "col_num": N_COLUMNS,
+                    "cars_left": N_LEFT,
+                    "cars_right": N_RIGHT,
+                    "cars_top": N_TOP,
+                    "cars_bot": N_BOTTOM,
+                },
+                "horizontal_lanes": 1,
+                "vertical_lanes": 1,
+            },
+        )
+
+    sumo_params = SumoParams(
+            restart_instance=False,
+            sim_step=1,
+            sumo_binary=sumo_binary,
+        )
+
+    env_params = EnvParams(
+            evaluate=True,  # Set to True to evaluate traffic metrics
+            horizon=HORIZON,
+            additional_params={
+                "switch_time": 2.0,
+                "num_observed": 2,
+                "tl_type": "actuated",
+            },
+        )
+
+    initial_config = InitialConfig(shuffle=True)
+
+    scenario = SimpleGridScenario(name="grid",
+                                  generator_class=SimpleGridGenerator,
+                                  vehicles=vehicles,
+                                  net_params=net_params,
+                                  initial_config=initial_config,
+                                  traffic_lights=tl_logic)
+
+    env = PO_TrafficLightGridEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    total_delay = np.mean(results["returns"])
+
+    return total_delay
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = grid0_baseline(num_runs=runs)
+
+    print('---------')
+    print('The total delay across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/grid1.py b/flow/benchmarks/baselines/grid1.py
index 438c2d56e..18cf061c8 100644
--- a/flow/benchmarks/baselines/grid1.py
+++ b/flow/benchmarks/baselines/grid1.py
@@ -1,14 +1,6 @@
-"""
-Script to evaluate the baseline performance of grid1 without RL control
-Baseline is an actuated traffic light provided by SUMO
-
-Grid/green wave example
-
-Action Dimension: (25, )
+"""Evaluates the baseline performance of grid1 without RL control.
 
-Observation Dimension: (915, )
-
-Horizon: 400 steps
+Baseline is an actuated traffic light provided by SUMO.
 """
 
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
@@ -41,117 +33,124 @@
 # number of vehicles originating in the left, right, top, and bottom edges
 N_LEFT, N_RIGHT, N_TOP, N_BOTTOM = 1, 1, 1, 1
 
-# we place a sufficient number of vehicles to ensure they confirm with the
-# total number specified above. We also use a "right_of_way" speed mode to
-# support traffic light compliance
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    acceleration_controller=(SumoCarFollowingController, {}),
-    sumo_car_following_params=SumoCarFollowingParams(
-        min_gap=2.5,
-        max_speed=V_ENTER,
-    ),
-    routing_controller=(GridRouter, {}),
-    num_vehicles=(N_LEFT + N_RIGHT) * N_COLUMNS + (N_BOTTOM + N_TOP) * N_ROWS,
-    speed_mode="right_of_way")
-
-# inflows of vehicles are place on all outer edges (listed here)
-outer_edges = []
-outer_edges += ["left{}_{}".format(N_ROWS, i) for i in range(N_COLUMNS)]
-outer_edges += ["right0_{}".format(i) for i in range(N_ROWS)]
-outer_edges += ["bot{}_0".format(i) for i in range(N_ROWS)]
-outer_edges += ["top{}_{}".format(i, N_COLUMNS) for i in range(N_ROWS)]
-
-# equal inflows for each edge (as dictate by the EDGE_INFLOW constant)
-inflow = InFlows()
-for edge in outer_edges:
-    inflow.add(
-        veh_type="human",
-        edge=edge,
-        vehs_per_hour=EDGE_INFLOW,
-        departLane="free",
-        departSpeed="max")
-
-# define the traffic light logic
-tl_logic = TrafficLights(baseline=False)
-phases = [{
-    "duration": "31",
-    "minDur": "8",
-    "maxDur": "45",
-    "state": "GGGrrrGGGrrr"
-}, {
-    "duration": "6",
-    "minDur": "3",
-    "maxDur": "6",
-    "state": "yyyrrryyyrrr"
-}, {
-    "duration": "31",
-    "minDur": "8",
-    "maxDur": "45",
-    "state": "rrrGGGrrrGGG"
-}, {
-    "duration": "6",
-    "minDur": "3",
-    "maxDur": "6",
-    "state": "rrryyyrrryyy"
-}]
-for i in range(N_ROWS * N_COLUMNS):
-    tl_logic.add(
-        "center" + str(i), tls_type="actuated", phases=phases, programID=1)
-
-net_params = NetParams(
-    in_flows=inflow,
-    no_internal_links=False,
-    additional_params={
-        "speed_limit": V_ENTER + 5,
-        "grid_array": {
-            "short_length": SHORT_LENGTH,
-            "inner_length": INNER_LENGTH,
-            "long_length": LONG_LENGTH,
-            "row_num": N_ROWS,
-            "col_num": N_COLUMNS,
-            "cars_left": N_LEFT,
-            "cars_right": N_RIGHT,
-            "cars_top": N_TOP,
-            "cars_bot": N_BOTTOM,
-        },
-        "horizontal_lanes": 1,
-        "vertical_lanes": 1,
-    },
-)
-
-sumo_params = SumoParams(
-    restart_instance=False,
-    sim_step=1,
-    sumo_binary="sumo-gui",
-)
-
-env_params = EnvParams(
-    evaluate=True,  # Set to True to evaluate traffic metrics
-    horizon=HORIZON,
-    additional_params={
-        "switch_time": 2.0,
-        "num_observed": 2,
-        "tl_type": "actuated",
-    },
-)
-
-initial_config = InitialConfig(shuffle=True)
-
-scenario = SimpleGridScenario(
-    name="grid",
-    generator_class=SimpleGridGenerator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config,
-    traffic_lights=tl_logic)
-
-env = PO_TrafficLightGridEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 2
-results = exp.run(num_runs, HORIZON)
-total_delay = np.mean(results["returns"])
-print('The total delay across {} runs is {}'.format(num_runs, total_delay))
+
+def grid1_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for the grid1 baseline.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary: str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean of the returns over all runs, reported as the total delay
+    """
+    # we place a sufficient number of vehicles to ensure they conform to the
+    # total number specified above. We also use a "right_of_way" speed mode to
+    # support traffic light compliance
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 acceleration_controller=(SumoCarFollowingController, {}),
+                 sumo_car_following_params=SumoCarFollowingParams(
+                     min_gap=2.5,
+                     max_speed=V_ENTER,
+                 ),
+                 routing_controller=(GridRouter, {}),
+                 num_vehicles=(N_LEFT+N_RIGHT)*N_COLUMNS +
+                              (N_BOTTOM+N_TOP)*N_ROWS,
+                 speed_mode="right_of_way")
+
+    # inflows of vehicles are placed on all outer edges (listed here)
+    outer_edges = []
+    outer_edges += ["left{}_{}".format(N_ROWS, i) for i in range(N_COLUMNS)]
+    outer_edges += ["right0_{}".format(i) for i in range(N_ROWS)]
+    outer_edges += ["bot{}_0".format(i) for i in range(N_ROWS)]
+    outer_edges += ["top{}_{}".format(i, N_COLUMNS) for i in range(N_ROWS)]
+
+    # equal inflows for each edge (as dictated by the EDGE_INFLOW constant)
+    inflow = InFlows()
+    for edge in outer_edges:
+        inflow.add(veh_type="human", edge=edge, vehs_per_hour=EDGE_INFLOW,
+                   departLane="free", departSpeed="max")
+
+    # define the traffic light logic
+    tl_logic = TrafficLights(baseline=False)
+    phases = [{"duration": "31", "minDur": "8", "maxDur": "45",
+               "state": "GGGrrrGGGrrr"},
+              {"duration": "6", "minDur": "3", "maxDur": "6",
+               "state": "yyyrrryyyrrr"},
+              {"duration": "31", "minDur": "8", "maxDur": "45",
+               "state": "rrrGGGrrrGGG"},
+              {"duration": "6", "minDur": "3", "maxDur": "6",
+               "state": "rrryyyrrryyy"}]
+    for i in range(N_ROWS*N_COLUMNS):
+        tl_logic.add("center"+str(i), tls_type="actuated", phases=phases,
+                     programID=1)
+
+    net_params = NetParams(
+            in_flows=inflow,
+            no_internal_links=False,
+            additional_params={
+                "speed_limit": V_ENTER + 5,
+                "grid_array": {
+                    "short_length": SHORT_LENGTH,
+                    "inner_length": INNER_LENGTH,
+                    "long_length": LONG_LENGTH,
+                    "row_num": N_ROWS,
+                    "col_num": N_COLUMNS,
+                    "cars_left": N_LEFT,
+                    "cars_right": N_RIGHT,
+                    "cars_top": N_TOP,
+                    "cars_bot": N_BOTTOM,
+                },
+                "horizontal_lanes": 1,
+                "vertical_lanes": 1,
+            },
+        )
+
+    sumo_params = SumoParams(
+            restart_instance=False,
+            sim_step=1,
+            sumo_binary=sumo_binary,
+        )
+
+    env_params = EnvParams(
+            evaluate=True,  # Set to True to evaluate traffic metrics
+            horizon=HORIZON,
+            additional_params={
+                "switch_time": 2.0,
+                "num_observed": 2,
+                "tl_type": "actuated",
+            },
+        )
+
+    initial_config = InitialConfig(shuffle=True)
+
+    scenario = SimpleGridScenario(name="grid",
+                                  generator_class=SimpleGridGenerator,
+                                  vehicles=vehicles,
+                                  net_params=net_params,
+                                  initial_config=initial_config,
+                                  traffic_lights=tl_logic)
+
+    env = PO_TrafficLightGridEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    total_delay = np.mean(results["returns"])
+
+    return total_delay
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = grid1_baseline(num_runs=runs)
+
+    print('---------')
+    print('The total delay across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/merge012.py b/flow/benchmarks/baselines/merge012.py
new file mode 100644
index 000000000..087d3faa2
--- /dev/null
+++ b/flow/benchmarks/baselines/merge012.py
@@ -0,0 +1,114 @@
+"""Evaluates the baseline performance of merge without RL control.
+
+Baseline is no AVs.
+"""
+
+from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
+    InFlows
+from flow.scenarios.merge.scenario import ADDITIONAL_NET_PARAMS
+from flow.core.vehicles import Vehicles
+from flow.core.experiment import SumoExperiment
+from flow.controllers import SumoCarFollowingController
+from flow.scenarios.merge.scenario import MergeScenario
+from flow.scenarios.merge.gen import MergeGenerator
+from flow.envs.merge import WaveAttenuationMergePOEnv
+import numpy as np
+
+# time horizon of a single rollout
+HORIZON = int(750*(0.5/0.2))
+# inflow rate at the highway
+FLOW_RATE = 2000
+# percent of autonomous vehicles
+RL_PENETRATION = 0.1
+# num_rl term (see ADDITIONAL_ENV_PARAMs)
+NUM_RL = 5
+
+
+def merge_baseline(num_runs, sumo_binary="sumo-gui"):
+    """Run script for all merge baselines.
+
+    Parameters
+    ----------
+        num_runs : int
+            number of rollouts the performance of the environment is evaluated
+            over
+        sumo_binary: str, optional
+            specifies whether to use sumo's gui during execution
+
+    Returns
+    -------
+        float
+            mean of the per-run mean returns, reported as the average speed
+    """
+    # We consider a highway network with an upstream merging lane producing
+    # shockwaves
+    additional_net_params = ADDITIONAL_NET_PARAMS.copy()
+    additional_net_params["merge_lanes"] = 1
+    additional_net_params["highway_lanes"] = 1
+    additional_net_params["pre_merge_length"] = 500
+
+    # Only human-driven vehicles are added (the baseline has no RL vehicles)
+    vehicles = Vehicles()
+    vehicles.add(veh_id="human",
+                 acceleration_controller=(SumoCarFollowingController, {}),
+                 speed_mode="no_collide",
+                 num_vehicles=5)
+
+    # Human-driven vehicles are introduced from both sides of the merge: the
+    # highway inflow edge and the merge inflow edge
+    inflow = InFlows()
+    inflow.add(veh_type="human", edge="inflow_highway",
+               vehs_per_hour=FLOW_RATE,
+               departLane="free", departSpeed=10)
+    inflow.add(veh_type="human", edge="inflow_merge", vehs_per_hour=100,
+               departLane="free", departSpeed=7.5)
+
+    sumo_params = SumoParams(
+        restart_instance=False,
+        sim_step=0.2,  # time step decreased to prevent occasional crashes
+        sumo_binary=sumo_binary,
+    )
+
+    env_params = EnvParams(
+        horizon=HORIZON,
+        sims_per_step=5,  # value raised to ensure sec/step match experiment
+        warmup_steps=0,
+        evaluate=True,  # Set to True to evaluate traffic metric performance
+        additional_params={
+            "max_accel": 1.5,
+            "max_decel": 1.5,
+            "target_velocity": 20,
+            "num_rl": NUM_RL,
+        },
+    )
+
+    initial_config = InitialConfig()
+
+    net_params = NetParams(
+        in_flows=inflow,
+        no_internal_links=False,
+        additional_params=additional_net_params,
+    )
+
+    scenario = MergeScenario(name="merge",
+                             generator_class=MergeGenerator,
+                             vehicles=vehicles,
+                             net_params=net_params,
+                             initial_config=initial_config)
+
+    env = WaveAttenuationMergePOEnv(env_params, sumo_params, scenario)
+
+    exp = SumoExperiment(env, scenario)
+
+    results = exp.run(num_runs, HORIZON)
+    avg_speed = np.mean(results["mean_returns"])
+
+    return avg_speed
+
+
+if __name__ == "__main__":
+    runs = 2  # number of simulations to average over
+    res = merge_baseline(num_runs=runs)
+
+    print('---------')
+    print('The average speed across {} runs is {}'.format(runs, res))
diff --git a/flow/benchmarks/baselines/merge{0,1,2}.py b/flow/benchmarks/baselines/merge{0,1,2}.py
deleted file mode 100644
index 368ec4de8..000000000
--- a/flow/benchmarks/baselines/merge{0,1,2}.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""
-Script to evaluate the baseline performance of the merge scenario without AVs
-
-Trains a small percentage of autonomous vehicles to dissipate shockwaves caused
-by merges in an open network. The autonomous penetration rate in this example
-is 10%.
-
-Action Dimension: (5, )
-
-Observation Dimension: (25, )
-
-Horizon: 750 steps
-"""
-
-from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams, \
-    InFlows
-from flow.scenarios.merge.scenario import ADDITIONAL_NET_PARAMS
-from flow.core.vehicles import Vehicles
-from flow.core.experiment import SumoExperiment
-from flow.controllers import SumoCarFollowingController
-from flow.scenarios.merge.scenario import MergeScenario
-from flow.scenarios.merge.gen import MergeGenerator
-from flow.envs.merge import WaveAttenuationMergePOEnv
-import numpy as np
-
-# time horizon of a single rollout
-HORIZON = int(750 * (0.5 / 0.2))
-# inflow rate at the highway
-FLOW_RATE = 2000
-# percent of autonomous vehicles
-RL_PENETRATION = 0.1
-# num_rl term (see ADDITIONAL_ENV_PARAMs)
-NUM_RL = 5
-
-# We consider a highway network with an upstream merging lane producing
-# shockwaves
-additional_net_params = ADDITIONAL_NET_PARAMS.copy()
-additional_net_params["merge_lanes"] = 1
-additional_net_params["highway_lanes"] = 1
-additional_net_params["pre_merge_length"] = 500
-
-# RL vehicles constitute 5% of the total number of vehicles
-vehicles = Vehicles()
-vehicles.add(
-    veh_id="human",
-    acceleration_controller=(SumoCarFollowingController, {}),
-    speed_mode="no_collide",
-    num_vehicles=5)
-
-# Vehicles are introduced from both sides of merge, with RL vehicles entering
-# from the highway portion as well
-inflow = InFlows()
-inflow.add(
-    veh_type="human",
-    edge="inflow_highway",
-    vehs_per_hour=FLOW_RATE,
-    departLane="free",
-    departSpeed=10)
-inflow.add(
-    veh_type="human",
-    edge="inflow_merge",
-    vehs_per_hour=100,
-    departLane="free",
-    departSpeed=7.5)
-
-sumo_params = SumoParams(
-    restart_instance=False,
-    sim_step=0.2,  # time step decreased to prevent occasional crashes
-    sumo_binary="sumo",
-)
-
-env_params = EnvParams(
-    horizon=HORIZON,
-    sims_per_step=5,  # value raised to ensure sec/step match experiment
-    warmup_steps=0,
-    evaluate=True,  # Set to True to evaluate traffic metric performance
-    additional_params={
-        "max_accel": 1.5,
-        "max_decel": 1.5,
-        "target_velocity": 20,
-        "num_rl": NUM_RL,
-    },
-)
-
-initial_config = InitialConfig()
-
-net_params = NetParams(
-    in_flows=inflow,
-    no_internal_links=False,
-    additional_params=additional_net_params,
-)
-
-scenario = MergeScenario(
-    name="merge",
-    generator_class=MergeGenerator,
-    vehicles=vehicles,
-    net_params=net_params,
-    initial_config=initial_config)
-
-env = WaveAttenuationMergePOEnv(env_params, sumo_params, scenario)
-
-exp = SumoExperiment(env, scenario)
-
-num_runs = 5
-results = exp.run(num_runs, HORIZON)
-avg_speed = np.mean(results["mean_returns"])
-print('The average speed across {} runs is {}'.format(num_runs, avg_speed))
diff --git a/flow/benchmarks/bottleneck0.py b/flow/benchmarks/bottleneck0.py
index 3afa5a7b0..a81292f3b 100644
--- a/flow/benchmarks/bottleneck0.py
+++ b/flow/benchmarks/bottleneck0.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for bottleneck0.
+
 Bottleneck in which the actions are specifying a desired velocity in a segment
 of space. The autonomous penetration rate in this example is 10%.
 
diff --git a/flow/benchmarks/bottleneck1.py b/flow/benchmarks/bottleneck1.py
index 5a7e57c6c..32af09ece 100644
--- a/flow/benchmarks/bottleneck1.py
+++ b/flow/benchmarks/bottleneck1.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for bottleneck1.
+
 Bottleneck in which the actions are specifying a desired velocity in a segment
 of space. The autonomous penetration rate in this example is 25%.
 Human lane changing is enabled.
diff --git a/flow/benchmarks/bottleneck2.py b/flow/benchmarks/bottleneck2.py
index faa2e54d7..59489123b 100644
--- a/flow/benchmarks/bottleneck2.py
+++ b/flow/benchmarks/bottleneck2.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for bottleneck2.
+
 Bottleneck in which the actions are specifying a desired velocity in a segment
 of space for a large bottleneck.
 The autonomous penetration rate in this example is 10%.
diff --git a/flow/benchmarks/figureeight0.py b/flow/benchmarks/figureeight0.py
index a583c7efa..f77851519 100644
--- a/flow/benchmarks/figureeight0.py
+++ b/flow/benchmarks/figureeight0.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for figureeight0.
+
 Trains a fraction of vehicles in a ring road structure to regulate the flow of
 vehicles through an intersection. In this example, the last vehicle in the
 network is an autonomous vehicle.
diff --git a/flow/benchmarks/figureeight1.py b/flow/benchmarks/figureeight1.py
index 4d735b865..1762875c3 100644
--- a/flow/benchmarks/figureeight1.py
+++ b/flow/benchmarks/figureeight1.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for figureeight1.
+
 Trains a fraction of vehicles in a ring road structure to regulate the flow of
 vehicles through an intersection. In this example, every other vehicle in the
 network is an autonomous vehicle.
diff --git a/flow/benchmarks/figureeight2.py b/flow/benchmarks/figureeight2.py
index 5067771cb..c206e7ca3 100644
--- a/flow/benchmarks/figureeight2.py
+++ b/flow/benchmarks/figureeight2.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for figureeight2.
+
 Trains a fraction of vehicles in a ring road structure to regulate the flow of
 vehicles through an intersection. In this example, every vehicle in the
 network is an autonomous vehicle.
diff --git a/flow/benchmarks/grid0.py b/flow/benchmarks/grid0.py
index 3bdf25cdd..3de890d1b 100644
--- a/flow/benchmarks/grid0.py
+++ b/flow/benchmarks/grid0.py
@@ -1,5 +1,4 @@
-"""
-Grid/green wave example
+"""Benchmark for grid0.
 
 Action Dimension: (9, )
 
diff --git a/flow/benchmarks/grid1.py b/flow/benchmarks/grid1.py
index 1e11bd471..0dde5b575 100644
--- a/flow/benchmarks/grid1.py
+++ b/flow/benchmarks/grid1.py
@@ -1,5 +1,4 @@
-"""
-Grid/green wave example
+"""Benchmark for grid1.
 
 Action Dimension: (25, )
 
diff --git a/flow/benchmarks/merge0.py b/flow/benchmarks/merge0.py
index 7beba2faf..c9975e143 100644
--- a/flow/benchmarks/merge0.py
+++ b/flow/benchmarks/merge0.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for merge0.
+
 Trains a small percentage of autonomous vehicles to dissipate shockwaves caused
 by merges in an open network. The autonomous penetration rate in this example
 is 10%.
diff --git a/flow/benchmarks/merge1.py b/flow/benchmarks/merge1.py
index 9336efc02..41f4c5542 100644
--- a/flow/benchmarks/merge1.py
+++ b/flow/benchmarks/merge1.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for merge1.
+
 Trains a small percentage of autonomous vehicles to dissipate shockwaves caused
 by merges in an open network. The autonomous penetration rate in this example
 is 25%.
diff --git a/flow/benchmarks/merge2.py b/flow/benchmarks/merge2.py
index dfcc8cee4..7a2f16234 100644
--- a/flow/benchmarks/merge2.py
+++ b/flow/benchmarks/merge2.py
@@ -1,4 +1,5 @@
-"""
+"""Benchmark for merge2.
+
 Trains a small percentage of autonomous vehicles to dissipate shockwaves caused
 by merges in an open network. The autonomous penetration rate in this example
 is 33.3%.
diff --git a/flow/benchmarks/rllab/trpo_runner.py b/flow/benchmarks/rllab/trpo_runner.py
index e5c413296..2fa29ee02 100644
--- a/flow/benchmarks/rllab/trpo_runner.py
+++ b/flow/benchmarks/rllab/trpo_runner.py
@@ -1,5 +1,4 @@
-"""
-Runner script for environments located in flow/benchmarks.
+"""Runs the environments located in flow/benchmarks.
 
 The environment file can be modified in the imports to change the environment
 this runner script is executed on. This script than handles running the rllab
@@ -26,6 +25,11 @@
 
 
 def run_task(*_):
+    """Implement the ``run_task`` method needed to run experiments with rllab.
+
+    Note that the flow-specific parameters are imported at the start of this
+    script and unzipped and processed here.
+    """
     env_name = flow_params["env_name"]
     exp_tag = flow_params["exp_tag"]
     sumo_params = flow_params["sumo"]
diff --git a/flow/benchmarks/rllib/ars_runner.py b/flow/benchmarks/rllib/ars_runner.py
index 9d5eafef8..c70ba50ed 100644
--- a/flow/benchmarks/rllib/ars_runner.py
+++ b/flow/benchmarks/rllib/ars_runner.py
@@ -1,9 +1,11 @@
-"""
-Runner script for environments located in flow/benchmarks.
+"""Runs the environments located in flow/benchmarks.
 
 The environment file can be modified in the imports to change the environment
-this runner script is executed on. Furthermore, the rllib specific algorithm/
-parameters can be specified here once and used on multiple environments.
+this runner script is executed on. This file runs the ARS algorithm in rllib
+and utilizes the hyper-parameters specified in:
+
+Simple random search provides a competitive approach to reinforcement learning
+by Mania et al.
 """
 import json
 
diff --git a/flow/benchmarks/rllib/es_runner.py b/flow/benchmarks/rllib/es_runner.py
index e69de29bb..574e06c5f 100644
--- a/flow/benchmarks/rllib/es_runner.py
+++ b/flow/benchmarks/rllib/es_runner.py
@@ -0,0 +1,9 @@
+"""Runs the environments located in flow/benchmarks.
+
+The environment file can be modified in the imports to change the environment
+this runner script is executed on. This file runs the ES algorithm in rllib
+and utilizes the hyper-parameters specified in:
+
+Evolution Strategies as a Scalable Alternative to Reinforcement Learning
+by Salimans et al.
+"""
diff --git a/flow/benchmarks/rllib/ppo_runner.py b/flow/benchmarks/rllib/ppo_runner.py
index 13104853d..9dcbb5462 100644
--- a/flow/benchmarks/rllib/ppo_runner.py
+++ b/flow/benchmarks/rllib/ppo_runner.py
@@ -1,9 +1,10 @@
-"""
-Runner script for environments located in flow/benchmarks.
+"""Runs the environments located in flow/benchmarks.
 
 The environment file can be modified in the imports to change the environment
-this runner script is executed on. Furthermore, the rllib specific algorithm/
-parameters can be specified here once and used on multiple environments.
+this runner script is executed on. This file runs the PPO algorithm in rllib
+and utilizes the hyper-parameters specified in:
+
+Proximal Policy Optimization Algorithms by Schulman et al.
 """
 import json
 
diff --git a/tests/slow_tests/test_benchmarks.py b/tests/slow_tests/test_benchmarks.py
new file mode 100644
index 000000000..4e1b68cfd
--- /dev/null
+++ b/tests/slow_tests/test_benchmarks.py
@@ -0,0 +1,88 @@
+import unittest
+import os
+
+from flow.benchmarks.baselines.bottleneck0 import bottleneck0_baseline
+from flow.benchmarks.baselines.bottleneck1 import bottleneck1_baseline
+from flow.benchmarks.baselines.bottleneck2 import bottleneck2_baseline
+from flow.benchmarks.baselines.figureeight012 import figure_eight_baseline
+from flow.benchmarks.baselines.grid0 import grid0_baseline
+from flow.benchmarks.baselines.grid1 import grid1_baseline
+from flow.benchmarks.baselines.merge012 import merge_baseline
+
+os.environ["TEST_FLAG"] = "True"
+
+
+class TestBaselines(unittest.TestCase):
+
+    """
+    Tests that the baselines in the benchmarks folder run to completion.
+    Checking that they return the expected values (e.g. those reported in
+    the CoRL paper or on the website) is still a TODO in each test.
+    """
+
+    def test_bottleneck0(self):
+        """
+        Tests flow/benchmarks/baselines/bottleneck0.py
+        """
+        # run the bottleneck to make sure it runs
+        bottleneck0_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+    def test_bottleneck1(self):
+        """
+        Tests flow/benchmarks/baselines/bottleneck1.py
+        """
+        # run the bottleneck to make sure it runs
+        bottleneck1_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+    def test_bottleneck2(self):
+        """
+        Tests flow/benchmarks/baselines/bottleneck2.py
+        """
+        # run the bottleneck to make sure it runs
+        bottleneck2_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+    def test_figure_eight(self):
+        """
+        Tests flow/benchmarks/baselines/figureeight012.py
+        """
+        # run the figure eight baseline to make sure it runs
+        figure_eight_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+    def test_grid0(self):
+        """
+        Tests flow/benchmarks/baselines/grid0.py
+        """
+        # run the grid0 baseline to make sure it runs
+        grid0_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+    def test_grid1(self):
+        """
+        Tests flow/benchmarks/baselines/grid1.py
+        """
+        # run the grid1 baseline to make sure it runs
+        grid1_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+    def test_merge(self):
+        """
+        Tests flow/benchmarks/baselines/merge012.py
+        """
+        # run the merge baseline to make sure it runs
+        merge_baseline(num_runs=1, sumo_binary="sumo")
+
+        # TODO: check that the performance measure is within some range
+
+
+if __name__ == '__main__':
+    unittest.main()