Preparing the data for the plots
# Either load the already-processed dataset from FINAL_FILE_LOCATION, or
# rebuild it through the three pipeline calls below.
if READ_SAVED_FILE_DIRECTLY:
    print('Finding out pre-saved processed file..')
    nyc = loc_read_parquet_file(FINAL_FILE_LOCATION)
else:
    # Getting raw NYC taxi data
    nyc = get_raw_data(RAW_URL, QUERY, MONTHS, LIMIT, NYC_SQL_FILE_LOC,
                       fetch_if_exists=FETCH_IF_EXISTS)
    # Processing and adding new features
    nyc = prepare_data(nyc, NYC_RAW_FILE_LOC, fetch_if_exists=FETCH_IF_EXISTS)
    # Adding taxi zones and Mercator-transformed lat/lons
    nyc = process_data(nyc, FINAL_FILE_LOCATION, fetch_if_exists=FETCH_IF_EXISTS,
                       taxi_zones_file=TAXI_ZONES_FILE, transform_merc=True)
- See below the dataset
nyc
after feature engineering, cleaning, adding the zone reference and converting to Mercator projections for plotting. Note: This is the final dataset that is used for creating the interactive visualization. The main features that have been used for plotting are: pickup_weekday, pickup_hour, trip_duration_minutes, pickup_MercatorX, pickup_MercatorY, dropoff_MercatorX, dropoff_MercatorY, pickup_borough, dropoff_borough, pickup_zone_name, dropoff_zone_name.
>>> data_for_plotting.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1751081 entries, 0 to 1751080
Data columns (total 29 columns):
# Column Dtype
--- ------ -----
0 vendorid int64
1 pickup_datetime datetime64[ns]
2 dropoff_datetime datetime64[ns]
3 passenger_count int64
4 pickup_longitude float64
5 pickup_latitude float64
6 store_and_fwd_flag object
7 dropoff_longitude float64
8 dropoff_latitude float64
9 trip_duration float64
10 pickup_date datetime64[ns]
11 pickup_month int64
12 pickup_day int64
13 pickup_hour int64
14 pickup_weekday category
15 trip int64
16 distance_hav float64
17 bearing float64
18 trip_duration_minutes float64
19 pickup_taxizone_id float64
20 dropoff_taxizone_id float64
21 pickup_MercatorX float64
22 pickup_MercatorY float64
23 dropoff_MercatorX float64
24 dropoff_MercatorY float64
25 pickup_borough object
26 dropoff_borough object
27 pickup_zone_name object
28 dropoff_zone_name object
dtypes: category(1), datetime64[ns](3), float64(14), int64(6), object(5)
memory usage: 375.7+ MB
- The borough names, and zone id and names have been mapped to each location (lat/lon) for allowing analysis using zone and borough names. The
../data/external/taxi_zones_shape/taxi_zones.shp
file contains the data on the geographic boundaries of each zone, along with the corresponding borough name. And in the code, I’ve used multi-processing to speed up this mapping process by 8 to 10 times.
def assign_taxi_zones(df: pd.DataFrame, chunk: int | None = None,
                      lon_var: str = 'pickup_longitude', lat_var: str = 'pickup_latitude',
                      locid_var: str = 'pickup_taxizone_id',
                      taxi_zones_file: str = '../data/external/taxi_zones_shape/taxi_zones.shp',
                      ) -> pd.Series:
    """Map each (lon, lat) row of ``df`` to a taxi-zone ``LocationID``.

    Args:
        df: Frame containing the columns named by ``lon_var`` and ``lat_var``.
        chunk: Only used in the progress message printed by this function.
        lon_var: Name of the longitude column in ``df``.
        lat_var: Name of the latitude column in ``df``.
        locid_var: Name given to the returned series.
        taxi_zones_file: Path of the taxi-zones shapefile read with geopandas.

    Returns:
        The ``LocationID`` column produced by a left spatial join of the
        points against the zone polygons, renamed to ``locid_var``.

    Errors raised while reading the shapefile or joining propagate to the
    caller.
    """
    # Work on a copy since the lat/lon columns are modified below.
    localdf = df[[lon_var, lat_var]].copy()
    # Missing lat/lon info is indicated by NaN. Fill with zero,
    # which is outside the New York shapefile.
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)

    shape_df = gpd.read_file(taxi_zones_file)
    shape_df = shape_df.drop(columns=['OBJECTID', 'Shape_Area', 'Shape_Leng'])
    # Put the zone polygons in the same CRS as the points built below.
    shape_df = shape_df.to_crs(pyproj.CRS('epsg:4326'))

    print(f"assigning taxi zones to each location: {lon_var}, {lat_var}, chunk = {chunk}")
    local_gdf = gpd.GeoDataFrame(
        localdf, crs=pyproj.CRS('epsg:4326'),
        geometry=[Point(xy) for xy in zip(localdf[lon_var], localdf[lat_var])],
    )
    # 'within' matches each point to the zone polygon that contains it.
    # `predicate` is the current keyword for this option; the older `op`
    # keyword is no longer accepted by recent geopandas releases.
    local_gdf = gpd.sjoin(local_gdf, shape_df, how='left', predicate='within')
    return local_gdf.LocationID.rename(locid_var)
Two possible backgrounds for the plot
- Google Maps — needs your Google Maps API key
- Bokeh tile providers — easy to use, provides all the required information, as long as you don’t need Google Maps features. However, this option requires us to transform from the lat/long (EPSG: 4326) format to a Mercator projection (EPSG: 3857). Check out this link for the geopandas projections’ details.
I’ve added code for both. In case you decide to use Google Maps as the background, you will need to add your GMaps API key in an apikey.txt
file.
Projections for Bokeh
The coordinate reference system (CRS) is important because the geometric shapes in a GeoSeries or GeoDataFrame object are simply a collection of coordinates in an arbitrary space. A CRS tells Python how those coordinates relate to places on the Earth. (source)
The location data we have for the pickup and dropoff points in the original dataset is in lat-long (EPSG: 4326) format (degree lat, degree lon). However, for using OpenStreetMap tiles for Bokeh plots, we need to convert this to a spherical Mercator projection (EPSG: 3857). See an example code below.
import pyproj

# Converting NYC city limits to Mercator projection
nyc_long_limits = (-74.257159, -73.699215)
nyc_lat_limits = (40.471021, 40.987326)

# Setting coordinate systems: input is lat/lon (EPSG:4326),
# output is spherical Mercator (EPSG:3857).
inProj = pyproj.CRS('epsg:4326')
outProj = pyproj.CRS('epsg:3857')
# NOTE: despite its name, this transformer goes FROM lat/lon TO Mercator.
# always_xy=True means arguments and results are in (x, y) = (lon, lat) order.
transform_to_lat_lon = pyproj.Transformer.from_crs(inProj, outProj, always_xy=True)
nyc_lon1, nyc_lat1 = transform_to_lat_lon.transform(nyc_long_limits[0], nyc_lat_limits[0])
nyc_lon2, nyc_lat2 = transform_to_lat_lon.transform(nyc_long_limits[1], nyc_lat_limits[1])

## The transformed coordinates
# >>> nyc_lon1, nyc_lon2
# (-8266269.127635151, -8204159.085663989)
# >>> nyc_lat1, nyc_lat2
# (4934627.716294977, 5010472.431436094)
I’ve used the CartoDB Positron tile for Bokeh plots, however you can use anything you like from this list.
Creating the interactive visualization
- Bokeh itself has a models object which provides all the required tools to create the plots, add widgets like checkbox and slider, add glyph renderers for circles, etc. All these models are collected into a document instance which can be called by the client. This instance is interactive and can react to user inputs (e.g. a slider selecting an integer between 1 to 10). I’ve implemented this code in this function. You can try running this in a notebook or in a script to get an interactive graph (similar to what we have created using streamlit, but all of it using only bokeh).
- However, I decided to use a combination of bokeh and streamlit for easy deployment and sharing purposes. In this case, all the user interactions are abstracted away to be handled by streamlit (like select hour, weekday, zones, etc.). Bokeh gets the already filtered data and focuses only on the plotting part.
To simply start the streamlit server and visualization, run the following from your terminal:
> make stream_maps
This will run the streamlit_maps.py using streamlit, which in turn will run the streamlit_points or streamlit_lines methods in the NYC/streamlit_general_maps.py file based on user selection on the graph.
- When running for the first time, use
READ_SAVED_FILE_DIRECTLY=False
to run all the data fetch and processing functions and save the data. - The next time, you can run the same file using
READ_SAVED_FILE_DIRECTLY=True
to use the already saved data files.
This makes the above code independent, and that’s the only command you need to run to start the streamlit visualization.
In a nutshell, this is what the code does (for the points graph):
# Selecting the pickup or dropoff zones to visualise
pickup_or_dropoff_ed, pickup_or_dropoff = self.add_sidebar_pickup_or_dropoff()# Slider for selecting hour of the day
slider_hour = self.add_sidebar_hour()
# Selecting the weekday
checkbox_weekday = self.add_sidebar_weekday()
# Getting a listing of the on the market taxi zones, for the particular person to select from
zone_name_list = self.add_sidebar_select_zone(pickup_or_dropoff_ed, pickup_or_dropoff)
# Filtering the data based on particular person selection -- Reacting to particular person interaction
print('Filtering the dataframe based on particular person selection...')
df_to_plot = self.nyc[(self.nyc[pickup_or_dropoff].isin(zone_name_list))
& (self.nyc['pickup_weekday'].isin(checkbox_weekday))
& (self.nyc['pickup_hour'].isin(slider_hour))
]
print('Carried out filtering the data')
print(f"Number of journeys chosen: {df_to_plot.kind[0]}, out of {self.nyc.kind[0]}")
# Plotting the bokeh plot
print('Plotting the bokeh maps plot now....')
if is_gmaps:
assert api_key is simply not None, "in an effort to make use of google maps, you must cross in a legit google maps api key."
intrc_trips_loc_p = plot_gmaps_streamlit(
df_to_plot,
latitude_column= ['pickup_latitude', 'dropoff_latitude'],
longitude_column = ['pickup_longitude', 'dropoff_longitude'],
color_column = 'trip_duration_minutes', size_column = 5.0,
api_key = api_key, map_type = 'roadmap', map_zoom = 10,
width = 700, peak = 600
)
else:
# Using Bokeh tile maps
intrc_trips_loc_p = plot_cartmaps_streamlit(
data = df_to_plot,
latitude_column = ['pickup_MercatorY', 'dropoff_MercatorY'],
longitude_column= ['pickup_MercatorX', 'dropoff_MercatorX'],
map_tile_type = CARTODBPOSITRON,
nyc_long_limits = (-74.257159, -73.677215), #(-74.257159, -73.699215),
nyc_lat_limits = (40.471021, 40.987326), #(40.471021, 40.987326)
color_column = 'trip_duration_minutes',
size_column = 6,
width = 700, peak = 600
)
return intrc_trips_loc_p
And the lines chart is similar, except that instead of points, we plot the lines from one point to the other, and we plot the pickups and dropoffs on the same plot, unlike the points graph.
# ....................
# ....................# Plotting the strains from totally different zones to the chosen taxi zone/s
lines_p_dropoff = plot_src_to_dest_arrows(data = df_to_plot_dropoff,
pickup_or_dropoff = 'dropoff_zone_name',
custom_title = "Journeys ending on the chosen taxi zone/s (Blue circles are dropoff elements) n"
f"Number of journeys chosen: {df_to_plot_dropoff.kind[0]:,}",
**common_kwargs
)
# Combining the two plots proper right into a single format
format = gridplot([lines_p_pickup, lines_p_dropoff], ncols = 2,
sizing_mode = "scale_both", merge_tools = False) # using grid plot helps avoid the large padding appplied by row()
return format