Stops¶
Downloads + Imports¶
Read and format data¶
# Load the GTFS stops table directly from the open feed archive;
# %time (IPython magic) reports how long the CSV parse takes.
%time stops_df = pd.read_csv(zipfile.open('stops.txt'))
# Quick sanity check: last rows plus column dtypes / null counts.
stops_df.tail()
stops_df.info()
CPU times: user 107 ms, sys: 7.92 ms, total: 115 ms
Wall time: 115 ms
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41914 entries, 0 to 41913
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 stop_id 41914 non-null object
1 stop_code 0 non-null float64
2 stop_name 41914 non-null object
3 stop_desc 0 non-null float64
4 stop_lat 41914 non-null float64
5 stop_lon 41914 non-null float64
6 location_type 41914 non-null int64
7 parent_station 28752 non-null float64
8 wheelchair_boarding 9101 non-null float64
9 platform_code 4436 non-null object
10 zone_id 15369 non-null object
dtypes: float64(6), int64(1), object(4)
memory usage: 3.5+ MB
# Replace every missing value with an empty string, then discard the two
# columns that the info() output showed to be entirely null.
stops_df = stops_df.fillna('')
stops_df = stops_df.drop(columns=['stop_code', 'stop_desc'])
# Treat "unknown" wheelchair accessibility as 0 (no information).
unknown_wheelchair = stops_df['wheelchair_boarding'] == ''
stops_df.loc[unknown_wheelchair, 'wheelchair_boarding'] = 0
# Keep the full table (one row per platform) before de-duplicating.
stops_df_multiple_stops = stops_df.copy()
dedup_cols = ['stop_name', 'location_type', 'wheelchair_boarding', 'platform_code']
stops_df = stops_df.drop_duplicates(subset=dedup_cols, keep='first')
stops_df.head()
stop_id | stop_name | stop_lat | stop_lon | location_type | parent_station | wheelchair_boarding | platform_code | zone_id | |
---|---|---|---|---|---|---|---|---|---|
0 | 000008012713 | Rangsdorf, Bahnhof | 52.294125 | 13.431112 | 0 | 900000245025.0 | 0 | ||
1 | 000008010205 | Leipzig, Hauptbahnhof | 51.344817 | 12.381321 | 0 | 900000550090.0 | 0 | ||
2 | 000008010327 | Senftenberg, Bahnhof | 51.526790 | 14.003977 | 0 | 900000435000.0 | 0 | ||
3 | 000008010324 | Schwerin, Hauptbahnhof | 53.635261 | 11.407520 | 0 | 900000550112.0 | 0 | ||
4 | 000008012393 | Mühlanger, Bahnhof | 51.855704 | 12.748198 | 0 | 900000550319.0 | 0 |
# Number of distinct values per column (NaN-free after the fillna above).
stops_df.apply(lambda col: len(col.unique()), axis=0)
stop_id 29601
stop_name 13155
stop_lat 13107
stop_lon 13119
location_type 2
parent_station 13121
wheelchair_boarding 2
platform_code 59
zone_id 14622
dtype: int64
# visualization with folium: takes way longer + more memory consumption than bokeh
#f = folium.Figure(width=800, height=600)
#m = folium.Map(location=[45.5236, -122.6750], prefer_canvas=True).add_to(f)
#for lat, lon in zip(stops_df['stop_lat'], stops_df['stop_lon']):
# folium.CircleMarker(
# location=[lat, lon],
# radius=1,
# color="#3186cc",
# fill=True,
# fill_color="#3186cc",
# ).add_to(m)
#m
def merc_from_arrays(lats, lons):
    """Project WGS84 coordinates to Web Mercator (EPSG:3857) metres.

    Parameters
    ----------
    lats, lons : array-like of float
        Latitudes / longitudes in degrees.

    Returns
    -------
    tuple of (x, y) arrays in metres.
    """
    r_major = 6378137.000  # WGS84 semi-major axis (spherical Mercator radius)
    x = r_major * np.radians(lons)
    # Compute y directly instead of via the original `scale = x / lons`
    # trick: that division yields NaN for any stop at longitude 0.
    # (scale == r_major * pi / 180, so the result is identical elsewhere.)
    y = r_major * np.log(np.tan(np.pi / 4.0 + np.radians(lats) / 2.0))
    return (x, y)
# Project the stop coordinates so they line up with the Mercator tiles.
stops_df['merc_x'], stops_df['merc_y'] = merc_from_arrays(stops_df['stop_lat'], stops_df['stop_lon'])
hover_fields = [
    ("Name", "@stop_name"),
    ("platform", "@platform_code"),
    ("(Lat, Lon)", "(@stop_lat, @stop_lon)"),
]
fig = figure(
    plot_width=800, plot_height=700,
    title="Public Transport Stops of VBB",
    tools="pan,wheel_zoom",
    x_range=(1215654.4978, 1721973.3732),
    y_range=(6533225.6816, 7296372.9720),
    x_axis_type="mercator", y_axis_type="mercator",
    tooltips=hover_fields,
)
fig.add_tile(get_provider(OSM))
fig.circle(x='merc_x', y='merc_y', source=stops_df)
show(fig)
hv.output(backend="bokeh")
# Semi-transparent OSM base layer so the datashaded points stay visible.
base_map = hv.element.tiles.OSM().opts(alpha=0.5)
all_stops = hv.Points(stops_df, ['merc_x', 'merc_y'], label='Public Transport Stops')
accessible_df = stops_df.loc[stops_df['wheelchair_boarding'] == 1]
accessible_stops = hv.Points(accessible_df, ['merc_x', 'merc_y'], label='Wheelchair accessible Stops')
base_map * hd.datashade(all_stops) + base_map * hd.datashade(accessible_stops)
Stations with most stops¶
# Ten station names with the most platforms/child stops.
stops_df_multiple_stops['stop_name'].value_counts().iloc[:10]
S Potsdam Hauptbahnhof 26
Cottbus, Hauptbahnhof 19
Potsdam, Medienstadt Babelsberg Bhf 19
S Königs Wusterhausen Bhf 19
S Wannsee Bhf (Berlin) 18
S+U Berlin Hauptbahnhof 18
Fürstenwalde, Bahnhof 18
Potsdam, Johannes-Kepler-Platz 17
S Ostkreuz Bhf (Berlin) 17
S+U Zoologischer Garten Bhf (Berlin) 17
Name: stop_name, dtype: int64
# Count platforms per station name, keep only names occurring more than
# once, and order the result with the busiest stations first.
per_name = stops_df_multiple_stops.groupby(['stop_name']).agg(num_stops=('stop_id', 'count'))
num_stops = per_name.query('num_stops > 1').sort_values('num_stops', ascending=False)
num_stops.describe()
num_stops | |
---|---|
count | 13120.000000 |
mean | 3.191997 |
std | 1.318984 |
min | 2.000000 |
25% | 3.000000 |
50% | 3.000000 |
75% | 3.000000 |
max | 26.000000 |
# Distribution of stops per station, annotated with mean and median.
mean_stops = num_stops['num_stops'].mean()
median_stops = num_stops['num_stops'].median()
fig, ax = plt.subplots()
sns.histplot(x='num_stops', data=num_stops, color=sns_c[3], ax=ax, discrete=True)
ax.axvline(x=mean_stops, color=sns_c[1], linestyle='--', label=f'mean = {mean_stops: ,.2f}')
ax.axvline(x=median_stops, color=sns_c[4], linestyle='--', label=f'median = {median_stops}')
ax.legend(loc='upper right')
ax.set(title='Number of Stops per Location', xlabel='number of stops', xlim=(0, None))
[Text(0.5, 1.0, 'Number of Stops per Location'),
Text(0.5, 0, 'number of stops'),
(0.0, 27.75)]

Stops per District¶
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen, Request, urlretrieve
from collections import OrderedDict
# Shapefile with one polygon per German postcode (PLZ) area.
url = "https://www.suche-postleitzahl.org/download_files/public/plz-gebiete.shp.zip"
# The server rejects urllib's default user agent, so pretend to be a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
}
request = Request(url, None, headers)
# Context manager closes the HTTP response once the payload is read
# (the original left the connection open).
with urlopen(request) as resp:
    data = resp.read()
# Read the shapefile straight from the in-memory zip (fiona ZipMemoryFile).
with ZipMemoryFile(data) as zip_memory_file:
    with zip_memory_file.open('plz-gebiete.shp') as collection:
        # collection.crs is {'init': dst_epsg} -> deprecated format
        plz_shapes = gpd.GeoDataFrame.from_features(collection, crs=collection.crs['init'])
# plz_shapes = gpd.read_file('plz_gebiete.zip')
plz_shapes.head()
geometry | plz | note | |
---|---|---|---|
0 | POLYGON ((5.86632 51.05110, 5.86692 51.05124, ... | 52538 | 52538 Gangelt, Selfkant |
1 | POLYGON ((5.94504 51.82354, 5.94580 51.82409, ... | 47559 | 47559 Kranenburg |
2 | POLYGON ((5.96811 51.05556, 5.96951 51.05660, ... | 52525 | 52525 Waldfeucht, Heinsberg |
3 | POLYGON ((5.97486 50.79804, 5.97495 50.79809, ... | 52074 | 52074 Aachen |
4 | POLYGON ((6.01507 50.94788, 6.03854 50.93561, ... | 52531 | 52531 Übach-Palenberg
# Inhabitants per postcode; read 'plz' as a string so leading zeros
# (e.g. "01067") survive and can be joined against the shapefile keys.
plz_population = pd.read_csv('plz_einwohner.csv', dtype={'plz': str, 'einwohner': int})
plz_population.head()
plz | einwohner | |
---|---|---|
0 | 01067 | 11957 |
1 | 01069 | 25491 |
2 | 01097 | 14821 |
3 | 01099 | 28018 |
4 | 01108 | 5876 |
# Attach the population figures to each postcode polygon (left join keeps
# polygons that have no population entry).
plz_df = plz_shapes[['plz', 'geometry']].merge(plz_population, on='plz', how='left')
plz_df.nlargest(5, 'einwohner')
plz | geometry | einwohner | |
---|---|---|---|
2243 | 65428 | POLYGON ((8.35638 49.95802, 8.35666 49.95805, ... | 58826 |
471 | 40764 | POLYGON ((6.90571 51.11044, 6.91022 51.11426, ... | 56833 |
358 | 50354 | POLYGON ((6.77866 50.87839, 6.77971 50.87898, ... | 55073 |
43 | 52249 | POLYGON ((6.19049 50.80714, 6.19104 50.80723, ... | 54732 |
737 | 53757 | POLYGON ((7.12540 50.77442, 7.12591 50.77492, ... | 53997 |
# Turn the stops table into a GeoDataFrame of points (x = lon, y = lat)
# and declare the coordinates as WGS84.
stop_points = gpd.points_from_xy(stops_df['stop_lon'], stops_df['stop_lat'])
stops_gdf = gpd.GeoDataFrame(stops_df, geometry=stop_points)
stops_gdf.set_crs(epsg=4326, inplace=True)
stops_gdf.head()
stop_id | stop_name | stop_lat | stop_lon | location_type | parent_station | wheelchair_boarding | platform_code | zone_id | merc_x | merc_y | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000008012713 | Rangsdorf, Bahnhof | 52.294125 | 13.431112 | 0 | 900000245025.0 | 0 | 1.495145e+06 | 6.853483e+06 | POINT (13.43111 52.29413) | ||
1 | 000008010205 | Leipzig, Hauptbahnhof | 51.344817 | 12.381321 | 0 | 900000550090.0 | 0 | 1.378282e+06 | 6.682516e+06 | POINT (12.38132 51.34482) | ||
2 | 000008010327 | Senftenberg, Bahnhof | 51.526790 | 14.003977 | 0 | 900000435000.0 | 0 | 1.558916e+06 | 6.715011e+06 | POINT (14.00398 51.52679) | ||
3 | 000008010324 | Schwerin, Hauptbahnhof | 53.635261 | 11.407520 | 0 | 900000550112.0 | 0 | 1.269879e+06 | 7.101379e+06 | POINT (11.40752 53.63526) | ||
4 | 000008012393 | Mühlanger, Bahnhof | 51.855704 | 12.748198 | 0 | 900000550319.0 | 0 | 1.419123e+06 | 6.774077e+06 | POINT (12.74820 51.85570) |
# Spatial join: assign each stop to the postcode polygon containing it.
# NOTE(review): this is the cell that raised ImportError below — geopandas
# needs the optional `rtree` or `pygeos` package for its spatial index.
join_df = gpd.sjoin(stops_gdf, plz_df, how='inner', op="within")
join_df.drop(['index_right', 'einwohner'], axis=1, inplace=True)
# Keep top-level stations only: parent_station was blanked to "" by the
# earlier fillna, so "" means "has no parent".
join_df = join_df[join_df['parent_station'] == ""]
join_df.head()
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
/tmp/ipykernel_2219/347435686.py in <module>
----> 1 join_df = gpd.sjoin(stops_gdf, plz_df, how='inner', op="within")
2 join_df.drop(['index_right', 'einwohner'], axis=1, inplace=True)
3 join_df = join_df[join_df['parent_station'] == ""]
4 join_df.head()
/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/tools/sjoin.py in sjoin(left_df, right_df, how, op, lsuffix, rsuffix)
87 _basic_checks(left_df, right_df, how, lsuffix, rsuffix)
88
---> 89 indices = _geom_predicate_query(left_df, right_df, op)
90
91 joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/tools/sjoin.py in _geom_predicate_query(left_df, right_df, op)
171 # see discussion at https://github.com/geopandas/geopandas/pull/1421
172 predicate = "contains"
--> 173 sindex = left_df.sindex
174 input_geoms = right_df.geometry
175 else:
/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/base.py in sindex(self)
2628 [2]])
2629 """
-> 2630 return self.geometry.values.sindex
2631
2632 @property
/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/array.py in sindex(self)
307 def sindex(self):
308 if self._sindex is None:
--> 309 self._sindex = _get_sindex_class()(self.data)
310 return self._sindex
311
/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/sindex.py in _get_sindex_class()
19 if compat.HAS_RTREE:
20 return RTreeIndex
---> 21 raise ImportError(
22 "Spatial indexes require either `rtree` or `pygeos`. "
23 "See installation instructions at https://geopandas.org/install.html"
ImportError: Spatial indexes require either `rtree` or `pygeos`. See installation instructions at https://geopandas.org/install.html
# Number of (top-level) stops per postcode.
count_df = join_df.groupby('plz', dropna=False).size().reset_index(name='stop_count')
# Left join keeps postcodes without any stop (stop_count becomes NaN).
plz_df = pd.merge(left=plz_df[['plz', 'geometry', 'einwohner']], right=count_df, on='plz', how='left')
# Vectorized replacement for the original row-wise apply: mapping a zero
# population to NaN makes the division yield NaN there, exactly like the
# explicit `np.nan if einwohner == 0` branch, and is far faster.
plz_df['stops_per_inhabitant'] = plz_df['stop_count'] / plz_df['einwohner'].replace(0, np.nan)
plz_df.sort_values('stop_count', ascending=False)
# Choropleth: inhabitants per postcode area.
fig, ax = plt.subplots(figsize=(5, 5))
plz_df.plot(
    column='einwohner',
    ax=ax,
    cmap='plasma_r',
    categorical=False,
    edgecolor='black',
    linewidth=0.05,
    legend=True,
)
ax.set(aspect=1.3, title='Population per PLZ');
# Choropleth: number of stops per postcode area.
fig, ax = plt.subplots(figsize=(5, 5))
plz_df.plot(
    column='stop_count',
    ax=ax,
    cmap='plasma_r',
    categorical=False,
    edgecolor='black',
    linewidth=0.1,
    legend=True,
)
ax.set(aspect=1.3, title='Stops per PLZ');
# Choropleth: stops per inhabitant (no legend, matching the original cell).
fig, ax = plt.subplots(figsize=(5, 5))
plz_df.plot(
    column='stops_per_inhabitant',
    ax=ax,
    cmap='plasma_r',
    categorical=False,
    edgecolor='black',
    linewidth=0.1,
)
ax.set(aspect=1.3, title='Stops per Inhabitant per PLZ');