Stops

Downloads + Imports

%run "setup.ipynb"
CPU times: user 263 ms, sys: 143 ms, total: 406 ms
Wall time: 3.99 s
BokehJS 2.3.1 successfully loaded.

Read and format data

%time stops_df = pd.read_csv(zipfile.open('stops.txt'))
stops_df.tail()
stops_df.info()
CPU times: user 107 ms, sys: 7.92 ms, total: 115 ms
Wall time: 115 ms
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41914 entries, 0 to 41913
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stop_id              41914 non-null  object 
 1   stop_code            0 non-null      float64
 2   stop_name            41914 non-null  object 
 3   stop_desc            0 non-null      float64
 4   stop_lat             41914 non-null  float64
 5   stop_lon             41914 non-null  float64
 6   location_type        41914 non-null  int64  
 7   parent_station       28752 non-null  float64
 8   wheelchair_boarding  9101 non-null   float64
 9   platform_code        4436 non-null   object 
 10  zone_id              15369 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.5+ MB
stops_df.fillna('', inplace=True)
stops_df = stops_df.drop(['stop_code', 'stop_desc'], axis=1)
stops_df.loc[stops_df["wheelchair_boarding"] == '','wheelchair_boarding'] = 0
stops_df_multiple_stops = stops_df.copy()
stops_df.drop_duplicates(subset=['stop_name', 'location_type', 'wheelchair_boarding', 'platform_code'],keep='first', inplace = True)
stops_df.head()
stop_id stop_name stop_lat stop_lon location_type parent_station wheelchair_boarding platform_code zone_id
0 000008012713 Rangsdorf, Bahnhof 52.294125 13.431112 0 900000245025.0 0
1 000008010205 Leipzig, Hauptbahnhof 51.344817 12.381321 0 900000550090.0 0
2 000008010327 Senftenberg, Bahnhof 51.526790 14.003977 0 900000435000.0 0
3 000008010324 Schwerin, Hauptbahnhof 53.635261 11.407520 0 900000550112.0 0
4 000008012393 Mühlanger, Bahnhof 51.855704 12.748198 0 900000550319.0 0
stops_df.apply(lambda x: x.unique().size, axis=0)
stop_id                29601
stop_name              13155
stop_lat               13107
stop_lon               13119
location_type              2
parent_station         13121
wheelchair_boarding        2
platform_code             59
zone_id                14622
dtype: int64
# visualization with folium: takes way longer + more memory consumption than bokeh

#f = folium.Figure(width=800, height=600)
#m = folium.Map(location=[45.5236, -122.6750], prefer_canvas=True).add_to(f)
#for lat, lon in zip(stops_df['stop_lat'], stops_df['stop_lon']):
#    folium.CircleMarker(
#        location=[lat, lon],
#        radius=1,
#        color="#3186cc",
#        fill=True,
#        fill_color="#3186cc",
#    ).add_to(m)
#m
def merc_from_arrays(lats, lons):
    r_major = 6378137.000
    x = r_major * np.radians(lons)
    scale = x/lons
    y = 180.0/np.pi * np.log(np.tan(np.pi/4.0 + lats * (np.pi/180.0)/2.0)) * scale
    return (x, y)
p = figure(plot_width=800, plot_height=700,title="Public Transport Stops of VBB",tools="pan,wheel_zoom",
           x_range=(1215654.4978, 1721973.3732), y_range=(6533225.6816, 7296372.9720),
           x_axis_type="mercator", y_axis_type="mercator",
           tooltips=[("Name", "@stop_name"), ("platform", "@platform_code"), ("(Lat, Lon)", "(@stop_lat, @stop_lon)")])
p.add_tile(get_provider(OSM))
stops_df['merc_x'], stops_df['merc_y'] = merc_from_arrays(stops_df['stop_lat'], stops_df['stop_lon'])
p.circle(x='merc_x', y='merc_y', source=stops_df)
show(p)
© OpenStreetMap contributors
hv.output(backend="bokeh")
tiles = hv.element.tiles.OSM().opts(alpha=0.5)
stops = hv.Points(stops_df, ['merc_x', 'merc_y'], label='Public Transport Stops')
stops_wa = hv.Points(stops_df.loc[stops_df['wheelchair_boarding'] == 1], ['merc_x', 'merc_y'], label='Wheelchair accessible Stops')
tiles * hd.datashade(stops) + tiles * hd.datashade(stops_wa)
© OpenStreetMap contributors
© OpenStreetMap contributors

Stations with most stops

stops_df_multiple_stops['stop_name'].value_counts().head(10)
S Potsdam Hauptbahnhof                  26
Cottbus, Hauptbahnhof                   19
Potsdam, Medienstadt Babelsberg Bhf     19
S Königs Wusterhausen Bhf               19
S Wannsee Bhf (Berlin)                  18
S+U Berlin Hauptbahnhof                 18
Fürstenwalde, Bahnhof                   18
Potsdam, Johannes-Kepler-Platz          17
S Ostkreuz Bhf (Berlin)                 17
S+U Zoologischer Garten Bhf (Berlin)    17
Name: stop_name, dtype: int64
num_stops = stops_df_multiple_stops.groupby(['stop_name']).agg(num_stops=('stop_id', 'count')).query('num_stops > 1').sort_values('num_stops', ascending=False)
num_stops.describe()
num_stops
count 13120.000000
mean 3.191997
std 1.318984
min 2.000000
25% 3.000000
50% 3.000000
75% 3.000000
max 26.000000
num_stops_mean = num_stops['num_stops'].mean()
num_stops_median = num_stops['num_stops'].median()

fig, ax = plt.subplots()
sns.histplot(x='num_stops', data=num_stops, color=sns_c[3], ax=ax, discrete=True)
ax.axvline(x=num_stops_mean, color=sns_c[1], linestyle='--', label=f'mean = {num_stops_mean: ,.2f}')
ax.axvline(x=num_stops_median, color=sns_c[4], linestyle='--',label=f'median = {num_stops_median}')
ax.legend(loc='upper right')
ax.set(title='Number of Stops per Location', xlabel='number of stops', xlim=(0, None))
[Text(0.5, 1.0, 'Number of Stops per Location'),
 Text(0.5, 0, 'number of stops'),
 (0.0, 27.75)]
_images/stops_14_1.png

Stops per District

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen, Request, urlretrieve
from collections import OrderedDict

url = "https://www.suche-postleitzahl.org/download_files/public/plz-gebiete.shp.zip"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
}

request  = Request(url, None, headers)
resp = urlopen(request)
data = resp.read()
with ZipMemoryFile(data) as zip_memory_file:
    with zip_memory_file.open('plz-gebiete.shp') as collection:
        # collection.crs is {'init': dst_epsg} -> deprecated format
        plz_shapes = gpd.GeoDataFrame.from_features(collection, crs=collection.crs['init'])
# plz_shapes = gpd.read_file('plz_gebiete.zip')
plz_shapes.head()
geometry plz note
0 POLYGON ((5.86632 51.05110, 5.86692 51.05124, ... 52538 52538 Gangelt, Selfkant
1 POLYGON ((5.94504 51.82354, 5.94580 51.82409, ... 47559 47559 Kranenburg
2 POLYGON ((5.96811 51.05556, 5.96951 51.05660, ... 52525 52525 Waldfeucht, Heinsberg
3 POLYGON ((5.97486 50.79804, 5.97495 50.79809, ... 52074 52074 Aachen
4 POLYGON ((6.01507 50.94788, 6.03854 50.93561, ... 52531 52531 Übach-Palenberg
plz_population = pd.read_csv('plz_einwohner.csv', dtype={'plz': str, 'einwohner': int})
plz_population.head()
plz einwohner
0 01067 11957
1 01069 25491
2 01097 14821
3 01099 28018
4 01108 5876
plz_df = pd.merge(left=plz_shapes[['plz', 'geometry']], right=plz_population, on='plz', how='left')
plz_df.nlargest(5, 'einwohner')
plz geometry einwohner
2243 65428 POLYGON ((8.35638 49.95802, 8.35666 49.95805, ... 58826
471 40764 POLYGON ((6.90571 51.11044, 6.91022 51.11426, ... 56833
358 50354 POLYGON ((6.77866 50.87839, 6.77971 50.87898, ... 55073
43 52249 POLYGON ((6.19049 50.80714, 6.19104 50.80723, ... 54732
737 53757 POLYGON ((7.12540 50.77442, 7.12591 50.77492, ... 53997
stops_gdf = gpd.GeoDataFrame(stops_df, geometry=gpd.points_from_xy(stops_df['stop_lon'], stops_df['stop_lat']))
stops_gdf.set_crs(epsg=4326, inplace=True)
stops_gdf.head()
stop_id stop_name stop_lat stop_lon location_type parent_station wheelchair_boarding platform_code zone_id merc_x merc_y geometry
0 000008012713 Rangsdorf, Bahnhof 52.294125 13.431112 0 900000245025.0 0 1.495145e+06 6.853483e+06 POINT (13.43111 52.29413)
1 000008010205 Leipzig, Hauptbahnhof 51.344817 12.381321 0 900000550090.0 0 1.378282e+06 6.682516e+06 POINT (12.38132 51.34482)
2 000008010327 Senftenberg, Bahnhof 51.526790 14.003977 0 900000435000.0 0 1.558916e+06 6.715011e+06 POINT (14.00398 51.52679)
3 000008010324 Schwerin, Hauptbahnhof 53.635261 11.407520 0 900000550112.0 0 1.269879e+06 7.101379e+06 POINT (11.40752 53.63526)
4 000008012393 Mühlanger, Bahnhof 51.855704 12.748198 0 900000550319.0 0 1.419123e+06 6.774077e+06 POINT (12.74820 51.85570)
join_df = gpd.sjoin(stops_gdf, plz_df, how='inner', op="within")
join_df.drop(['index_right', 'einwohner'], axis=1, inplace=True)
join_df = join_df[join_df['parent_station'] == ""]
join_df.head()
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
/tmp/ipykernel_2219/347435686.py in <module>
----> 1 join_df = gpd.sjoin(stops_gdf, plz_df, how='inner', op="within")
      2 join_df.drop(['index_right', 'einwohner'], axis=1, inplace=True)
      3 join_df = join_df[join_df['parent_station'] == ""]
      4 join_df.head()

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/tools/sjoin.py in sjoin(left_df, right_df, how, op, lsuffix, rsuffix)
     87     _basic_checks(left_df, right_df, how, lsuffix, rsuffix)
     88 
---> 89     indices = _geom_predicate_query(left_df, right_df, op)
     90 
     91     joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/tools/sjoin.py in _geom_predicate_query(left_df, right_df, op)
    171             # see discussion at https://github.com/geopandas/geopandas/pull/1421
    172             predicate = "contains"
--> 173             sindex = left_df.sindex
    174             input_geoms = right_df.geometry
    175         else:

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/base.py in sindex(self)
   2628                [2]])
   2629         """
-> 2630         return self.geometry.values.sindex
   2631 
   2632     @property

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/array.py in sindex(self)
    307     def sindex(self):
    308         if self._sindex is None:
--> 309             self._sindex = _get_sindex_class()(self.data)
    310         return self._sindex
    311 

/opt/hostedtoolcache/Python/3.8.10/x64/lib/python3.8/site-packages/geopandas/sindex.py in _get_sindex_class()
     19     if compat.HAS_RTREE:
     20         return RTreeIndex
---> 21     raise ImportError(
     22         "Spatial indexes require either `rtree` or `pygeos`. "
     23         "See installation instructions at https://geopandas.org/install.html"

ImportError: Spatial indexes require either `rtree` or `pygeos`. See installation instructions at https://geopandas.org/install.html
count_df = join_df.groupby('plz', dropna=False).size().reset_index(name='stop_count')
plz_df = pd.merge(left=plz_df[['plz', 'geometry', 'einwohner']], right=count_df, on='plz', how='left')
plz_df['stops_per_inhabitant'] = plz_df.apply(lambda row: np.nan if row['einwohner'] == 0 else row['stop_count'] / row['einwohner'], axis=1)
plz_df.sort_values('stop_count', ascending=False)
fig, ax = plt.subplots(figsize=(5, 5))
plz_df.plot(
    ax=ax,
    column='einwohner',  
    categorical=False, 
    cmap='plasma_r',
    edgecolor='black',
    linewidth=0.05,
    legend=True
)
ax.set(
    title='Population per PLZ',
    aspect=1.3
);
fig, ax = plt.subplots(figsize=(5, 5))
plz_df.plot(
    ax=ax,
    column='stop_count',  
    categorical=False, 
    cmap='plasma_r',
    edgecolor='black',
    linewidth=0.1,
    legend=True
)
ax.set(
    title='Stops per PLZ',
    aspect=1.3
);
fig, ax = plt.subplots(figsize=(5, 5))
plz_df.plot(
    ax=ax,
    column='stops_per_inhabitant',  
    categorical=False, 
    cmap='plasma_r',
    edgecolor='black',
    linewidth=0.1,
)
ax.set(
    title='Stops per Inhabitant per PLZ',
    aspect=1.3
);