In [1]:
import pandas as pd
import plotly.express as px
In [2]:
# get a sample size of each data
jan24_df = pd.read_csv("data_citi/202401-citibike-tripdata_2.csv", nrows=5000)
jan25_df = pd.read_csv("data_citi/202501-citibike-tripdata_3.csv", nrows=5000)
# Add a column to mark the month
jan24_df["month"] = "Jan 2024"
jan25_df["month"] = "Jan 2025"
# Combine into one dataframe
sample_df = pd.concat([jan24_df, jan25_df], ignore_index=True)
In [3]:
# scatter map
fig = px.scatter_mapbox(
sample_df,
lat="start_lat",
lon="start_lng",
color="month", # Different color for 2024 vs 2025
mapbox_style="carto-positron",
zoom=11,
opacity=0.5,
title="Citi Bike Start Locations: Jan 2024 vs Jan 2025 (Sample of 5k each)"
)
fig.show()
/tmp/ipykernel_46037/3716393995.py:2: DeprecationWarning: *scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/ fig = px.scatter_mapbox(
Comparing January 2024 to January 2025: there is a lot more ridership of citibikes in CBD zone after congestion pricing
In [4]:
# Convert to datetime (Jan 2024)
jan24_df["started_at"] = pd.to_datetime(jan24_df["started_at"])
jan24_df["ended_at"] = pd.to_datetime(jan24_df["ended_at"])
# Compute ride duration in minutes
jan24_df["ride_minutes"] = (jan24_df["ended_at"] - jan24_df["started_at"]).dt.total_seconds() / 60
In [5]:
# check for cbd
def in_cbd(lat, lng):
return (lat < 40.7681) and (-74.02 < lng < -73.95)
jan24_df["in_cbd"] = jan24_df.apply(
lambda row: in_cbd(row["start_lat"], row["start_lng"]) or
in_cbd(row["end_lat"], row["end_lng"]),
axis=1
)
# group by month and if under cbd or not
duration_summary = (
jan24_df.groupby(["month", "in_cbd"])["ride_minutes"]
.mean()
.reset_index()
)
print(duration_summary)
month in_cbd ride_minutes 0 Jan 2024 False 8.943878 1 Jan 2024 True 10.815900
In [6]:
# convert to datetime (Jan 2024)
jan25_df["started_at"] = pd.to_datetime(jan25_df["started_at"])
jan25_df["ended_at"] = pd.to_datetime(jan25_df["ended_at"])
# compute ride duration in minutes
jan25_df["ride_minutes"] = (jan25_df["ended_at"] - jan25_df["started_at"]).dt.total_seconds() / 60
In [7]:
# check for cbd
def in_cbd(lat, lng):
return (lat < 40.7681) and (-74.02 < lng < -73.95)
jan25_df["in_cbd"] = jan25_df.apply(
lambda row: in_cbd(row["start_lat"], row["start_lng"]) or
in_cbd(row["end_lat"], row["end_lng"]),
axis=1
)
# group by month and if under cbd or not
duration_summary2 = (
jan25_df.groupby(["month", "in_cbd"])["ride_minutes"]
.mean()
.reset_index()
)
print(duration_summary)
month in_cbd ride_minutes 0 Jan 2024 False 8.943878 1 Jan 2024 True 10.815900
In [8]:
all_summary = pd.concat([duration_summary, duration_summary2], ignore_index=True)
all_summary.head()
Out[8]:
month | in_cbd | ride_minutes | |
---|---|---|---|
0 | Jan 2024 | False | 8.943878 |
1 | Jan 2024 | True | 10.815900 |
2 | Jan 2025 | False | 8.774655 |
3 | Jan 2025 | True | 9.944376 |
Compare to Jan 2024, Jan 2025 has a lower ride in minutes especially in CBD areas.