import pandas as pd
import plotly.express as px

# Read the first 5,000 rows of each January trip file and label every row
# with the month it came from.
jan24_df = pd.read_csv(
    "data_citi/202401-citibike-tripdata_2.csv", nrows=5000
).assign(month="Jan 2024")
jan25_df = pd.read_csv(
    "data_citi/202501-citibike-tripdata_3.csv", nrows=5000
).assign(month="Jan 2025")

# Stack both samples into a single frame with a fresh 0..n-1 index
sample_df = pd.concat((jan24_df, jan25_df), ignore_index=True)

# Scatter map of trip start points, one color per month.
# px.scatter_mapbox is deprecated; px.scatter_map is its MapLibre-based
# replacement and takes `map_style` where the old call took `mapbox_style`.
fig = px.scatter_map(
    sample_df,
    lat="start_lat",
    lon="start_lng",
    color="month",   # Different color for 2024 vs 2025
    map_style="carto-positron",
    zoom=11,
    opacity=0.5,
    title="Citi Bike Start Locations: Jan 2024 vs Jan 2025 (Sample of 5k each)"
)

fig.show()

/tmp/ipykernel_46037/3716393995.py:2: DeprecationWarning: *scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/
  fig = px.scatter_mapbox(

# Parse the Jan 2024 trip start/end timestamps into datetimes
jan24_df[["started_at", "ended_at"]] = (
    jan24_df[["started_at", "ended_at"]].apply(pd.to_datetime)
)

# Ride length in minutes: seconds elapsed between end and start, divided by 60
jan24_df["ride_minutes"] = (
    jan24_df["ended_at"].sub(jan24_df["started_at"]).dt.total_seconds().div(60)
)

# check for cbd
def in_cbd(lat, lng):
    """Return whether a coordinate lies inside the rough CBD bounding box.

    The box is every point with latitude below 40.7681 and longitude strictly
    between -74.02 and -73.95. There is no lower latitude bound.
    NOTE(review): presumably this approximates the central business district
    boundary -- confirm the cut-offs against the intended zone.

    Args:
        lat: Latitude(s); a scalar or a pandas Series / NumPy array.
        lng: Longitude(s); same kind and length as ``lat``.

    Returns:
        A bool for scalar input, or an element-wise boolean Series/array for
        column input. Missing (NaN) coordinates evaluate to False.
    """
    # `&` instead of `and` / a chained comparison so whole columns can be
    # tested in one call; scalar inputs still yield a plain truth value.
    return (lat < 40.7681) & (lng > -74.02) & (lng < -73.95)


# A trip is flagged when either its start or its end point is inside the box.
# Computed column-wise rather than with a row-by-row DataFrame.apply.
jan24_df["in_cbd"] = (
    in_cbd(jan24_df["start_lat"], jan24_df["start_lng"])
    | in_cbd(jan24_df["end_lat"], jan24_df["end_lng"])
)

# Average ride_minutes for each (month, in_cbd) pair, with the group keys
# kept as ordinary columns
duration_summary = jan24_df.groupby(
    ["month", "in_cbd"], as_index=False
)["ride_minutes"].mean()

print(duration_summary)

      month  in_cbd  ride_minutes
0  Jan 2024   False      8.943878
1  Jan 2024    True     10.815900

# Parse the Jan 2025 trip start/end timestamps into datetimes
jan25_df[["started_at", "ended_at"]] = (
    jan25_df[["started_at", "ended_at"]].apply(pd.to_datetime)
)

# Ride length in minutes: seconds elapsed between end and start, divided by 60
jan25_df["ride_minutes"] = (
    jan25_df["ended_at"].sub(jan25_df["started_at"]).dt.total_seconds().div(60)
)

# check for cbd
def in_cbd(lat, lng):
    """Return whether a coordinate lies inside the rough CBD bounding box.

    The box is every point with latitude below 40.7681 and longitude strictly
    between -74.02 and -73.95. There is no lower latitude bound.
    NOTE(review): presumably this approximates the central business district
    boundary -- confirm the cut-offs against the intended zone.

    Args:
        lat: Latitude(s); a scalar or a pandas Series / NumPy array.
        lng: Longitude(s); same kind and length as ``lat``.

    Returns:
        A bool for scalar input, or an element-wise boolean Series/array for
        column input. Missing (NaN) coordinates evaluate to False.
    """
    # `&` instead of `and` / a chained comparison so whole columns can be
    # tested in one call; scalar inputs still yield a plain truth value.
    return (lat < 40.7681) & (lng > -74.02) & (lng < -73.95)


# A trip is flagged when either its start or its end point is inside the box.
# Computed column-wise rather than with a row-by-row DataFrame.apply.
jan25_df["in_cbd"] = (
    in_cbd(jan25_df["start_lat"], jan25_df["start_lng"])
    | in_cbd(jan25_df["end_lat"], jan25_df["end_lng"])
)

# Mean ride duration (minutes) for the Jan 2025 sample, grouped by month and
# by the in_cbd flag
duration_summary2 = (
    jan25_df.groupby(["month", "in_cbd"])["ride_minutes"]
          .mean()
          .reset_index()
)

# Show the Jan 2025 table built above (this used to re-print the Jan 2024
# summary, so the 2025 numbers were never displayed here)
print(duration_summary2)

      month  in_cbd  ride_minutes
0  Jan 2024   False      8.943878
1  Jan 2024    True     10.815900

# Put the Jan 2024 and Jan 2025 summaries in one table, renumbering the rows
all_summary = pd.concat(
    (duration_summary, duration_summary2),
    ignore_index=True,
)

# Preview the first rows of the combined table
all_summary.head()

	month	in_cbd	ride_minutes
0	Jan 2024	False	8.943878
1	Jan 2024	True	10.815900
2	Jan 2025	False	8.774655
3	Jan 2025	True	9.944376