From 6b45e275399d418549f069afcefd2364a158ea56 Mon Sep 17 00:00:00 2001
From: Nahuel Lofeudo <nlofeudo@gmail.com>
Date: Sun, 16 Apr 2023 13:49:46 +0100
Subject: [PATCH] Separate the loading of stop_times.txt and only load the stop
 times for the stops we are interested in.

---
 gtfs_client.py | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/gtfs_client.py b/gtfs_client.py
index 005b71e..765b73b 100644
--- a/gtfs_client.py
+++ b/gtfs_client.py
@@ -26,7 +26,7 @@ class GTFSClient():
             print("The feed file was up to date")
 
         # Load the feed
-        self.feed = self._read_feed(feed_name, dist_units='km')
+        self.feed = self._read_feed(feed_name, dist_units='km', stop_names = stop_names)
         self.stop_ids = self.__wanted_stop_ids()
 
         # Schedule refresh       
@@ -34,7 +34,7 @@ class GTFSClient():
         if update_interval_seconds and update_queue: 
             self._refresh_thread = threading.Thread(target=lambda: every(self._update_interval_seconds, self.refresh))
 
-    def _read_feed(self, path: gk.Path, dist_units: str) -> gk.Feed:
+    def _read_feed(self, path: gk.Path, dist_units: str, stop_names: list[str]) -> gk.Feed:
         """
         NOTE: This helper method was extracted from gtfs_kit.feed to modify it
         to only load the stop_times for the stops we are interested in,
@@ -69,6 +69,7 @@ class GTFSClient():
 
         # Read files into feed dictionary of DataFrames
         feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]}
+        stop_times_p = None
         for p in src_path.iterdir():
             table = p.stem
             # Skip empty files, irrelevant files, and files with no data
@@ -78,11 +79,28 @@ class GTFSClient():
                 and p.suffix == ".txt"
                 and table in feed_dict
             ):
-                # utf-8-sig gets rid of the byte order mark (BOM);
-                # see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
-                df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
-                if not df.empty:
-                    feed_dict[table] = gk.cn.clean_column_names(df)
+                if p.name == "stop_times.txt":
+                    # Defer the loading of stop_times.txt until after the stop IDs are known
+                    stop_times_p = p
+                else:
+                    # utf-8-sig gets rid of the byte order mark (BOM);
+                    # see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
+                    df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
+                    if not df.empty:
+                        feed_dict[table] = gk.cn.clean_column_names(df)
+
+        # Finally, load stop_times.txt
+        if stop_times_p:
+            # Obtain the list of IDs of the desired stops. This is similar to what __wanted_stop_ids() does, 
+            # but without a dependency on a fully formed feed object
+            wanted_stop_ids = feed_dict.get("stops")[feed_dict.get("stops")["stop_name"].isin(stop_names)]["stop_id"]
+
+            iter_csv = pd.read_csv(stop_times_p, iterator=True, chunksize=1000)
+            df = pd.concat([chunk[chunk["stop_id"].isin(wanted_stop_ids)] for chunk in iter_csv])
+
+            #df = pd.read_csv(stop_times_p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
+            if not df.empty:
+                feed_dict[stop_times_p.stem] = gk.cn.clean_column_names(df)
 
         feed_dict["dist_units"] = dist_units