Read feed files straight from the zip file to avoid wearing out the Pi's SD card.

2023-04-22 12:23:39 +01:00 · 2023-04-22 12:23:39 +01:00 · b8ec772c51
parent 9ca4a19672
commit b8ec772c51
1 changed file with 26 additions and 52 deletions
--- a/gtfs_client.py
+++ b/gtfs_client.py
@@ -13,6 +13,7 @@ import time
 import threading
 import traceback
 import shutil
+import zipfile

 class GTFSClient():
    GTFS_URL = "https://api.nationaltransport.ie/gtfsr/v2/gtfsr?format=json"
@@ -50,76 +51,49 @@ class GTFSClient():
        """
        NOTE: This helper method was extracted from gtfs_kit.feed to modify it
        to only load the stop_times for the stops we are interested in,
-        because loading the entire feed would use more memory than the SoC 
-        in the Raspberry Pi Zero W has.
+        because loading the entire feed would use more memory than the Raspberry Pi Zero W has.

-        Helper function for :func:`read_feed`.
-        Create a Feed instance from the given path and given distance units.
-        The path should be a directory containing GTFS text files or a
-        zip file that unzips as a collection of GTFS text files
-        (and not as a directory containing GTFS text files).
-        The distance units given must lie in :const:`constants.dist_units`
-
-        Notes:
-
-        - Ignore non-GTFS files in the feed
-        - Automatically strip whitespace from the column names in GTFS files
+        This version also reads CSV data straight from the zip file to avoid
+        wearing out the Pi's SD card.
        """
+        FILES_TO_LOAD = [
+            # List of feed files to load. stop_times.txt is loaded separately.
+            'shapes.txt',
+            'trips.txt',
+            'routes.txt',
+            'calendar.txt',
+            'calendar_dates.txt',
+            'stops.txt',
+            'agency.txt'
+        ]
+
        path = gk.Path(path)
        if not path.exists():
            raise ValueError(f"Path {path} does not exist")

-        # Unzip path to temporary directory if necessary
-        if path.is_file():
-            zipped = True
-            tmp_dir = tempfile.TemporaryDirectory()
-            src_path = gk.Path(tmp_dir.name)
-            shutil.unpack_archive(str(path), tmp_dir.name, "zip")
-        else:
-            zipped = False
-            src_path = path
-
-        # Read files into feed dictionary of DataFrames
        feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]}
-        stop_times_p = None
-        for p in src_path.iterdir():
-            table = p.stem
-            # Skip empty files, irrelevant files, and files with no data
-            if (
-                p.is_file()
-                and p.stat().st_size
-                and p.suffix == ".txt"
-                and table in feed_dict
-            ):
-                if p.name == "stop_times.txt":
-                    # Defer the loading of stop_times.txt until after the stop IDs are known
-                    stop_times_p = p
-                else:
-                    # utf-8-sig gets rid of the byte order mark (BOM);
-                    # see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
-                    df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
+        with zipfile.ZipFile(path) as z:
+            for filename in FILES_TO_LOAD:
+                table = filename.split(".")[0]
+                # read the file
+                with z.open(filename) as f:
+                    df = pd.read_csv(f, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
                    if not df.empty:
                        feed_dict[table] = gk.cn.clean_column_names(df)

-        # Finally, load stop_times.txt
-        if stop_times_p:
+            # Finally, load stop_times.txt
            # Obtain the list of IDs of the desired stops. This is similar to what __wanted_stop_ids() does, 
            # but without a dependency on a fully formed feed object
            wanted_stop_ids = feed_dict.get("stops")[feed_dict.get("stops")["stop_code"].isin(stop_codes)]["stop_id"]
+            with z.open("stop_times.txt") as f:
+                iter_csv = pd.read_csv(f, iterator=True, chunksize=1000, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
+                df = pd.concat([chunk[chunk["stop_id"].isin(wanted_stop_ids)] for chunk in iter_csv])

-            iter_csv = pd.read_csv(stop_times_p, iterator=True, chunksize=1000)
-            df = pd.concat([chunk[chunk["stop_id"].isin(wanted_stop_ids)] for chunk in iter_csv])
-
-            #df = pd.read_csv(stop_times_p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
            if not df.empty:
-                feed_dict[stop_times_p.stem] = gk.cn.clean_column_names(df)
+                feed_dict["stop_times"] = gk.cn.clean_column_names(df)

        feed_dict["dist_units"] = dist_units

-        # Delete temporary directory
-        if zipped:
-            tmp_dir.cleanup()
-
        # Create feed
        return gk.Feed(**feed_dict)