From 855bfc3efc14f5b74052367ab89385a66eedbf03 Mon Sep 17 00:00:00 2001 From: Nahuel Lofeudo Date: Sun, 16 Apr 2023 10:49:17 +0100 Subject: [PATCH] Copy the code to read the GTFS feed from gtfs_kit to gtfs_client.py to allow for selective loading of stop times. --- gtfs_client.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/gtfs_client.py b/gtfs_client.py index 86c0bc4..005b71e 100644 --- a/gtfs_client.py +++ b/gtfs_client.py @@ -6,9 +6,11 @@ import gtfs_kit as gk import os import pandas as pd import queue +import tempfile import time import threading import traceback +import shutil class GTFSClient(): def __init__(self, feed_url: str, stop_names: list[str], update_queue: queue.Queue, update_interval_seconds: int = 60): @@ -24,7 +26,7 @@ class GTFSClient(): print("The feed file was up to date") # Load the feed - self.feed = gk.read_feed(feed_name, dist_units='km') + self.feed = self._read_feed(feed_name, dist_units='km') self.stop_ids = self.__wanted_stop_ids() # Schedule refresh @@ -32,6 +34,65 @@ class GTFSClient(): if update_interval_seconds and update_queue: self._refresh_thread = threading.Thread(target=lambda: every(self._update_interval_seconds, self.refresh)) + def _read_feed(self, path: gk.Path, dist_units: str) -> gk.Feed: + """ + NOTE: This helper method was extracted from gtfs_kit.feed to modify it + to only load the stop_times for the stops we are interested in, + because loading the entire feed would use more memory than the SoC + in the Raspberry Pi Zero W has. + + Helper function for :func:`read_feed`. + Create a Feed instance from the given path and given distance units. + The path should be a directory containing GTFS text files or a + zip file that unzips as a collection of GTFS text files + (and not as a directory containing GTFS text files). + The distance units given must lie in :const:`constants.dist_units` + + Notes: + + - Ignore non-GTFS files in the feed + - Automatically strip whitespace from the column names in GTFS files + """ + path = gk.Path(path) + if not path.exists(): + raise ValueError(f"Path {path} does not exist") + + # Unzip path to temporary directory if necessary + if path.is_file(): + zipped = True + tmp_dir = tempfile.TemporaryDirectory() + src_path = gk.Path(tmp_dir.name) + shutil.unpack_archive(str(path), tmp_dir.name, "zip") + else: + zipped = False + src_path = path + + # Read files into feed dictionary of DataFrames + feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]} + for p in src_path.iterdir(): + table = p.stem + # Skip empty files, irrelevant files, and files with no data + if ( + p.is_file() + and p.stat().st_size + and p.suffix == ".txt" + and table in feed_dict + ): + # utf-8-sig gets rid of the byte order mark (BOM); + # see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string + df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig") + if not df.empty: + feed_dict[table] = gk.cn.clean_column_names(df) + + feed_dict["dist_units"] = dist_units + + # Delete temporary directory + if zipped: + tmp_dir.cleanup() + + # Create feed + return gk.Feed(**feed_dict) + def __wanted_stop_ids(self) -> pd.core.frame.DataFrame: """