Copy the code to read the GTFS feed from gtfs_kit to gtfs_client.py to allow for selective loading of stop times.

This commit is contained in:
Nahuel Lofeudo 2023-04-16 10:49:17 +01:00
parent e4f4e30b27
commit 855bfc3efc
1 changed files with 62 additions and 1 deletions

View File

@ -6,9 +6,11 @@ import gtfs_kit as gk
import os import os
import pandas as pd import pandas as pd
import queue import queue
import tempfile
import time import time
import threading import threading
import traceback import traceback
import shutil
class GTFSClient(): class GTFSClient():
def __init__(self, feed_url: str, stop_names: list[str], update_queue: queue.Queue, update_interval_seconds: int = 60): def __init__(self, feed_url: str, stop_names: list[str], update_queue: queue.Queue, update_interval_seconds: int = 60):
@ -24,7 +26,7 @@ class GTFSClient():
print("The feed file was up to date") print("The feed file was up to date")
# Load the feed # Load the feed
self.feed = gk.read_feed(feed_name, dist_units='km') self.feed = self._read_feed(feed_name, dist_units='km')
self.stop_ids = self.__wanted_stop_ids() self.stop_ids = self.__wanted_stop_ids()
# Schedule refresh # Schedule refresh
@ -32,6 +34,65 @@ class GTFSClient():
if update_interval_seconds and update_queue: if update_interval_seconds and update_queue:
self._refresh_thread = threading.Thread(target=lambda: every(self._update_interval_seconds, self.refresh)) self._refresh_thread = threading.Thread(target=lambda: every(self._update_interval_seconds, self.refresh))
def _read_feed(self, path: gk.Path, dist_units: str) -> gk.Feed:
"""
NOTE: This helper method was extracted from gtfs_kit.feed to modify it
to only load the stop_times for the stops we are interested in,
because loading the entire feed would use more memory than the SoC
in the Raspberry Pi Zero W has.
Helper function for :func:`read_feed`.
Create a Feed instance from the given path and given distance units.
The path should be a directory containing GTFS text files or a
zip file that unzips as a collection of GTFS text files
(and not as a directory containing GTFS text files).
The distance units given must lie in :const:`constants.dist_units`
Notes:
- Ignore non-GTFS files in the feed
- Automatically strip whitespace from the column names in GTFS files
"""
path = gk.Path(path)
if not path.exists():
raise ValueError(f"Path {path} does not exist")
# Unzip path to temporary directory if necessary
if path.is_file():
zipped = True
tmp_dir = tempfile.TemporaryDirectory()
src_path = gk.Path(tmp_dir.name)
shutil.unpack_archive(str(path), tmp_dir.name, "zip")
else:
zipped = False
src_path = path
# Read files into feed dictionary of DataFrames
feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]}
for p in src_path.iterdir():
table = p.stem
# Skip empty files, irrelevant files, and files with no data
if (
p.is_file()
and p.stat().st_size
and p.suffix == ".txt"
and table in feed_dict
):
# utf-8-sig gets rid of the byte order mark (BOM);
# see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
if not df.empty:
feed_dict[table] = gk.cn.clean_column_names(df)
feed_dict["dist_units"] = dist_units
# Delete temporary directory
if zipped:
tmp_dir.cleanup()
# Create feed
return gk.Feed(**feed_dict)
def __wanted_stop_ids(self) -> pd.core.frame.DataFrame: def __wanted_stop_ids(self) -> pd.core.frame.DataFrame:
""" """