Load stop_times.txt separately from the other GTFS tables, and only keep the stop times for the stops we are interested in.
This commit is contained in:
parent
855bfc3efc
commit
6b45e27539
|
|
@ -26,7 +26,7 @@ class GTFSClient():
|
||||||
print("The feed file was up to date")
|
print("The feed file was up to date")
|
||||||
|
|
||||||
# Load the feed
|
# Load the feed
|
||||||
self.feed = self._read_feed(feed_name, dist_units='km')
|
self.feed = self._read_feed(feed_name, dist_units='km', stop_names = stop_names)
|
||||||
self.stop_ids = self.__wanted_stop_ids()
|
self.stop_ids = self.__wanted_stop_ids()
|
||||||
|
|
||||||
# Schedule refresh
|
# Schedule refresh
|
||||||
|
|
@ -34,7 +34,7 @@ class GTFSClient():
|
||||||
if update_interval_seconds and update_queue:
|
if update_interval_seconds and update_queue:
|
||||||
self._refresh_thread = threading.Thread(target=lambda: every(self._update_interval_seconds, self.refresh))
|
self._refresh_thread = threading.Thread(target=lambda: every(self._update_interval_seconds, self.refresh))
|
||||||
|
|
||||||
def _read_feed(self, path: gk.Path, dist_units: str) -> gk.Feed:
|
def _read_feed(self, path: gk.Path, dist_units: str, stop_names: list[str]) -> gk.Feed:
|
||||||
"""
|
"""
|
||||||
NOTE: This helper method was extracted from gtfs_kit.feed to modify it
|
NOTE: This helper method was extracted from gtfs_kit.feed to modify it
|
||||||
to only load the stop_times for the stops we are interested in,
|
to only load the stop_times for the stops we are interested in,
|
||||||
|
|
@ -69,6 +69,7 @@ class GTFSClient():
|
||||||
|
|
||||||
# Read files into feed dictionary of DataFrames
|
# Read files into feed dictionary of DataFrames
|
||||||
feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]}
|
feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]}
|
||||||
|
stop_times_p = None
|
||||||
for p in src_path.iterdir():
|
for p in src_path.iterdir():
|
||||||
table = p.stem
|
table = p.stem
|
||||||
# Skip empty files, irrelevant files, and files with no data
|
# Skip empty files, irrelevant files, and files with no data
|
||||||
|
|
@ -78,11 +79,28 @@ class GTFSClient():
|
||||||
and p.suffix == ".txt"
|
and p.suffix == ".txt"
|
||||||
and table in feed_dict
|
and table in feed_dict
|
||||||
):
|
):
|
||||||
# utf-8-sig gets rid of the byte order mark (BOM);
|
if p.name == "stop_times.txt":
|
||||||
# see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
|
# Defer the loading of stop_times.txt until after the stop IDs are known
|
||||||
df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
|
stop_times_p = p
|
||||||
if not df.empty:
|
else:
|
||||||
feed_dict[table] = gk.cn.clean_column_names(df)
|
# utf-8-sig gets rid of the byte order mark (BOM);
|
||||||
|
# see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
|
||||||
|
df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
|
||||||
|
if not df.empty:
|
||||||
|
feed_dict[table] = gk.cn.clean_column_names(df)
|
||||||
|
|
||||||
|
# Finally, load stop_times.txt
|
||||||
|
if stop_times_p:
|
||||||
|
# Obtain the list of IDs of the desired stops. This is similar to what __wanted_stop_ids() does,
|
||||||
|
# but without a dependency on a fully formed feed object
|
||||||
|
wanted_stop_ids = feed_dict.get("stops")[feed_dict.get("stops")["stop_name"].isin(stop_names)]["stop_id"]
|
||||||
|
|
||||||
|
iter_csv = pd.read_csv(stop_times_p, iterator=True, chunksize=1000)
|
||||||
|
df = pd.concat([chunk[chunk["stop_id"].isin(wanted_stop_ids)] for chunk in iter_csv])
|
||||||
|
|
||||||
|
#df = pd.read_csv(stop_times_p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
|
||||||
|
if not df.empty:
|
||||||
|
feed_dict[stop_times_p.stem] = gk.cn.clean_column_names(df)
|
||||||
|
|
||||||
feed_dict["dist_units"] = dist_units
|
feed_dict["dist_units"] = dist_units
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue