Read feed files straight from the zip file to avoid wearing out the Pi's SD card.
parent 9ca4a19672
commit b8ec772c51
@@ -13,6 +13,7 @@ import time
 import threading
 import traceback
 import shutil
+import zipfile
 
 class GTFSClient():
     GTFS_URL = "https://api.nationaltransport.ie/gtfsr/v2/gtfsr?format=json"
@@ -50,76 +51,49 @@ class GTFSClient():
         """
         NOTE: This helper method was extracted from gtfs_kit.feed to modify it
         to only load the stop_times for the stops we are interested in,
-        because loading the entire feed would use more memory than the SoC
-        in the Raspberry Pi Zero W has.
-
-        Helper function for :func:`read_feed`.
-        Create a Feed instance from the given path and given distance units.
-        The path should be a directory containing GTFS text files or a
-        zip file that unzips as a collection of GTFS text files
-        (and not as a directory containing GTFS text files).
-        The distance units given must lie in :const:`constants.dist_units`
-
-        Notes:
-
-        - Ignore non-GTFS files in the feed
-        - Automatically strip whitespace from the column names in GTFS files
+        because loading the entire feed would use more memory than the Raspberry Pi Zero W has.
+        This version also reads CSV data straight from the zip file to avoid
+        wearing out the Pi's SD card.
         """
+        FILES_TO_LOAD = [
+            # List of feed files to load. stop_times.txt is loaded separately.
+            'shapes.txt',
+            'trips.txt',
+            'routes.txt',
+            'calendar.txt',
+            'calendar_dates.txt',
+            'stops.txt',
+            'agency.txt'
+        ]
 
         path = gk.Path(path)
         if not path.exists():
             raise ValueError(f"Path {path} does not exist")
 
-        # Unzip path to temporary directory if necessary
-        if path.is_file():
-            zipped = True
-            tmp_dir = tempfile.TemporaryDirectory()
-            src_path = gk.Path(tmp_dir.name)
-            shutil.unpack_archive(str(path), tmp_dir.name, "zip")
-        else:
-            zipped = False
-            src_path = path
-
         # Read files into feed dictionary of DataFrames
         feed_dict = {table: None for table in gk.cs.GTFS_REF["table"]}
-        stop_times_p = None
-        for p in src_path.iterdir():
-            table = p.stem
-            # Skip empty files, irrelevant files, and files with no data
-            if (
-                p.is_file()
-                and p.stat().st_size
-                and p.suffix == ".txt"
-                and table in feed_dict
-            ):
-                if p.name == "stop_times.txt":
-                    # Defer the loading of stop_times.txt until after the stop IDs are known
-                    stop_times_p = p
-                else:
-                    # utf-8-sig gets rid of the byte order mark (BOM);
-                    # see http://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
-                    df = pd.read_csv(p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
-                    if not df.empty:
-                        feed_dict[table] = gk.cn.clean_column_names(df)
-
-        # Finally, load stop_times.txt
-        if stop_times_p:
-            # Obtain the list of IDs of the desired stops. This is similar to what __wanted_stop_ids() does,
-            # but without a dependency on a fully formed feed object
-            wanted_stop_ids = feed_dict.get("stops")[feed_dict.get("stops")["stop_code"].isin(stop_codes)]["stop_id"]
-            iter_csv = pd.read_csv(stop_times_p, iterator=True, chunksize=1000)
-            df = pd.concat([chunk[chunk["stop_id"].isin(wanted_stop_ids)] for chunk in iter_csv])
-
-            #df = pd.read_csv(stop_times_p, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
-            if not df.empty:
-                feed_dict[stop_times_p.stem] = gk.cn.clean_column_names(df)
+        with zipfile.ZipFile(path) as z:
+            for filename in FILES_TO_LOAD:
+                table = filename.split(".")[0]
+                # read the file
+                with z.open(filename) as f:
+                    df = pd.read_csv(f, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
+                if not df.empty:
+                    feed_dict[table] = gk.cn.clean_column_names(df)
+
+            # Finally, load stop_times.txt
+            # Obtain the list of IDs of the desired stops. This is similar to what __wanted_stop_ids() does,
+            # but without a dependency on a fully formed feed object
+            wanted_stop_ids = feed_dict.get("stops")[feed_dict.get("stops")["stop_code"].isin(stop_codes)]["stop_id"]
+            with z.open("stop_times.txt") as f:
+                iter_csv = pd.read_csv(f, iterator=True, chunksize=1000, dtype=gk.cs.DTYPE, encoding="utf-8-sig")
+                df = pd.concat([chunk[chunk["stop_id"].isin(wanted_stop_ids)] for chunk in iter_csv])
+            if not df.empty:
+                feed_dict["stop_times"] = gk.cn.clean_column_names(df)
 
         feed_dict["dist_units"] = dist_units
 
-        # Delete temporary directory
-        if zipped:
-            tmp_dir.cleanup()
-
         # Create feed
         return gk.Feed(**feed_dict)
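For context, the pattern the new code relies on is reading a CSV member directly out of the zip archive with zipfile.ZipFile and pandas.read_csv, and streaming the large stop_times.txt in chunks so only the rows for the wanted stops are ever held in memory. Below is a minimal standalone sketch of that pattern, not the repository's code; the archive name "gtfs.zip" and the stop IDs are made-up placeholders.

# Minimal sketch of reading GTFS tables straight from a zip archive.
# "gtfs.zip" and WANTED_STOP_IDS are illustrative placeholders.
import zipfile
import pandas as pd

WANTED_STOP_IDS = {"8220DB000002", "8220DB000003"}  # hypothetical stop IDs

with zipfile.ZipFile("gtfs.zip") as z:
    # Small tables are read straight from the archive member; utf-8-sig
    # strips a leading byte order mark if the file has one.
    with z.open("stops.txt") as f:
        stops = pd.read_csv(f, encoding="utf-8-sig")

    # stop_times.txt is large, so stream it in 1000-row chunks and keep only
    # the rows for the stops of interest, bounding peak memory use.
    with z.open("stop_times.txt") as f:
        chunks = pd.read_csv(f, iterator=True, chunksize=1000, encoding="utf-8-sig")
        stop_times = pd.concat(
            [chunk[chunk["stop_id"].isin(WANTED_STOP_IDS)] for chunk in chunks]
        )

print(len(stops), len(stop_times))

Compared with the previous approach of unpacking the archive to a temporary directory with shutil.unpack_archive, this never writes the extracted text files to the SD card, at the cost of keeping the archive open while the tables are read.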