Avoid double download of JRC-IDEES

This commit is contained in:
lisazeyen 2024-07-19 10:20:19 +02:00
parent 5f009590f5
commit 4c46c57fec
2 changed files with 22 additions and 29 deletions

View File

@ -329,9 +329,9 @@ def idees_per_country(ct: str, base_dir: str) -> pd.DataFrame:
"""
ct_idees = idees_rename.get(ct, ct)
fn_residential = f"{base_dir}/JRC-IDEES-2021_Residential_{ct_idees}.xlsx"
fn_tertiary = f"{base_dir}/JRC-IDEES-2021_Tertiary_{ct_idees}.xlsx"
fn_transport = f"{base_dir}/JRC-IDEES-2021_Transport_{ct_idees}.xlsx"
fn_residential = f"{base_dir}/{ct_idees}/JRC-IDEES-2021_Residential_{ct_idees}.xlsx"
fn_tertiary = f"{base_dir}/{ct_idees}/JRC-IDEES-2021_Tertiary_{ct_idees}.xlsx"
fn_transport = f"{base_dir}/{ct_idees}/JRC-IDEES-2021_Transport_{ct_idees}.xlsx"
ct_totals = {}
@ -1103,6 +1103,10 @@ def build_transport_data(
transport_data = pd.concat([transport_data, swiss_cars]).sort_index()
transport_data.rename(columns={"passenger cars": "number cars"}, inplace=True)
# clean up dataframe
years = np.arange(2000, 2022)
transport_data = transport_data[transport_data.index.get_level_values(1).isin(years)]
missing = transport_data.index[transport_data["number cars"].isna()]
if not missing.empty:

View File

@ -10,22 +10,19 @@ import logging
import os
import zipfile
from pathlib import Path
import requests
from _helpers import configure_logging, progress_retrieve, set_scenario_config
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
# Define the base URL
url_jrc = (
"https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/JRC-IDEES/JRC-IDEES-2021_v1/"
"https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/JRC-IDEES/JRC-IDEES-2021_v1/JRC-IDEES-2021.zip"
)
if __name__ == "__main__":
if "snakemake" not in globals():
from _helpers import mock_snakemake
snakemake = mock_snakemake("retrieve_jrc_idees")
rootpath = ".."
else:
@ -33,30 +30,22 @@ if __name__ == "__main__":
configure_logging(snakemake)
set_scenario_config(snakemake)
disable_progress = snakemake.config["run"].get("disable_progressbar", False)
# create a local directory to save the zip files
local_dir = snakemake.output[0]
if not os.path.exists(local_dir):
os.makedirs(local_dir)
# get the list of zip files from the JRC URL
response = requests.get(url_jrc)
soup = BeautifulSoup(response.text, "html.parser")
zip_files = [
link.get("href")
for link in soup.find_all("a")
if link.get("href").endswith(".zip")
]
to_fn = snakemake.output[0]
to_fn_zp = to_fn + ".zip"
# download .zip file
logger.info(
f"Downloading {len(zip_files)} .zip files for JRC IDEES from '{url_jrc}'."
f"Downloading JRC IDEES from {url_jrc}."
)
progress_retrieve(url_jrc, to_fn_zp, disable=disable_progress)
# extract
logger.info("Extracting JRC IDEES data.")
with zipfile.ZipFile(to_fn_zp, "r") as zip_ref:
zip_ref.extractall(to_fn)
# download and unpack each zip file
for zip_file in zip_files:
logger.info(f"Downloading and unpacking {zip_file}")
zip_url = url_jrc + zip_file
to_fn = local_dir + "/" + zip_file[:-4]
progress_retrieve(zip_url, to_fn, disable=disable_progress)
logger.info(f"JRC IDEES data available in '{to_fn}'.")