removing old eurostat data reports as an option, cleaning up code

2024-02-27 12:04:07 +01:00 · 2024-02-27 12:04:07 +01:00 · d363aeb57d
commit d363aeb57d
parent 3298572ced
7 changed files with 152 additions and 234 deletions
--- a/config/config.default.yaml
+++ b/config/config.default.yaml
@ -316,7 +316,6 @@ pypsa_eur:
 energy:
  energy_totals_year: 2019
  base_emissions_year: 1990
-  eurostat_report_year: 2023
  emissions: CO2

 # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#biomass
--- a/rules/build_sector.smk
+++ b/rules/build_sector.smk
@ -270,7 +270,7 @@ rule build_energy_totals:
        swiss="data/switzerland-new_format-all_years.csv",
        idees="data/bundle-sector/jrc-idees-2015",
        district_heat_share="data/district_heat_share.csv",
-        eurostat=input_eurostat,
+        eurostat="data/bundle-sector/eurostat-energy_balances-april_2023_edition",
    output:
        energy_name=resources("energy_totals.csv"),
        co2_name=resources("co2_totals.csv"),
@ -865,7 +865,7 @@ rule prepare_sector_network:
        ),
        network=resources("networks/elec_s{simpl}_{clusters}_ec_l{ll}_{opts}.nc"),
        energy_totals_name=resources("energy_totals.csv"),
-        eurostat=input_eurostat,
+        eurostat="data/bundle-sector/eurostat-energy_balances-april_2023_edition",
        pop_weighted_energy_totals=resources(
            "pop_weighted_energy_totals_s{simpl}_{clusters}.csv"
        ),
--- a/rules/common.smk
+++ b/rules/common.smk
@ -129,14 +129,6 @@ def has_internet_access(url="www.zenodo.org") -> bool:
    finally:
        conn.close()

-
-def input_eurostat(w):
-    if config["energy"]["eurostat_report_year"] != 2023:
-        report_year = config["energy"]["eurostat_report_year"]
-        return f"data/bundle-sector/eurostat-energy_balances-june_{report_year}_edition"
-    else:
-        return "data/bundle-sector/eurostat-energy_balances-april_2023_edition"
-
 def solved_previous_horizon(w):
    planning_horizons = config_provider("scenario", "planning_horizons")(w)
    i = planning_horizons.index(int(w.planning_horizons))
--- a/rules/postprocess.smk
+++ b/rules/postprocess.smk
@ -247,7 +247,7 @@ rule plot_summary:
        costs=RESULTS + "csvs/costs.csv",
        energy=RESULTS + "csvs/energy.csv",
        balances=RESULTS + "csvs/supply_energy.csv",
-        eurostat=input_eurostat,
+        eurostat="data/bundle-sector/eurostat-energy_balances-april_2023_edition",
        co2="data/bundle-sector/eea/UNFCCC_v23.csv",
    output:
        costs=RESULTS + "graphs/costs.pdf",
--- a/rules/retrieve.smk
+++ b/rules/retrieve.smk
@ -142,6 +142,9 @@ if config["enable"]["retrieve"] and config["enable"].get(
        protected(
            directory("data/bundle-sector/eurostat-energy_balances-may_2018_edition")
        ),
+        protected(
+            directory("data/bundle-sector/eurostat-energy_balances-april_2023_edition")
+        ),
        protected(directory("data/bundle-sector/jrc-idees-2015")),
    ]

--- a/scripts/build_energy_totals.py
+++ b/scripts/build_energy_totals.py
@ -117,97 +117,65 @@ to_ipcc = {
 }


-def build_eurostat(input_eurostat, countries, report_year, year):
+def build_eurostat(input_eurostat, countries, year):
    """
    Return multi-index for all countries' energy data in TWh/a.
    """
-    if report_year != 2023:
-        filenames = {
-        2016: f"/{year}-Energy-Balances-June2016edition.xlsx",
-        2017: f"/{year}-ENERGY-BALANCES-June2017edition.xlsx",
-    }
-
-        with mute_print():
-            dfs = pd.read_excel(
-                input_eurostat + filenames[report_year],
-                sheet_name=None,
-                skiprows=1,
-                index_col=list(range(4)),
-            )
-
-        # sorted_index necessary for slicing
-        lookup = eurostat_codes
-        labelled_dfs = {
-            lookup[df.columns[0]]: df
-            for df in dfs.values()
-            if lookup[df.columns[0]] in countries
-        }
-        df = pd.concat(labelled_dfs, sort=True).sort_index()
-        # drop non-numeric and country columns
-        non_numeric_cols = df.columns[df.dtypes != float]
-        country_cols = df.columns.intersection(lookup.keys())
-        to_drop = non_numeric_cols.union(country_cols)
-        df.drop(to_drop, axis=1, inplace=True)
-
-        # convert ktoe/a to TWh/a
-        df *= 11.63 / 1e3
+    # read the April 2023 edition workbook of every requested country into one frame
+    eurostat = pd.DataFrame()
+    countries = [country if country != 'GB' else 'UK' for country in countries]  # files are looked up under 'UK', not 'GB'
+    countries = [country if country != 'GR' else 'EL' for country in countries]  # files are looked up under 'EL', not 'GR'
+    for country in countries:
+        filename = f"/{country}-Energy-balance-sheets-April-2023-edition.xlsb"
+        if os.path.exists(input_eurostat + filename):  # countries without a workbook are skipped silently
+            df = pd.read_excel(
+                input_eurostat + filename,
+                engine='pyxlsb',
+                sheet_name=str(year),
+                skiprows=4,
+                index_col=list(range(4)))
+            # replace entry 'Z' with 0
+            df.replace('Z', 0, inplace=True)
+            # relabel the first row whose level-1 index value is 'International aviation' with the tuple below; NOTE(review): argmax() returns 0 when nothing matches, so row 0 would be relabelled - confirm the row always exists
+            index_number = (df.index.get_level_values(1) == 'International aviation').argmax()
+            new_index = ('-', 'International aviation', 'International aviation', 'ktoe')
+            modified_index = list(df.index)
+            modified_index[index_number] = new_index
+            df.index = pd.MultiIndex.from_tuples(modified_index, names=df.index.names)
+            # drop the sub-header row(s), i.e. rows whose value in the `year` column equals the year itself
+            df.drop(df[df[year] == year].index, inplace=True)
+            # replace 'Z' with 0 again; NOTE(review): looks redundant with the in-place replace above - confirm
+            df = df.replace('Z', 0)
+            # prepend the country code as the outermost index level
+            new_tuple = [(country, *idx) for idx in df.index]
+            new_mindex = pd.MultiIndex.from_tuples(new_tuple, names=['country', None, 'name', None, 'unit'])
+            df.index = new_mindex
+            # convert every column to numeric; values that cannot be parsed become NaN
+            df = df.apply(pd.to_numeric, errors='coerce')
+            # drop every column whose dtype is not float (this also drops integer-typed columns)
+            non_numeric_cols = df.columns[df.dtypes != float]
+            df.drop(non_numeric_cols, axis=1, inplace=True)
+            # append this country's rows to the combined frame
+            eurostat = pd.concat([eurostat, df], axis=0)
    
-    else:
-        # read in every country file in countries
-        eurostat = pd.DataFrame()
-        countries = [country if country != 'GB' else 'UK' for country in countries]
-        countries = [country if country != 'GR' else 'EL' for country in countries]
-        for country in countries:
-            filename = f"/{country}-Energy-balance-sheets-April-2023-edition.xlsb"
-            if os.path.exists(input_eurostat + filename):
-                df = pd.read_excel(
-                    input_eurostat + filename,
-                    engine='pyxlsb',
-                    sheet_name=str(year),
-                    skiprows=4,
-                    index_col=list(range(4)))
-                # replace entry 'Z' with 0
-                df.replace('Z', 0, inplace=True)
-                # write 'International aviation' to the 2nd level of the multiindex
-                index_number = (df.index.get_level_values(1) == 'International aviation').argmax()
-                new_index = ('-', 'International aviation', 'International aviation', 'ktoe')
-                modified_index = list(df.index)
-                modified_index[index_number] = new_index
-                df.index = pd.MultiIndex.from_tuples(modified_index, names=df.index.names)
-                # drop the annoying subhead line
-                df.drop(df[df[year] == year].index, inplace=True)
-                # replace 'Z' with 0
-                df = df.replace('Z', 0)
-                # add country to the multiindex
-                new_tuple = [(country, *idx) for idx in df.index]
-                new_mindex = pd.MultiIndex.from_tuples(new_tuple, names=['country', None, 'name', None, 'unit'])
-                df.index = new_mindex
-                # make numeric values where possible
-                df = df.apply(pd.to_numeric, errors='coerce')
-                # drop non-numeric columns
-                non_numeric_cols = df.columns[df.dtypes != float]
-                df.drop(non_numeric_cols, axis=1, inplace=True)
-                # concatenate the dataframes
-                eurostat = pd.concat([eurostat, df], axis=0)
-        
-        eurostat.drop(["Unnamed: 4", year, "Unnamed: 6"], axis=1, inplace=True)
-        # Renaming some indices
-        rename = {
-            'Households': 'Residential',
-            'Commercial & public services': 'Services',
-            'Domestic navigation': 'Domestic Navigation'
-        }
-        for name, rename in rename.items():
-            eurostat.index = eurostat.index.set_levels(
-            eurostat.index.levels[3].where(eurostat.index.levels[3] != name, rename),
-            level=3)
-        new_index = eurostat.index.set_levels(eurostat.index.levels[2].where(eurostat.index.levels[2] != 'International maritime bunkers', 'Bunkers'), level=2)
-        eurostat.index = new_index
+    eurostat.drop(["Unnamed: 4", year, "Unnamed: 6"], axis=1, inplace=True)
+    # Rename selected labels of index level 3 (old label -> new label)
+    renames = {
+        'Households': 'Residential',
+        'Commercial & public services': 'Services',
+        'Domestic navigation': 'Domestic Navigation',
+    }
+    for old_label, new_label in renames.items():
+        # distinct loop names, so the mapping dict is not rebound while it is being iterated
+        level_3 = eurostat.index.levels[3]
+        eurostat.index = eurostat.index.set_levels(level_3.where(level_3 != old_label, new_label), level=3)
+    new_index = eurostat.index.set_levels(eurostat.index.levels[2].where(eurostat.index.levels[2] != 'International maritime bunkers', 'Bunkers'), level=2)
+    eurostat.index = new_index

-        eurostat.rename(columns={'Total': 'Total all products'}, inplace=True)
-        eurostat.index = eurostat.index.set_levels(eurostat.index.levels[0].where(eurostat.index.levels[0] != 'UK', 'GB'), level=0)
-        
-        df = eurostat * 11.63 / 1e3
+    eurostat.rename(columns={'Total': 'Total all products'}, inplace=True)
+    eurostat.index = eurostat.index.set_levels(eurostat.index.levels[0].where(eurostat.index.levels[0] != 'UK', 'GB'), level=0)
+    
+    df = eurostat * 11.63 / 1e3

    return df

@ -709,8 +677,8 @@ def build_eea_co2(input_co2, year=1990, emissions_scope="CO2"):
    return emissions / 1e3


-def build_eurostat_co2(input_eurostat, countries, report_year, year=1990):
-    eurostat = build_eurostat(input_eurostat, countries, report_year, year)
+def build_eurostat_co2(input_eurostat, countries, year=1990):
+    eurostat = build_eurostat(input_eurostat, countries, year)

    specific_emissions = pd.Series(index=eurostat.columns, dtype=float)

@ -727,49 +695,26 @@ def build_eurostat_co2(input_eurostat, countries, report_year, year=1990):
    return eurostat.multiply(specific_emissions).sum(axis=1)


-def build_co2_totals(countries, eea_co2, eurostat_co2, report_year):
+def build_co2_totals(countries, eea_co2, eurostat_co2):
    co2 = eea_co2.reindex(countries)

    for ct in pd.Index(countries).intersection(["BA", "RS", "AL", "ME", "MK"]):
-        if report_year != 2023:
-            mappings = {
-                "electricity": (
-                    ct,
-                    "+",
-                    "Conventional Thermal Power Stations",
-                    "of which From Coal",
-                ),
-                "residential non-elec": (ct, "+", "+", "Residential"),
-                "services non-elec": (ct, "+", "+", "Services"),
-                "road non-elec": (ct, "+", "+", "Road"),
-                "rail non-elec": (ct, "+", "+", "Rail"),
-                "domestic navigation": (ct, "+", "+", "Domestic Navigation"),
-                "international navigation": (ct, "-", "Bunkers"),
-                "domestic aviation": (ct, "+", "+", "Domestic aviation"),
-                "international aviation": (ct, "+", "+", "International aviation"),
-                # does not include industrial process emissions or fuel processing/refining
-                "industrial non-elec": (ct, "+", "Industry"),
-                # does not include non-energy emissions
-                "agriculture": (eurostat_co2.index.get_level_values(0) == ct)
-                & eurostat_co2.index.isin(["Agriculture / Forestry", "Fishing"], level=3),
-            }
-        else:
-            mappings = {
-                "electricity": (ct, "+", "Electricity & heat generation", np.nan),
-                "residential non-elec": (ct, "+", "+", "Residential"),
-                "services non-elec": (ct, "+", "+", "Services"),
-                "road non-elec": (ct, "+", "+", "Road"),
-                "rail non-elec": (ct, "+", "+", "Rail"),
-                "domestic navigation": (ct, "+", "+", "Domestic Navigation"),
-                "international navigation": (ct, "-", "Bunkers"),
-                "domestic aviation": (ct, "+", "+", "Domestic aviation"),
-                "international aviation": (ct, "-", "International aviation"),
-                # does not include industrial process emissions or fuel processing/refining
-                "industrial non-elec": (ct, "+", "Industry sector"),
-                # does not include non-energy emissions
-                "agriculture": (eurostat_co2.index.get_level_values(0) == ct)
-                & eurostat_co2.index.isin(["Agriculture & forestry", "Fishing"], level=3),
-            }
+        mappings = {
+            "electricity": (ct, "+", "Electricity & heat generation", np.nan),
+            "residential non-elec": (ct, "+", "+", "Residential"),
+            "services non-elec": (ct, "+", "+", "Services"),
+            "road non-elec": (ct, "+", "+", "Road"),
+            "rail non-elec": (ct, "+", "+", "Rail"),
+            "domestic navigation": (ct, "+", "+", "Domestic Navigation"),
+            "international navigation": (ct, "-", "Bunkers"),
+            "domestic aviation": (ct, "+", "+", "Domestic aviation"),
+            "international aviation": (ct, "-", "International aviation"),
+            # does not include industrial process emissions or fuel processing/refining
+            "industrial non-elec": (ct, "+", "Industry sector"),
+            # does not include non-energy emissions
+            "agriculture": (eurostat_co2.index.get_level_values(0) == ct)
+            & eurostat_co2.index.isin(["Agriculture & forestry", "Fishing"], level=3),
+        }

        for i, mi in mappings.items():
            co2.at[ct, i] = eurostat_co2.loc[mi].sum()
@ -820,83 +765,65 @@ def rescale(idees_countries, energy, eurostat):
    '''
    # read in the eurostat data for 2015
-    eurostat_2015 = build_eurostat(input_eurostat, countries, 2023, 2015)[["Total all products", "Electricity"]]
-    # eurostat_2015 = eurostat_2015.rename(index={'GB': 'UK'}, level=0)
+    eurostat_2015 = build_eurostat(input_eurostat, countries, 2015)[["Total all products", "Electricity"]]
    eurostat_year = eurostat[["Total all products", "Electricity"]]
    # calculate the ratio of the two data sets
    ratio = eurostat_year / eurostat_2015
    ratio = ratio.droplevel([1,4])
    ratio.rename(columns={"Total all products": "total", "Electricity": "ele"}, inplace=True)
-    ratio = ratio.rename(index={"GB": "UK"}, level=0)
+    ratio = ratio.rename(index={"EL": "GR"}, level=0)

-    residential_total = [
-        "total residential space",
-        "total residential water",
-        "total residential cooking",
-        "total residential",
-        "derived heat residential",
-        "thermal uses residential",
-    ]
-    residential_ele = [
-        "electricity residential space",
-        "electricity residential water",
-        "electricity residential cooking",
-        "electricity residential",
-    ]
-
-    service_total = [
-        "total services space",
-        "total services water",
-        "total services cooking",
-        "total services",
-        "derived heat services",
-        "thermal uses services",
-    ]
-    service_ele = [
-        "electricity services space",
-        "electricity services water",
-        "electricity services cooking",
-        "electricity services",
-    ]
-
-    agri_total = [
-        "total agriculture heat",
-        "total agriculture machinery",
-        "total agriculture",
-    ]
-    agri_ele = [
-        "total agriculture electricity",
-    ]
-
-    road_total = [
-        "total road",
-        "total passenger cars",
-        "total other road passenger",
-        "total light duty road freight",
-    ]
-    road_ele = [
-        "electricity road",
-        "electricity passenger cars",
-        "electricity other road passenger",
-        "electricity light duty road freight",
-    ]
-
-    rail_total = [
-        "total rail",
-        "total rail passenger",
-        "total rail freight",
-    ]
-    rail_ele = [
-        "electricity rail",
-        "electricity rail passenger",
-        "electricity rail freight",
-    ]
+    mappings = {
+        "Residential": {
+            "total": ["total residential space",
+                      "total residential water",
+                      "total residential cooking",
+                      "total residential",
+                      "derived heat residential",
+                      "thermal uses residential",],
+            "elec": ["electricity residential space",
+                     "electricity residential water",
+                     "electricity residential cooking",
+                     "electricity residential",]},
+        "Services": {
+            "total": ["total services space",
+                      "total services water",
+                      "total services cooking",
+                      "total services",
+                      "derived heat services",
+                      "thermal uses services",],
+            "elec": ["electricity services space",
+                     "electricity services water",
+                     "electricity services cooking",
+                     "electricity services",]},
+        "Agriculture & forestry": {
+            "total": ["total agriculture heat",
+                      "total agriculture machinery",
+                      "total agriculture",],
+            "elec": ["total agriculture electricity",]},
+        "Road": {
+            "total": ["total road",
+                      "total passenger cars",
+                      "total other road passenger",
+                      "total light duty road freight",],
+            "elec": ["electricity road",
+                     "electricity passenger cars",
+                     "electricity other road passenger",
+                     "electricity light duty road freight",]},
+        "Rail": {
+            "total": ["total rail",
+                      "total rail passenger",
+                      "total rail freight",],
+            "elec": ["electricity rail",
+                     "electricity rail passenger",
+                     "electricity rail freight",]},
+    }

    avia_inter = [
        'total aviation passenger',
        'total aviation freight',
        'total international aviation passenger',
        'total international aviation freight',
        'total international aviation'
    ]
    avia_domestic = [
        'total domestic aviation passenger',
@ -906,30 +833,14 @@ def rescale(idees_countries, energy, eurostat):
    navigation = [
        "total domestic navigation",
    ]
-    
-    idees_countries = idees_countries.repalce({'GB': 'UK', 'GR': 'EL'})

    for country in idees_countries:
-        res = ratio.loc[(country, slice(None), 'Residential')]
-        energy.loc[country, residential_total] *= res[['total']].iloc[0,0]
-        energy.loc[country, residential_ele] *= res[['ele']].iloc[0,0]
-
-        ser = ratio.loc[(country, slice(None), 'Services')]
-        energy.loc[country, service_total] *= ser[['total']].iloc[0,0]
-        energy.loc[country, service_ele] *= ser[['ele']].iloc[0,0]
-
-        agri = ratio.loc[(country, slice(None), 'Agriculture & forestry')]
-        energy.loc[country, agri_total] *= agri[['total']].iloc[0,0]
-        energy.loc[country, agri_ele] *= agri[['ele']].iloc[0,0]
-
-        road = ratio.loc[(country, slice(None), 'Road')]
-        energy.loc[country, road_total] *= road[['total']].iloc[0,0]
-        energy.loc[country, road_ele] *= road[['ele']].iloc[0,0]
-
-        rail = ratio.loc[(country, slice(None), 'Rail')]
-        energy.loc[country, rail_total] *= rail[['total']].iloc[0,0]
-        energy.loc[country, rail_ele] *= rail[['ele']].iloc[0,0]
+        for sector, mapping in mappings.items():
+            sector_ratio = ratio.loc[(country, slice(None), sector)]

+            energy.loc[country, mapping["total"]] *= sector_ratio[['total']].iloc[0,0]
+            energy.loc[country, mapping["elec"]] *= sector_ratio[['ele']].iloc[0,0]
+        
        avi_d = ratio.loc[(country, slice(None), 'Domestic aviation')]
        avi_i = ratio.loc[(country, 'International aviation', slice(None))]
        energy.loc[country, avia_inter] *= avi_i[['total']].iloc[0,0]
@ -958,9 +869,8 @@ if __name__ == "__main__":
    idees_countries = pd.Index(countries).intersection(eu28)

    data_year = params["energy_totals_year"]
-    report_year = snakemake.params.energy["eurostat_report_year"]
    input_eurostat = snakemake.input.eurostat
-    eurostat = build_eurostat(input_eurostat, countries, report_year, data_year)
+    eurostat = build_eurostat(input_eurostat, countries, data_year)
    swiss = build_swiss(data_year)
    # data from idees only exists for 2015
    if data_year > 2015:
@ -984,10 +894,10 @@ if __name__ == "__main__":
    emissions_scope = snakemake.params.energy["emissions"]
    eea_co2 = build_eea_co2(snakemake.input.co2, base_year_emissions, emissions_scope)
    eurostat_co2 = build_eurostat_co2(
-        input_eurostat, countries, report_year, base_year_emissions
+        input_eurostat, countries, base_year_emissions
    )

-    co2 = build_co2_totals(countries, eea_co2, eurostat_co2, report_year)
+    co2 = build_co2_totals(countries, eea_co2, eurostat_co2)
    co2.to_csv(snakemake.output.co2_name)

    transport = build_transport_data(countries, population, idees)
--- a/scripts/retrieve_sector_databundle.py
+++ b/scripts/retrieve_sector_databundle.py
@ -8,6 +8,7 @@ Retrieve and extract data bundle for sector-coupled studies.

 import logging
 import tarfile
+import zipfile
 from pathlib import Path

 from _helpers import (
@ -47,3 +48,16 @@ if __name__ == "__main__":
    tarball_fn.unlink()

    logger.info(f"Databundle available in '{to_fn}'.")
+
+    url_eurostat = "https://ec.europa.eu/eurostat/documents/38154/4956218/Balances-December2022.zip/f7cf0d19-5c0f-60ad-4e48-098a5ddd6e48?t=1671184070589"
+    zip_fn = Path(f"{rootpath}/data/bundle-sector/eurostat_2023.zip")
+    to_fn = Path(f"{rootpath}/data/bundle-sector/eurostat-energy_balances-april_2023_edition/")
+
+    logger.info(f"Downloading Eurostat data from '{url_eurostat}'.")
+    progress_retrieve(url_eurostat, zip_fn, disable=disable_progress)
+
+    logger.info("Extracting Eurostat data.")
+    with zipfile.ZipFile(zip_fn, 'r') as zip_ref:
+        zip_ref.extractall(to_fn)
+    zip_fn.unlink()  # remove the downloaded archive, as is done for the tarball above
+    logger.info(f"Eurostat data available in '{to_fn}'.")