build_energy_totals: revision of eurostat report upgrade

2024-03-05 18:43:24 +01:00 · 2024-03-05 18:43:24 +01:00 · bf60da973b
commit bf60da973b
parent 5b513f81db
1 changed files with 63 additions and 137 deletions
--- a/scripts/build_energy_totals.py
+++ b/scripts/build_energy_totals.py
@ -37,54 +37,6 @@ def reverse(dictionary):
    return {v: k for k, v in dictionary.items()}
 eurostat_codes = {
    "EU28": "EU",
    "EA19": "EA",
    "Belgium": "BE",
    "Bulgaria": "BG",
    "Czech Republic": "CZ",
    "Denmark": "DK",
    "Germany": "DE",
    "Estonia": "EE",
    "Ireland": "IE",
    "Greece": "GR",
    "Spain": "ES",
    "France": "FR",
    "Croatia": "HR",
    "Italy": "IT",
    "Cyprus": "CY",
    "Latvia": "LV",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Hungary": "HU",
    "Malta": "MA",
    "Netherlands": "NL",
    "Austria": "AT",
    "Poland": "PL",
    "Portugal": "PT",
    "Romania": "RO",
    "Slovenia": "SI",
    "Slovakia": "SK",
    "Finland": "FI",
    "Sweden": "SE",
    "United Kingdom": "GB",
    "Iceland": "IS",
    "Norway": "NO",
    "Montenegro": "ME",
    "FYR of Macedonia": "MK",
    "Albania": "AL",
    "Serbia": "RS",
    "Turkey": "TU",
    "Bosnia and Herzegovina": "BA",
    "Kosovo\n(UNSCR 1244/99)": "KO",  # 2017 version
    # 2016 version
    "Kosovo\n(under United Nations Security Council Resolution 1244/99)": "KO",
    "Moldova": "MO",
    "Ukraine": "UK",
    "Switzerland": "CH",
 }
 idees_rename = {"GR": "EL", "GB": "UK"}
 eu28 = cc.EU28as("ISO2").ISO2.tolist()
@ -121,79 +73,54 @@ def build_eurostat(input_eurostat, countries, year):
    """
    Return multi-index for all countries' energy data in TWh/a.
    """
-    # read in every country file in countries
+    df = {}
-    eurostat = pd.DataFrame()
+    countries = {idees_rename.get(country, country) for country in countries} - {"CH"}
    countries = [country if country != "GB" else "UK" for country in countries]
    countries = [country if country != "GR" else "EL" for country in countries]
    for country in countries:
-        filename = f"/{country}-Energy-balance-sheets-April-2023-edition.xlsb"
+        filename = (
-        if os.path.exists(input_eurostat + filename):
+            f"{input_eurostat}/{country}-Energy-balance-sheets-April-2023-edition.xlsb"
-            df = pd.read_excel(
+        )
-                input_eurostat + filename,
+        sheet = pd.read_excel(
-                engine="pyxlsb",
+            filename,
-                sheet_name=str(year),
+            engine="pyxlsb",
-                skiprows=4,
+            sheet_name=str(year),
-                index_col=list(range(4)),
+            skiprows=4,
-            )
+            index_col=list(range(4)),
-            # replace entry 'Z' with 0
+        )
-            df.replace("Z", 0, inplace=True)
+        df[country] = sheet
-            # write 'International aviation' to the 2nd level of the multiindex
+    df = pd.concat(df, axis=0)
-            index_number = (
+
-                df.index.get_level_values(1) == "International aviation"
+    # drop the "Unnamed" columns and the column labelled with the year
-            ).argmax()
+    unnamed_cols = df.columns[df.columns.astype(str).str.startswith("Unnamed")]
-            new_index = (
+    df.drop(unnamed_cols, axis=1, inplace=True)
-                "-",
+    df.drop(year, axis=1, inplace=True)
-                "International aviation",
+
-                "International aviation",
+    # make numeric values where possible
-                "ktoe",
+    df.replace("Z", 0, inplace=True)
-            )
+    df = df.apply(pd.to_numeric, errors="coerce")
-            modified_index = list(df.index)
+    df = df.select_dtypes(include=[np.number])
-            modified_index[index_number] = new_index
+
-            df.index = pd.MultiIndex.from_tuples(modified_index, names=df.index.names)
+    # fill any missing index levels of the 'International aviation' rows with that same label
-            # drop the annoying subhead line
+    int_avia = df.index.get_level_values(2) == "International aviation"
-            df.drop(df[df[year] == year].index, inplace=True)
+    temp = df.loc[int_avia]
-            # replace 'Z' with 0
+    temp.index = pd.MultiIndex.from_frame(
-            df = df.replace("Z", 0)
+        temp.index.to_frame().fillna("International aviation")
-            # add country to the multiindex
+    )
-            new_tuple = [(country, *idx) for idx in df.index]
+    df = pd.concat([temp, df.loc[~int_avia]])
            new_mindex = pd.MultiIndex.from_tuples(
                new_tuple, names=["country", None, "name", None, "unit"]
            )
            df.index = new_mindex
            # make numeric values where possible
            df = df.apply(pd.to_numeric, errors="coerce")
            # drop non-numeric columns
            non_numeric_cols = df.columns[df.dtypes != float]
            df.drop(non_numeric_cols, axis=1, inplace=True)
            # concatenate the dataframes
            eurostat = pd.concat([eurostat, df], axis=0)
    eurostat.drop(["Unnamed: 4", year, "Unnamed: 6"], axis=1, inplace=True)
    # Renaming some indices
-    rename = {
+    index_rename = {
        "Households": "Residential",
        "Commercial & public services": "Services",
        "Domestic navigation": "Domestic Navigation",
        "International maritime bunkers": "Bunkers",
    }
-    for name, rename in rename.items():
+    columns_rename = {"Total": "Total all products", "UK": "GB"}
-        eurostat.index = eurostat.index.set_levels(
+    df.rename(index=index_rename, columns=columns_rename, inplace=True)
-            eurostat.index.levels[3].where(eurostat.index.levels[3] != name, rename),
+    df.sort_index(inplace=True)
-            level=3,
+    df.index.names = [None] * len(df.index.names)
        )
    new_index = eurostat.index.set_levels(
        eurostat.index.levels[2].where(
            eurostat.index.levels[2] != "International maritime bunkers", "Bunkers"
        ),
        level=2,
    )
    eurostat.index = new_index
-    eurostat.rename(columns={"Total": "Total all products"}, inplace=True)
+    # convert to TWh/a from ktoe/a
-    eurostat.index = eurostat.index.set_levels(
+    df *= 11.63 / 1e3
        eurostat.index.levels[0].where(eurostat.index.levels[0] != "UK", "GB"), level=0
    )
    df = eurostat * 11.63 / 1e3
    return df
@ -776,25 +703,25 @@ def build_transport_data(countries, population, idees):
    return transport_data
-def rescale(idees_countries, energy, eurostat):
+def rescale_idees_from_eurostat(
    idees_countries, energy, eurostat, input_eurostat, countries
 ):
    """
    Takes JRC IDEES data from 2015 and rescales it by the ratio of the eurostat
    data and the 2015 eurostat data.
    missing data: ['passenger car efficiency', 'passenger cars']
    """
    main_cols = ["Total all products", "Electricity"]
    # read in the eurostat data for 2015
-    eurostat_2015 = build_eurostat(input_eurostat, countries, 2023, 2015)[
+    eurostat_2015 = build_eurostat(input_eurostat, countries, 2015)[main_cols]
-        ["Total all products", "Electricity"]
+    eurostat_year = eurostat[main_cols]
    ]
    eurostat_year = eurostat[["Total all products", "Electricity"]]
    # calculate the ratio of the two data sets
    ratio = eurostat_year / eurostat_2015
    ratio = ratio.droplevel([1, 4])
-    ratio.rename(
+    cols_rename = {"Total all products": "total", "Electricity": "ele"}
-        columns={"Total all products": "total", "Electricity": "ele"}, inplace=True
+    index_rename = {v: k for k, v in idees_rename.items()}
-    )
+    ratio.rename(columns=cols_rename, index=index_rename, inplace=True)
    ratio = ratio.rename(index={"EL": "GR"}, level=0)
    mappings = {
        "Residential": {
@ -887,16 +814,16 @@ def rescale(idees_countries, energy, eurostat):
        for sector, mapping in mappings.items():
            sector_ratio = ratio.loc[(country, slice(None), sector)]
-            energy.loc[country, mapping["total"]] *= sector_ratio[["total"]].iloc[0, 0]
+            energy.loc[country, mapping["total"]] *= sector_ratio["total"].iloc[0]
-            energy.loc[country, mapping["elec"]] *= sector_ratio[["ele"]].iloc[0, 0]
+            energy.loc[country, mapping["elec"]] *= sector_ratio["ele"].iloc[0]
-        avi_d = ratio.loc[(country, slice(None), "Domestic aviation")]
+        avi_d = ratio.loc[(country, slice(None), "Domestic aviation"), "total"]
-        avi_i = ratio.loc[(country, "International aviation", slice(None))]
+        avi_i = ratio.loc[(country, "International aviation", slice(None)), "total"]
-        energy.loc[country, avia_inter] *= avi_i[["total"]].iloc[0, 0]
+        energy.loc[country, avia_inter] *= avi_i.iloc[0]
-        energy.loc[country, avia_domestic] *= avi_d[["total"]].iloc[0, 0]
+        energy.loc[country, avia_domestic] *= avi_d.iloc[0]
-        nav = ratio.loc[(country, slice(None), "Domestic Navigation")]
+        nav = ratio.loc[(country, slice(None), "Domestic Navigation"), "total"]
-        energy.loc[country, navigation] *= nav[["total"]].iloc[0, 0]
+        energy.loc[country, navigation] *= nav.iloc[0]
    return energy
@ -922,17 +849,16 @@ if __name__ == "__main__":
    input_eurostat = snakemake.input.eurostat
    eurostat = build_eurostat(input_eurostat, countries, data_year)
    swiss = build_swiss(data_year)
-    # data from idees only exists from 2000-2015
+    # IDEES data only exists for 2000-2015; for later years, read the 2015 data and rescale it later
-    if data_year > 2015:
+    idees = build_idees(idees_countries, min(2015, data_year))
        # read in latest data and rescale later
        idees = build_idees(idees_countries, 2015)
    else:
        idees = build_idees(idees_countries, data_year)
    energy = build_energy_totals(countries, eurostat, swiss, idees)
    if data_year > 2015:
-        energy = rescale(idees_countries, energy, eurostat)
+        logger.info("Data year is after 2015. Rescaling IDEES data based on eurostat.")
        energy = rescale_idees_from_eurostat(
            idees_countries, energy, eurostat, input_eurostat, countries
        )
    energy.to_csv(snakemake.output.energy_name)