build_energy_totals: revision of eurostat report upgrade

2024-03-05 18:43:24 +01:00 · 2024-03-05 18:43:24 +01:00 · bf60da973b
commit bf60da973b
parent 5b513f81db
1 changed files with 63 additions and 137 deletions
--- a/scripts/build_energy_totals.py
+++ b/scripts/build_energy_totals.py
@ -37,54 +37,6 @@ def reverse(dictionary):
    return {v: k for k, v in dictionary.items()}


-eurostat_codes = {
-    "EU28": "EU",
-    "EA19": "EA",
-    "Belgium": "BE",
-    "Bulgaria": "BG",
-    "Czech Republic": "CZ",
-    "Denmark": "DK",
-    "Germany": "DE",
-    "Estonia": "EE",
-    "Ireland": "IE",
-    "Greece": "GR",
-    "Spain": "ES",
-    "France": "FR",
-    "Croatia": "HR",
-    "Italy": "IT",
-    "Cyprus": "CY",
-    "Latvia": "LV",
-    "Lithuania": "LT",
-    "Luxembourg": "LU",
-    "Hungary": "HU",
-    "Malta": "MA",
-    "Netherlands": "NL",
-    "Austria": "AT",
-    "Poland": "PL",
-    "Portugal": "PT",
-    "Romania": "RO",
-    "Slovenia": "SI",
-    "Slovakia": "SK",
-    "Finland": "FI",
-    "Sweden": "SE",
-    "United Kingdom": "GB",
-    "Iceland": "IS",
-    "Norway": "NO",
-    "Montenegro": "ME",
-    "FYR of Macedonia": "MK",
-    "Albania": "AL",
-    "Serbia": "RS",
-    "Turkey": "TU",
-    "Bosnia and Herzegovina": "BA",
-    "Kosovo\n(UNSCR 1244/99)": "KO",  # 2017 version
-    # 2016 version
-    "Kosovo\n(under United Nations Security Council Resolution 1244/99)": "KO",
-    "Moldova": "MO",
-    "Ukraine": "UK",
-    "Switzerland": "CH",
-}
-
-
 idees_rename = {"GR": "EL", "GB": "UK"}

 eu28 = cc.EU28as("ISO2").ISO2.tolist()
@ -121,79 +73,54 @@ def build_eurostat(input_eurostat, countries, year):
    """
    Return multi-index for all countries' energy data in TWh/a.
    """
-    # read in every country file in countries
-    eurostat = pd.DataFrame()
-    countries = [country if country != "GB" else "UK" for country in countries]
-    countries = [country if country != "GR" else "EL" for country in countries]
+    df = {}
+    countries = {idees_rename.get(country, country) for country in countries} - {"CH"}
    for country in countries:
-        filename = f"/{country}-Energy-balance-sheets-April-2023-edition.xlsb"
-        if os.path.exists(input_eurostat + filename):
-            df = pd.read_excel(
-                input_eurostat + filename,
+        filename = (
+            f"{input_eurostat}/{country}-Energy-balance-sheets-April-2023-edition.xlsb"
+        )
+        sheet = pd.read_excel(
+            filename,
            engine="pyxlsb",
            sheet_name=str(year),
            skiprows=4,
            index_col=list(range(4)),
        )
-            # replace entry 'Z' with 0
-            df.replace("Z", 0, inplace=True)
-            # write 'International aviation' to the 2nd level of the multiindex
-            index_number = (
-                df.index.get_level_values(1) == "International aviation"
-            ).argmax()
-            new_index = (
-                "-",
-                "International aviation",
-                "International aviation",
-                "ktoe",
-            )
-            modified_index = list(df.index)
-            modified_index[index_number] = new_index
-            df.index = pd.MultiIndex.from_tuples(modified_index, names=df.index.names)
-            # drop the annoying subhead line
-            df.drop(df[df[year] == year].index, inplace=True)
-            # replace 'Z' with 0
-            df = df.replace("Z", 0)
-            # add country to the multiindex
-            new_tuple = [(country, *idx) for idx in df.index]
-            new_mindex = pd.MultiIndex.from_tuples(
-                new_tuple, names=["country", None, "name", None, "unit"]
-            )
-            df.index = new_mindex
-            # make numeric values where possible
-            df = df.apply(pd.to_numeric, errors="coerce")
-            # drop non-numeric columns
-            non_numeric_cols = df.columns[df.dtypes != float]
-            df.drop(non_numeric_cols, axis=1, inplace=True)
-            # concatenate the dataframes
-            eurostat = pd.concat([eurostat, df], axis=0)
+        df[country] = sheet
+    df = pd.concat(df, axis=0)
+
+    # drop the "Unnamed..." columns and the column labelled with the year
+    unnamed_cols = df.columns[df.columns.astype(str).str.startswith("Unnamed")]
+    df.drop(unnamed_cols, axis=1, inplace=True)
+    df.drop(year, axis=1, inplace=True)
+
+    # make numeric values where possible
+    df.replace("Z", 0, inplace=True)
+    df = df.apply(pd.to_numeric, errors="coerce")
+    df = df.select_dtypes(include=[np.number])
+
+    # fill the missing index levels of the 'International aviation' rows with that name
+    int_avia = df.index.get_level_values(2) == "International aviation"
+    temp = df.loc[int_avia]
+    temp.index = pd.MultiIndex.from_frame(
+        temp.index.to_frame().fillna("International aviation")
+    )
+    df = pd.concat([temp, df.loc[~int_avia]])

-    eurostat.drop(["Unnamed: 4", year, "Unnamed: 6"], axis=1, inplace=True)
    # Renaming some indices
-    rename = {
+    index_rename = {
        "Households": "Residential",
        "Commercial & public services": "Services",
        "Domestic navigation": "Domestic Navigation",
+        "International maritime bunkers": "Bunkers",
    }
-    for name, rename in rename.items():
-        eurostat.index = eurostat.index.set_levels(
-            eurostat.index.levels[3].where(eurostat.index.levels[3] != name, rename),
-            level=3,
-        )
-    new_index = eurostat.index.set_levels(
-        eurostat.index.levels[2].where(
-            eurostat.index.levels[2] != "International maritime bunkers", "Bunkers"
-        ),
-        level=2,
-    )
-    eurostat.index = new_index
+    columns_rename = {"Total": "Total all products", "UK": "GB"}
+    df.rename(index=index_rename, columns=columns_rename, inplace=True)
+    df.sort_index(inplace=True)
+    df.index.names = [None] * len(df.index.names)

-    eurostat.rename(columns={"Total": "Total all products"}, inplace=True)
-    eurostat.index = eurostat.index.set_levels(
-        eurostat.index.levels[0].where(eurostat.index.levels[0] != "UK", "GB"), level=0
-    )
-
-    df = eurostat * 11.63 / 1e3
+    # convert to TWh/a from ktoe/a
+    df *= 11.63 / 1e3

    return df

@ -776,25 +703,25 @@ def build_transport_data(countries, population, idees):
    return transport_data


-def rescale(idees_countries, energy, eurostat):
+def rescale_idees_from_eurostat(
+    idees_countries, energy, eurostat, input_eurostat, countries
+):
    """
    Takes JRC IDEES data from 2015 and rescales it by the ratio of the eurostat
    data and the 2015 eurostat data.

    missing data: ['passenger car efficiency', 'passenger cars']
    """
+    main_cols = ["Total all products", "Electricity"]
    # read in the eurostat data for 2015
-    eurostat_2015 = build_eurostat(input_eurostat, countries, 2023, 2015)[
-        ["Total all products", "Electricity"]
-    ]
-    eurostat_year = eurostat[["Total all products", "Electricity"]]
+    eurostat_2015 = build_eurostat(input_eurostat, countries, 2015)[main_cols]
+    eurostat_year = eurostat[main_cols]
    # calculate the ratio of the two data sets
    ratio = eurostat_year / eurostat_2015
    ratio = ratio.droplevel([1, 4])
-    ratio.rename(
-        columns={"Total all products": "total", "Electricity": "ele"}, inplace=True
-    )
-    ratio = ratio.rename(index={"EL": "GR"}, level=0)
+    cols_rename = {"Total all products": "total", "Electricity": "ele"}
+    index_rename = {v: k for k, v in idees_rename.items()}
+    ratio.rename(columns=cols_rename, index=index_rename, inplace=True)

    mappings = {
        "Residential": {
@ -887,16 +814,16 @@ def rescale(idees_countries, energy, eurostat):
        for sector, mapping in mappings.items():
            sector_ratio = ratio.loc[(country, slice(None), sector)]

-            energy.loc[country, mapping["total"]] *= sector_ratio[["total"]].iloc[0, 0]
-            energy.loc[country, mapping["elec"]] *= sector_ratio[["ele"]].iloc[0, 0]
+            energy.loc[country, mapping["total"]] *= sector_ratio["total"].iloc[0]
+            energy.loc[country, mapping["elec"]] *= sector_ratio["ele"].iloc[0]

-        avi_d = ratio.loc[(country, slice(None), "Domestic aviation")]
-        avi_i = ratio.loc[(country, "International aviation", slice(None))]
-        energy.loc[country, avia_inter] *= avi_i[["total"]].iloc[0, 0]
-        energy.loc[country, avia_domestic] *= avi_d[["total"]].iloc[0, 0]
+        avi_d = ratio.loc[(country, slice(None), "Domestic aviation"), "total"]
+        avi_i = ratio.loc[(country, "International aviation", slice(None)), "total"]
+        energy.loc[country, avia_inter] *= avi_i.iloc[0]
+        energy.loc[country, avia_domestic] *= avi_d.iloc[0]

-        nav = ratio.loc[(country, slice(None), "Domestic Navigation")]
-        energy.loc[country, navigation] *= nav[["total"]].iloc[0, 0]
+        nav = ratio.loc[(country, slice(None), "Domestic Navigation"), "total"]
+        energy.loc[country, navigation] *= nav.iloc[0]

    return energy

@ -922,17 +849,16 @@ if __name__ == "__main__":
    input_eurostat = snakemake.input.eurostat
    eurostat = build_eurostat(input_eurostat, countries, data_year)
    swiss = build_swiss(data_year)
-    # data from idees only exists from 2000-2015
-    if data_year > 2015:
-        # read in latest data and rescale later
-        idees = build_idees(idees_countries, 2015)
-    else:
-        idees = build_idees(idees_countries, data_year)
+    # IDEES data only exists for 2000-2015; for later years, read in 2015 and rescale later
+    idees = build_idees(idees_countries, min(2015, data_year))

    energy = build_energy_totals(countries, eurostat, swiss, idees)

    if data_year > 2015:
-        energy = rescale(idees_countries, energy, eurostat)
+        logger.info("Data year is after 2015. Rescaling IDEES data based on eurostat.")
+        energy = rescale_idees_from_eurostat(
+            idees_countries, energy, eurostat, input_eurostat, countries
+        )

    energy.to_csv(snakemake.output.energy_name)