From a8682db4fc14c0f6688cccf1de53d6555063c71d Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Wed, 6 Mar 2024 20:42:45 +0100 Subject: [PATCH] build_energy_totals: pre-process data for all years first [incomplete] --- scripts/build_energy_totals.py | 303 ++++++++++++++++++++------------- 1 file changed, 186 insertions(+), 117 deletions(-) diff --git a/scripts/build_energy_totals.py b/scripts/build_energy_totals.py index 1ffc4ae2..8e9abb69 100644 --- a/scripts/build_energy_totals.py +++ b/scripts/build_energy_totals.py @@ -69,38 +69,58 @@ to_ipcc = { } -def build_eurostat(input_eurostat, countries, year): +def eurostat_per_country(input_eurostat, country): + filename = ( + f"{input_eurostat}/{country}-Energy-balance-sheets-April-2023-edition.xlsb" + ) + sheet = pd.read_excel( + filename, + engine="pyxlsb", + sheet_name=None, + skiprows=4, + index_col=list(range(4)), + ) + sheet.pop("Cover") + return pd.concat(sheet) + + +def build_eurostat(input_eurostat, countries): """ Return multi-index for all countries' energy data in TWh/a. """ - df = {} countries = {idees_rename.get(country, country) for country in countries} - {"CH"} - for country in countries: - filename = ( - f"{input_eurostat}/{country}-Energy-balance-sheets-April-2023-edition.xlsb" - ) - sheet = pd.read_excel( - filename, - engine="pyxlsb", - sheet_name=str(year), - skiprows=4, - index_col=list(range(4)), - ) - df[country] = sheet - df = pd.concat(df, axis=0) + + nprocesses = snakemake.threads + disable_progress = snakemake.config["run"].get("disable_progressbar", False) + + func = partial(eurostat_per_country, input_eurostat) + tqdm_kwargs = dict( + ascii=False, + unit=" country", + total=len(countries), + desc="Build from eurostat database", + disable=disable_progress, + ) + with mute_print(): + with mp.Pool(processes=nprocesses) as pool: + dfs = list(tqdm(pool.imap(func, countries), **tqdm_kwargs)) + + index_names = ["country", "year", "lvl1", "lvl2", "lvl3", "lvl4"] + df = pd.concat(dfs, keys=countries, names=index_names) + df.index = df.index.set_levels(df.index.levels[1].astype(int), level=1) # drop columns with all NaNs unnamed_cols = df.columns[df.columns.astype(str).str.startswith("Unnamed")] df.drop(unnamed_cols, axis=1, inplace=True) - df.drop(year, axis=1, inplace=True) + df.drop(list(range(1990, 2022)), axis=1, inplace=True, errors="ignore") # make numeric values where possible df.replace("Z", 0, inplace=True) df = df.apply(pd.to_numeric, errors="coerce") df = df.select_dtypes(include=[np.number]) - # write 'International aviation' to the 2nd level of the multiindex - int_avia = df.index.get_level_values(2) == "International aviation" + # write 'International aviation' to the lower level of the multiindex + int_avia = df.index.get_level_values(3) == "International aviation" temp = df.loc[int_avia] temp.index = pd.MultiIndex.from_frame( temp.index.to_frame().fillna("International aviation") @@ -113,11 +133,11 @@ def build_eurostat(input_eurostat, countries, year): "Commercial & public services": "Services", "Domestic navigation": "Domestic Navigation", "International maritime bunkers": "Bunkers", + "UK": "GB", } - columns_rename = {"Total": "Total all products", "UK": "GB"} + columns_rename = {"Total": "Total all products"} df.rename(index=index_rename, columns=columns_rename, inplace=True) df.sort_index(inplace=True) - df.index.names = [None] * len(df.index.names) # convert to TWh/a from ktoe/a df *= 11.63 / 1e3 @@ -125,13 +145,21 @@ def build_eurostat(input_eurostat, countries, year): return df -def build_swiss(year): +def build_swiss(): """ - Return a pd.Series of Swiss energy data in TWh/a. + Return a pd.DataFrame of Swiss energy data in TWh/a. """ fn = snakemake.input.swiss - df = pd.read_csv(fn, index_col=[0, 1]).loc["CH", str(year)] + df = pd.read_csv(fn, index_col=[0, 1]) + + df.columns = df.columns.astype(int) + + df.columns.name = "year" + + df = df.stack().unstack("item") + + df.columns.name = None # convert PJ/a to TWh/a df /= 3.6 @@ -139,35 +167,35 @@ def build_swiss(year): return df -def idees_per_country(ct, year, base_dir): +def idees_per_country(ct, base_dir): ct_idees = idees_rename.get(ct, ct) fn_residential = f"{base_dir}/JRC-IDEES-2015_Residential_{ct_idees}.xlsx" fn_tertiary = f"{base_dir}/JRC-IDEES-2015_Tertiary_{ct_idees}.xlsx" fn_transport = f"{base_dir}/JRC-IDEES-2015_Transport_{ct_idees}.xlsx" + ct_totals = {} + # residential - df = pd.read_excel(fn_residential, "RES_hh_fec", index_col=0)[year] + df = pd.read_excel(fn_residential, "RES_hh_fec", index_col=0) rows = ["Advanced electric heating", "Conventional electric heating"] - ct_totals = { - "total residential space": df["Space heating"], - "electricity residential space": df[rows].sum(), - } - ct_totals["total residential water"] = df.at["Water heating"] + ct_totals["electricity residential space"] = df.loc[rows].sum() + ct_totals["total residential space"] = df.loc["Space heating"] + ct_totals["total residential water"] = df.loc["Water heating"] assert df.index[23] == "Electricity" ct_totals["electricity residential water"] = df.iloc[23] - ct_totals["total residential cooking"] = df["Cooking"] + ct_totals["total residential cooking"] = df.loc["Cooking"] assert df.index[30] == "Electricity" ct_totals["electricity residential cooking"] = df.iloc[30] - df = pd.read_excel(fn_residential, "RES_summary", index_col=0)[year] + df = pd.read_excel(fn_residential, "RES_summary", index_col=0) row = "Energy consumption by fuel - Eurostat structure (ktoe)" - ct_totals["total residential"] = df[row] + ct_totals["total residential"] = df.loc[row] assert df.index[47] == "Electricity" ct_totals["electricity residential"] = df.iloc[47] @@ -180,27 +208,27 @@ def idees_per_country(ct, year, base_dir): # services - df = pd.read_excel(fn_tertiary, "SER_hh_fec", index_col=0)[year] + df = pd.read_excel(fn_tertiary, "SER_hh_fec", index_col=0) - ct_totals["total services space"] = df["Space heating"] + ct_totals["total services space"] = df.loc["Space heating"] rows = ["Advanced electric heating", "Conventional electric heating"] - ct_totals["electricity services space"] = df[rows].sum() + ct_totals["electricity services space"] = df.loc[rows].sum() - ct_totals["total services water"] = df["Hot water"] + ct_totals["total services water"] = df.loc["Hot water"] assert df.index[24] == "Electricity" ct_totals["electricity services water"] = df.iloc[24] - ct_totals["total services cooking"] = df["Catering"] + ct_totals["total services cooking"] = df.loc["Catering"] assert df.index[31] == "Electricity" ct_totals["electricity services cooking"] = df.iloc[31] - df = pd.read_excel(fn_tertiary, "SER_summary", index_col=0)[year] + df = pd.read_excel(fn_tertiary, "SER_summary", index_col=0) row = "Energy consumption by fuel - Eurostat structure (ktoe)" - ct_totals["total services"] = df[row] + ct_totals["total services"] = df.loc[row] assert df.index[50] == "Electricity" ct_totals["electricity services"] = df.iloc[50] @@ -216,7 +244,7 @@ def idees_per_country(ct, year, base_dir): start = "Detailed split of energy consumption (ktoe)" end = "Market shares of energy uses (%)" - df = pd.read_excel(fn_tertiary, "AGR_fec", index_col=0).loc[start:end, year] + df = pd.read_excel(fn_tertiary, "AGR_fec", index_col=0).loc[start:end] rows = [ "Lighting", @@ -224,30 +252,30 @@ def idees_per_country(ct, year, base_dir): "Specific electricity uses", "Pumping devices (electric)", ] - ct_totals["total agriculture electricity"] = df[rows].sum() + ct_totals["total agriculture electricity"] = df.loc[rows].sum() rows = ["Specific heat uses", "Low enthalpy heat"] - ct_totals["total agriculture heat"] = df[rows].sum() + ct_totals["total agriculture heat"] = df.loc[rows].sum() rows = [ "Motor drives", "Farming machine drives (diesel oil incl. biofuels)", "Pumping devices (diesel oil incl. biofuels)", ] - ct_totals["total agriculture machinery"] = df[rows].sum() + ct_totals["total agriculture machinery"] = df.loc[rows].sum() row = "Agriculture, forestry and fishing" - ct_totals["total agriculture"] = df[row] + ct_totals["total agriculture"] = df.loc[row] # transport - df = pd.read_excel(fn_transport, "TrRoad_ene", index_col=0)[year] + df = pd.read_excel(fn_transport, "TrRoad_ene", index_col=0) - ct_totals["total road"] = df["by fuel (EUROSTAT DATA)"] + ct_totals["total road"] = df.loc["by fuel (EUROSTAT DATA)"] - ct_totals["electricity road"] = df["Electricity"] + ct_totals["electricity road"] = df.loc["Electricity"] - ct_totals["total two-wheel"] = df["Powered 2-wheelers (Gasoline)"] + ct_totals["total two-wheel"] = df.loc["Powered 2-wheelers (Gasoline)"] assert df.index[19] == "Passenger cars" ct_totals["total passenger cars"] = df.iloc[19] @@ -268,16 +296,16 @@ def idees_per_country(ct, year, base_dir): ct_totals["electricity light duty road freight"] = df.iloc[49] row = "Heavy duty vehicles (Diesel oil incl. biofuels)" - ct_totals["total heavy duty road freight"] = df[row] + ct_totals["total heavy duty road freight"] = df.loc[row] assert df.index[61] == "Passenger cars" ct_totals["passenger car efficiency"] = df.iloc[61] - df = pd.read_excel(fn_transport, "TrRail_ene", index_col=0)[year] + df = pd.read_excel(fn_transport, "TrRail_ene", index_col=0) - ct_totals["total rail"] = df["by fuel (EUROSTAT DATA)"] + ct_totals["total rail"] = df.loc["by fuel (EUROSTAT DATA)"] - ct_totals["electricity rail"] = df["Electricity"] + ct_totals["electricity rail"] = df.loc["Electricity"] assert df.index[15] == "Passenger transport" ct_totals["total rail passenger"] = df.iloc[15] @@ -293,7 +321,7 @@ def idees_per_country(ct, year, base_dir): assert df.index[23] == "Electric" ct_totals["electricity rail freight"] = df.iloc[23] - df = pd.read_excel(fn_transport, "TrAvia_ene", index_col=0)[year] + df = pd.read_excel(fn_transport, "TrAvia_ene", index_col=0) assert df.index[6] == "Passenger transport" ct_totals["total aviation passenger"] = df.iloc[6] @@ -324,24 +352,24 @@ def idees_per_country(ct, year, base_dir): + ct_totals["total international aviation passenger"] ) - df = pd.read_excel(fn_transport, "TrNavi_ene", index_col=0)[year] + df = pd.read_excel(fn_transport, "TrNavi_ene", index_col=0) # coastal and inland - ct_totals["total domestic navigation"] = df["by fuel (EUROSTAT DATA)"] + ct_totals["total domestic navigation"] = df.loc["by fuel (EUROSTAT DATA)"] - df = pd.read_excel(fn_transport, "TrRoad_act", index_col=0)[year] + df = pd.read_excel(fn_transport, "TrRoad_act", index_col=0) assert df.index[85] == "Passenger cars" ct_totals["passenger cars"] = df.iloc[85] - return pd.Series(ct_totals, name=ct) + return pd.DataFrame(ct_totals) -def build_idees(countries, year): +def build_idees(countries): nprocesses = snakemake.threads disable_progress = snakemake.config["run"].get("disable_progressbar", False) - func = partial(idees_per_country, year=year, base_dir=snakemake.input.idees) + func = partial(idees_per_country, base_dir=snakemake.input.idees) tqdm_kwargs = dict( ascii=False, unit=" country", @@ -353,53 +381,67 @@ def build_idees(countries, year): with mp.Pool(processes=nprocesses) as pool: totals_list = list(tqdm(pool.imap(func, countries), **tqdm_kwargs)) - totals = pd.concat(totals_list, axis=1) + totals = pd.concat( + totals_list, + keys=countries, + names=["country", "year"], + ) # convert ktoe to TWh - exclude = totals.index.str.fullmatch("passenger cars") - totals.loc[~exclude] *= 11.63 / 1e3 + exclude = totals.columns.str.fullmatch("passenger cars") + totals.loc[:, ~exclude] *= 11.63 / 1e3 # convert TWh/100km to kWh/km - totals.loc["passenger car efficiency"] *= 10 + totals.loc[:, "passenger car efficiency"] *= 10 - return totals.T + return totals def build_energy_totals(countries, eurostat, swiss, idees): eurostat_fuels = {"electricity": "Electricity", "total": "Total all products"} + eurostat_countries = eurostat.index.levels[0] + eurostat_years = eurostat.index.levels[1] to_drop = ["passenger cars", "passenger car efficiency"] - df = idees.reindex(countries).drop(to_drop, axis=1) + new_index = pd.MultiIndex.from_product( + [countries, eurostat_years], names=["country", "year"] + ) - eurostat_countries = eurostat.index.levels[0] - in_eurostat = df.index.intersection(eurostat_countries) + df = idees.reindex(new_index).drop(to_drop, axis=1) + + in_eurostat = df.index.levels[0].intersection(eurostat_countries) # add international navigation - slicer = idx[in_eurostat, :, "Bunkers", :] - fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=0).sum() + slicer = idx[in_eurostat, :, :, "Bunkers", :] + fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() df.loc[in_eurostat, "total international navigation"] = fill_values # add swiss energy data - df.loc["CH"] = swiss + df = pd.concat([df.drop("CH"), swiss]).sort_index() # get values for missing countries based on Eurostat EnergyBalances # divide cooking/space/water according to averages in EU28 - missing = df.index[df["total residential"].isna()] - to_fill = missing.intersection(eurostat_countries) uses = ["space", "cooking", "water"] + to_fill = df.index[ + df["total residential"].isna() + & df.index.get_level_values("country").isin(eurostat_countries) + ] + c = to_fill.get_level_values("country") + y = to_fill.get_level_values("year") + for sector in ["residential", "services", "road", "rail"]: eurostat_sector = sector.capitalize() # fuel use for fuel in ["electricity", "total"]: - slicer = idx[to_fill, :, :, eurostat_sector] + slicer = idx[c, y, :, :, eurostat_sector] fill_values = ( - eurostat.loc[slicer, eurostat_fuels[fuel]].groupby(level=0).sum() + eurostat.loc[slicer, eurostat_fuels[fuel]].groupby(level=[0, 1]).sum() ) df.loc[to_fill, f"{fuel} {sector}"] = fill_values @@ -461,25 +503,27 @@ def build_energy_totals(countries, eurostat, swiss, idees): no_norway[f"total {sector}"] - no_norway[f"electricity {sector}"] ) fraction = nonelectric_use.div(nonelectric).mean() - df.loc["NO", f"total {sector} {use}"] = total_heating * fraction + df.loc["NO", f"total {sector} {use}"] = ( + total_heating * fraction + ).values df.loc["NO", f"electricity {sector} {use}"] = ( total_heating * fraction * elec_fraction - ) + ).values # Missing aviation - slicer = idx[to_fill, :, :, "Domestic aviation"] - fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=0).sum() + slicer = idx[c, y, :, :, "Domestic aviation"] + fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() df.loc[to_fill, "total domestic aviation"] = fill_values - slicer = idx[to_fill, :, :, "International aviation"] - fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=0).sum() + slicer = idx[c, y, :, :, "International aviation"] + fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() df.loc[to_fill, "total international aviation"] = fill_values # missing domestic navigation - slicer = idx[to_fill, :, :, "Domestic Navigation"] - fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=0).sum() + slicer = idx[c, y, :, :, "Domestic Navigation"] + fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() df.loc[to_fill, "total domestic navigation"] = fill_values # split road traffic for non-IDEES @@ -532,9 +576,11 @@ def build_energy_totals(countries, eurostat, swiss, idees): if "BA" in df.index: # fill missing data for BA (services and road energy data) # proportional to RS with ratio of total residential demand - missing = df.loc["BA"] == 0.0 - ratio = df.at["BA", "total residential"] / df.at["RS", "total residential"] - df.loc["BA", missing] = ratio * df.loc["RS", missing] + mean_BA = df.loc["BA"].loc[2014:2021, "total residential"].mean() + mean_RS = df.loc["RS"].loc[2014:2021, "total residential"].mean() + ratio = mean_BA / mean_RS + df.loc["BA"] = df.loc["BA"].replace(0.0, np.nan).values + df.loc["BA"] = df.loc["BA"].combine_first(ratio * df.loc["RS"]).values return df @@ -550,7 +596,7 @@ def build_district_heat_share(countries, idees): district_heat_share = district_heat / total_heat - district_heat_share = district_heat_share.reindex(countries) + district_heat_share = district_heat_share.reindex(countries, level="country") # Missing district heating share dh_share = ( @@ -578,8 +624,6 @@ def build_eea_co2(input_co2, year=1990, emissions_scope="CO2"): index_col = ["Country_code", "Pollutant_name", "Year", "Sector_name"] df = df.set_index(index_col).sort_index() - emissions_scope = emissions_scope - cts = ["CH", "EUA", "NO"] + eu28_eea slicer = idx[cts, emissions_scope, year, to_ipcc.values()] @@ -622,8 +666,8 @@ def build_eea_co2(input_co2, year=1990, emissions_scope="CO2"): return emissions / 1e3 -def build_eurostat_co2(input_eurostat, countries, year=1990): - eurostat = build_eurostat(input_eurostat, countries, year) +def build_eurostat_co2(eurostat, year=1990): + eurostat_year = eurostat.xs(year, level="year") specific_emissions = pd.Series(index=eurostat.columns, dtype=float) @@ -637,7 +681,7 @@ def build_eurostat_co2(input_eurostat, countries, year=1990): # Residual oil (No. 6) 0.298 # https://www.eia.gov/electricity/annual/html/epa_a_03.html - return eurostat.multiply(specific_emissions).sum(axis=1) + return eurostat_year.multiply(specific_emissions).sum(axis=1) def build_co2_totals(countries, eea_co2, eurostat_co2): @@ -704,7 +748,9 @@ def build_transport_data(countries, population, idees): def rescale_idees_from_eurostat( - idees_countries, energy, eurostat, input_eurostat, countries + idees_countries, + energy, + eurostat, ): """ Takes JRC IDEES data from 2015 and rescales it by the ratio of the eurostat @@ -714,11 +760,10 @@ def rescale_idees_from_eurostat( """ main_cols = ["Total all products", "Electricity"] # read in the eurostat data for 2015 - eurostat_2015 = build_eurostat(input_eurostat, countries, 2015)[main_cols] - eurostat_year = eurostat[main_cols] + eurostat_2015 = eurostat.xs(2015, level="year")[main_cols] # calculate the ratio of the two data sets - ratio = eurostat_year / eurostat_2015 - ratio = ratio.droplevel([1, 4]) + ratio = eurostat[main_cols] / eurostat_2015 + ratio = ratio.droplevel([2, 5]) cols_rename = {"Total all products": "total", "Electricity": "ele"} index_rename = {v: k for k, v in idees_rename.items()} ratio.rename(columns=cols_rename, index=index_rename, inplace=True) @@ -811,19 +856,45 @@ def rescale_idees_from_eurostat( ] for country in idees_countries: + slicer_target = idx[country, 2016:2021, :, :] + slicer_source = idx[country, 2015, :, :] for sector, mapping in mappings.items(): - sector_ratio = ratio.loc[(country, slice(None), sector)] + sector_ratio = ratio.loc[(country, slice(None), slice(None), sector)].droplevel("lvl2") - energy.loc[country, mapping["total"]] *= sector_ratio["total"].iloc[0] - energy.loc[country, mapping["elec"]] *= sector_ratio["ele"].iloc[0] + energy.loc[slicer_target, mapping["total"]] = cartesian( + sector_ratio.loc[2016:2021, "total"], + energy.loc[slicer_source, mapping["total"]].squeeze() + ).values + energy.loc[slicer_target, mapping["elec"]] = cartesian( + sector_ratio.loc[2016:2021, "ele"], + energy.loc[slicer_source, mapping["elec"]].squeeze() + ).values - avi_d = ratio.loc[(country, slice(None), "Domestic aviation"), "total"] - avi_i = ratio.loc[(country, "International aviation", slice(None)), "total"] - energy.loc[country, avia_inter] *= avi_i.iloc[0] - energy.loc[country, avia_domestic] *= avi_d.iloc[0] + level_drops = ["country", "lvl2", "lvl3"] - nav = ratio.loc[(country, slice(None), "Domestic Navigation"), "total"] - energy.loc[country, navigation] *= nav.iloc[0] + slicer = idx[country, :, :, "Domestic aviation"] + avi_d = ratio.loc[slicer, "total"].droplevel(level_drops) + + slicer = idx[country, :, :, "International aviation"] + avi_i = ratio.loc[slicer, "total"].droplevel(level_drops) + + slicer = idx[country, :, :, "Domestic Navigation"] + nav = ratio.loc[slicer, "total"].droplevel(level_drops) + + energy.loc[slicer_target, avia_inter] = cartesian( + avi_i.loc[2016:2021], + energy.loc[slicer_source, avia_inter].squeeze() + ).values + + energy.loc[slicer_target, avia_domestic] = cartesian( + avi_d.loc[2016:2021], + energy.loc[slicer_source, avia_domestic].squeeze() + ).values + + energy.loc[slicer_target, navigation] = cartesian( + nav.loc[2016:2021], + energy.loc[slicer_source, navigation] + ).values return energy @@ -845,20 +916,18 @@ if __name__ == "__main__": countries = snakemake.params.countries idees_countries = pd.Index(countries).intersection(eu28) - data_year = params["energy_totals_year"] input_eurostat = snakemake.input.eurostat - eurostat = build_eurostat(input_eurostat, countries, data_year) - swiss = build_swiss(data_year) - # data from idees only exists from 2000-2015. read in latest data and rescale later - idees = build_idees(idees_countries, min(2015, data_year)) + eurostat = build_eurostat(input_eurostat, countries) + swiss = build_swiss() + idees = build_idees(idees_countries) energy = build_energy_totals(countries, eurostat, swiss, idees) - if data_year > 2015: - logger.info("Data year is after 2015. Rescaling IDEES data based on eurostat.") - energy = rescale_idees_from_eurostat( - idees_countries, energy, eurostat, input_eurostat, countries - ) + # Data from IDEES only exists from 2000-2015. + logger.info("Extrapolate IDEES data based on eurostat for years 2015-2021.") + energy = rescale_idees_from_eurostat( + idees_countries, energy, eurostat + ) energy.to_csv(snakemake.output.energy_name) @@ -871,7 +940,7 @@ if __name__ == "__main__": base_year_emissions = params["base_emissions_year"] emissions_scope = snakemake.params.energy["emissions"] eea_co2 = build_eea_co2(snakemake.input.co2, base_year_emissions, emissions_scope) - eurostat_co2 = build_eurostat_co2(input_eurostat, countries, base_year_emissions) + eurostat_co2 = build_eurostat_co2(eurostat, base_year_emissions) co2 = build_co2_totals(countries, eea_co2, eurostat_co2) co2.to_csv(snakemake.output.co2_name)