From b957f88645bf6aa31261e58c697d4da525c39334 Mon Sep 17 00:00:00 2001 From: lisazeyen Date: Mon, 22 Jul 2024 21:40:47 +0200 Subject: [PATCH] fix bugs and clean up --- scripts/build_energy_totals.py | 54 +++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/scripts/build_energy_totals.py b/scripts/build_energy_totals.py index d4afe7c8..55da77e3 100644 --- a/scripts/build_energy_totals.py +++ b/scripts/build_energy_totals.py @@ -55,21 +55,6 @@ logger = logging.getLogger(__name__) idx = pd.IndexSlice - -# from JRC-2021 methodology p.58 -agriculture_idees_eurostat_mapping = { - "Solids": ["C0000X0350-0370", "P1000", "S2000"], - "LPG": ["O4630"], - "Diesel oil and liquid biofuels": ["O4671XR5220B", "R5210P", "R5210B", "R5220P", "R5220B", "R5230P", "R5230B", "R5290"], - "Fuel oil and other liquids": ["O4680", "O4100_TOT_4200-4500XBIO", "O4652XR5210B", "O4651", "O4653", "O4661XR5230B", "O4669", "O4640", "O4691", "O4692", "O4695", "O4694", "O4693", "O4699"], - "Natural gas and biogas": ["G3000", "C0350-0370", "R5300"], - "Biomass and waste": ["R5110-5150_W6000RI", "R5160", "W6210", "W6100_6220"], - "Solar and geothermal": ["RA200", "RA410"], - "Ambient heat": ["RA600"], - "Distributed heat": ["H8000"], - "Electricity": ["E7000"] -} - def cartesian(s1: pd.Series, s2: pd.Series) -> pd.DataFrame: """ Compute the Cartesian product of two pandas Series. @@ -603,6 +588,31 @@ def build_idees(countries: List[str]) -> pd.DataFrame: return totals +def fill_missing_years(fill_values: pd.Series) -> pd.Series: + """ + Fill missing years for some countries by mean over the other years. + + Parameters + ---------- + fill_values : pd.Series + A pandas Series with a MultiIndex (levels: country and year) representing + energy values, where some values may be zero and need to be filled. + + Returns + ------- + pd.Series + A pandas Series with zero values replaced by the mean value of the corresponding + country. + + Notes + ----- + - The function groups the data by the 'country' level and computes the mean for each group. + - Zero values in the original Series are replaced by the mean value of their respective country group. + """ + means = fill_values.groupby(level='country').transform('mean') + return fill_values.where(fill_values != 0, means) + + def build_energy_totals( countries: List[str], eurostat: pd.DataFrame, @@ -656,6 +666,8 @@ def build_energy_totals( slicer = idx[in_eurostat, :, :, "Bunkers", :] fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() + # fill missing years for some countries by mean over the other years + fill_values = fill_missing_years(fill_values) df.loc[in_eurostat, "total international navigation"] = fill_values # add swiss energy data @@ -679,8 +691,8 @@ def build_energy_totals( fill_values = eurostat.loc[slicer]["Total all products"].groupby(level=[0,1]).sum() # fill missing years for some countries by mean over the other years - means = fill_values.groupby(level='country').transform('mean') - fill_values = fill_values.where(fill_values != 0, means) + fill_values = fill_missing_years(fill_values) + df.loc[to_fill, "total agriculture"] = fill_values # split into end uses by average EU data from IDEES uses = ["electricity", "heat", "machinery"] @@ -711,6 +723,8 @@ def build_energy_totals( fill_values = ( eurostat.loc[slicer, eurostat_fuels[fuel]].groupby(level=[0, 1]).sum() ) + # fill missing years for some countries by mean over the other years + fill_values = fill_missing_years(fill_values) df.loc[to_fill, f"{fuel} {sector}"] = fill_values for sector in ["residential", "services"]: @@ -786,16 +800,22 @@ def build_energy_totals( slicer = idx[c, y, :, :, "Domestic aviation"] fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() + # fill missing years for some countries by mean over the other years + fill_values = fill_missing_years(fill_values) df.loc[to_fill, "total domestic aviation"] = fill_values slicer = idx[c, y, :, :, "International aviation"] fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() + # fill missing years for some countries by mean over the other years + fill_values = fill_missing_years(fill_values) df.loc[to_fill, "total international aviation"] = fill_values # missing domestic navigation slicer = idx[c, y, :, :, "Domestic Navigation"] fill_values = eurostat.loc[slicer, "Total all products"].groupby(level=[0, 1]).sum() + # fill missing years for some countries by mean over the other years + fill_values = fill_missing_years(fill_values) df.loc[to_fill, "total domestic navigation"] = fill_values # split road traffic for non-IDEES