From 8d09e38627ffef4b9f5b6140d2eb7e0b6a35e845 Mon Sep 17 00:00:00 2001 From: lisazeyen <35347358+lisazeyen@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:16:53 +0200 Subject: [PATCH] drop duplicates in prepare_hotmaps_database --- scripts/build_industrial_distribution_key.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/build_industrial_distribution_key.py b/scripts/build_industrial_distribution_key.py index 979a1493..fe7f5d82 100644 --- a/scripts/build_industrial_distribution_key.py +++ b/scripts/build_industrial_distribution_key.py @@ -93,6 +93,20 @@ def prepare_hotmaps_database(regions): gdf.rename(columns={"index_right": "bus"}, inplace=True) gdf["country"] = gdf.bus.str[:2] + # the .sjoin can lead to duplicates if a geom is in two regions + if gdf.index.duplicated().any(): + import pycountry + # get all duplicated entries + duplicated_i = gdf.index[gdf.index.duplicated()] + # convert from raw data country name to iso-2-code + s = df.loc[duplicated_i, "Country"].apply(lambda x: pycountry.countries.lookup(x).alpha_2) + # Get a boolean mask where gdf's country column matches s's values for the same index + mask = gdf['country'] == gdf.index.map(s) + # Filter gdf using the mask + gdf_filtered = gdf[mask] + # concat not duplicated and filtered gdf + gdf = pd.concat([gdf.drop(duplicated_i), gdf_filtered]).sort_index() + return gdf