drop duplicates in prepare_hotmaps_database

2023-08-21 16:16:53 +02:00 · 2023-08-21 16:16:53 +02:00 · 8d09e38627
commit 8d09e38627
parent 1fc83b793d
1 changed file with 14 additions and 0 deletions
--- a/scripts/build_industrial_distribution_key.py
+++ b/scripts/build_industrial_distribution_key.py
@@ -93,6 +93,20 @@ def prepare_hotmaps_database(regions):
    gdf.rename(columns={"index_right": "bus"}, inplace=True)
    gdf["country"] = gdf.bus.str[:2]

+    # the .sjoin can lead to duplicates if a geom is in two regions
+    if gdf.index.duplicated().any():
+        import pycountry
+        # get all duplicated entries
+        duplicated_i = gdf.index[gdf.index.duplicated()]
+        # convert from raw data country name to iso-2-code
+        s = df.loc[duplicated_i, "Country"].apply(lambda x: pycountry.countries.lookup(x).alpha_2)
+        # Get a boolean mask where gdf's country column matches s's values for the same index
+        mask = gdf['country'] == gdf.index.map(s)
+        # Filter gdf using the mask
+        gdf_filtered = gdf[mask]
+        # concat not duplicated and filtered gdf
+        gdf = pd.concat([gdf.drop(duplicated_i), gdf_filtered]).sort_index()
+
    return gdf