From 299e71e2b3eacf53f796db3c9e965034bbb25d16 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Wed, 17 Nov 2021 13:46:33 +0100
Subject: [PATCH 01/20] introduce hierarchical agglomeratice clustering (hac)

---
 config.default.yaml             |  9 +++--
 config.tutorial.yaml            |  9 +++--
 doc/configtables/clustering.csv |  9 +++--
 envs/environment.yaml           |  2 +-
 scripts/cluster_network.py      | 63 +++++++++++++++++++++++++++++----
 scripts/simplify_network.py     | 10 ++++--
 test/config.test1.yaml          |  9 +++--
 7 files changed, 93 insertions(+), 18 deletions(-)

diff --git a/config.default.yaml b/config.default.yaml
index f70e7c2c..631645b9 100755
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -20,8 +20,13 @@ scenario:
 countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK']
 
 clustering:
-  simplify:
-    to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+  simplify_network:
+    to_substations: true # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+    algorithm: hac
+    feature: solar+onwind-time
+  cluster_network:
+    algorithm: hac # choose from: [hac, kmeans]
+    feature: solar+onwind-time # only for hac. choose from: [solar+onwind-time, solar+onwind-cap, solar-time, solar-cap, solar+offwind-cap] etc.
 
 snapshots:
   start: "2013-01-01"
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index 26ead242..919da193 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -20,8 +20,13 @@ scenario:
 countries: ['DE']
 
 clustering:
-  simplify:
-    to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+  simplify_network:
+    to_substations: true # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+    algorithm: hac
+    feature: solar+onwind-time
+  cluster_network:
+    algorithm: hac # choose from: [hac, kmeans]
+    feature: solar+onwind-time # only for hac. choose from: [solar+onwind-time, solar+onwind-cap, solar-time, solar-cap, solar+offwind-cap] etc.
 
 snapshots:
   start: "2013-03-01"
diff --git a/doc/configtables/clustering.csv b/doc/configtables/clustering.csv
index 2f63f955..e488dd39 100644
--- a/doc/configtables/clustering.csv
+++ b/doc/configtables/clustering.csv
@@ -1,3 +1,8 @@
 ,Unit,Values,Description
-simplify,,,
--- to_substations,bool,"{'true','false'}","Aggregates all nodes without power injection (positive or negative, i.e. demand or generation) to electrically closest ones"
+simplify_network,,,
+-- to_substations,bool,"One of {'true','false'}","Aggregates all nodes without power injection (positive or negative, i.e. demand or generation) to electrically closest ones"
+-- algorithm,str,"One of {‘kmenas’, ‘hac’}",
+-- feature,str,"Str in the format ‘carrier1+carrier2+...+carrierN-X’, where CarrierI can be from {‘solar’, ‘onwind’, ‘offwind’, ‘ror’} and X is one of {‘cap’, ‘time’}.",
+cluster_network,,,
+-- algorithm,str,"One of {‘kmenas’, ‘hac’}",
+-- feature,str,"Str in the format ‘carrier1+carrier2+...+carrierN-X’, where CarrierI can be from {‘solar’, ‘onwind’, ‘offwind’, ‘ror’} and X is one of {‘cap’, ‘time’}.",
diff --git a/envs/environment.yaml b/envs/environment.yaml
index 29d743ac..5f6a8310 100644
--- a/envs/environment.yaml
+++ b/envs/environment.yaml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - mamba # esp for windows build
 
-  - pypsa>=0.18
   - atlite>=0.2.5
   - dask<=2021.3.1 # until https://github.com/dask/dask/issues/7583 is solved
 
@@ -56,5 +55,6 @@ dependencies:
   - tabula-py
 
   - pip:
+    - git+https://github.com/pypsa/pypsa.git#egg=pypsa
     - vresutils==0.3.1
     - tsam>=1.1.0
diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 980b73b0..7aae3ae6 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -138,7 +138,7 @@ import seaborn as sns
 from functools import reduce
 
 from pypsa.networkclustering import (busmap_by_kmeans, busmap_by_spectral_clustering,
-                                     _make_consense, get_clustering_from_busmap)
+                                     busmap_by_hac, _make_consense, get_clustering_from_busmap)
 
 from add_electricity import load_costs
 
@@ -170,6 +170,45 @@ def weighting_for_country(n, x):
     return (w * (100. / w.max())).clip(lower=1.).astype(int)
 
 
+def get_feature_for_hac(n, buses_i, feature=None): #buses_i = n.buses.index
+
+    if feature is None:
+        feature = "solar+onwind-time"
+
+    carriers = feature.split('-')[0].split('+')
+    if "offwind" in carriers:
+        carriers.remove("offwind")
+        carriers = np.append(carriers, network.generators.carrier.filter(like='offwind').unique())
+
+    if feature.split('-')[1] == 'cap':
+        feature_data = pd.DataFrame(index=buses_i, columns=carriers)
+        for carrier in carriers:
+            try:
+                feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
+                                         .rename(index=lambda x: x.split(' ')[0]))
+            except:
+                feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
+                                         .rename(index=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))
+
+    if feature.split('-')[1] == 'time':
+        feature_data = pd.DataFrame(columns=buses_i)
+        for carrier in carriers:
+            try:
+                # without simpl wildcard (bus names are "X X"):
+                feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
+                                                   .rename(columns=lambda x: x.split(' ')[0]))[buses_i]
+            except:
+                # with simpl wildcard (bus names are "X X X"):
+                feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
+                                                   .rename(columns=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))[buses_i]
+        feature_data = feature_data.T
+        feature_data.columns = feature_data.columns.astype(str) # Timestamp will raise error in sklearn>=v1.2
+
+    feature_data = feature_data.fillna(0)
+
+    return feature_data
+
+
 def distribute_clusters(n, n_clusters, focus_weights=None, solver_name=None):
     """Determine the number of clusters per country"""
 
@@ -221,12 +260,18 @@ def distribute_clusters(n, n_clusters, focus_weights=None, solver_name=None):
     return pd.Series(m.n.get_values(), index=L.index).astype(int)
 
 
-def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algorithm="kmeans", **algorithm_kwds):
+def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algorithm="kmeans", feature=None, **algorithm_kwds):
     if algorithm == "kmeans":
         algorithm_kwds.setdefault('n_init', 1000)
         algorithm_kwds.setdefault('max_iter', 30000)
         algorithm_kwds.setdefault('tol', 1e-6)
 
+    if algorithm == "hac":
+        feature = get_feature_for_hac(n, buses_i=n.buses.index, feature=feature)
+    elif feature is not None:
+        logger.info(f"keyword argument feature is only valid for algorithm 'hac'."
+                    f"given feature {feature} will be ignored.")
+
     n.determine_network_topology()
 
     n_clusters = distribute_clusters(n, n_clusters, focus_weights=focus_weights, solver_name=solver_name)
@@ -250,8 +295,10 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
             return prefix + busmap_by_spectral_clustering(reduce_network(n, x), n_clusters[x.name], **algorithm_kwds)
         elif algorithm == "louvain":
             return prefix + busmap_by_louvain(reduce_network(n, x), n_clusters[x.name], **algorithm_kwds)
+        elif algorithm == "hac":
+            return prefix + busmap_by_hac(n, n_clusters[x.name], buses_i=x.index, feature=feature.loc[x.index])
         else:
-            raise ValueError(f"`algorithm` must be one of 'kmeans', 'spectral' or 'louvain'. Is {algorithm}.")
+            raise ValueError(f"`algorithm` must be one of 'kmeans', 'hac', 'spectral' or 'louvain'. Is {algorithm}.")
 
     return (n.buses.groupby(['country', 'sub_network'], group_keys=False)
             .apply(busmap_for_country).squeeze().rename('busmap'))
@@ -259,7 +306,9 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
 
 def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carriers=None,
                               line_length_factor=1.25, potential_mode='simple', solver_name="cbc",
-                              algorithm="kmeans", extended_link_costs=0, focus_weights=None):
+                              algorithm="kmeans", feature=None, extended_link_costs=0, focus_weights=None):
+
+    logger.info(f"Clustering network using algorithm {algorithm} and feature {feature}...")
 
     if potential_mode == 'simple':
         p_nom_max_strategy = np.sum
@@ -273,7 +322,7 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
         busmap.index = busmap.index.astype(str)
         logger.info(f"Imported custom busmap from {snakemake.input.custom_busmap}")
     else:
-        busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm)
+        busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm, feature)
 
     clustering = get_clustering_from_busmap(
         n, busmap,
@@ -313,7 +362,7 @@ def cluster_regions(busmaps, input=None, output=None):
 
     for which in ('regions_onshore', 'regions_offshore'):
         regions = gpd.read_file(getattr(input, which)).set_index('name')
-        geom_c = regions.geometry.groupby(busmap).apply(shapely.ops.cascaded_union)
+        geom_c = regions.geometry.groupby(busmap).apply(shapely.ops.unary_union)
         regions_c = gpd.GeoDataFrame(dict(geometry=geom_c))
         regions_c.index.name = 'name'
         save_to_geojson(regions_c, getattr(output, which))
@@ -377,6 +426,8 @@ if __name__ == "__main__":
                                                line_length_factor=line_length_factor,
                                                potential_mode=potential_mode,
                                                solver_name=snakemake.config['solving']['solver']['name'],
+                                               algorithm=snakemake.config.get('clustering', {}).get('cluster_network', {}).get('algorithm', 'kmeans'),
+                                               feature=snakemake.config.get('clustering', {}).get('cluster_network', {}).get('feature', None),
                                                extended_link_costs=hvac_overhead_cost,
                                                focus_weights=focus_weights)
 
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 85bc4d15..8a93952c 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -360,7 +360,7 @@ def aggregate_to_substations(n, buses_i=None):
     return clustering.network, busmap
 
 
-def cluster(n, n_clusters):
+def cluster(n, n_clusters, algorithm="kmeans", feature=None):
     logger.info(f"Clustering to {n_clusters} buses")
 
     focus_weights = snakemake.config.get('focus_weights', None)
@@ -377,8 +377,10 @@ def cluster(n, n_clusters):
     potential_mode = (consense(pd.Series([snakemake.config['renewable'][tech]['potential']
                                             for tech in renewable_carriers]))
                         if len(renewable_carriers) > 0 else 'conservative')
+
     clustering = clustering_for_n_clusters(n, n_clusters, custom_busmap=False, potential_mode=potential_mode,
                                            solver_name=snakemake.config['solving']['solver']['name'],
+                                           algorithm=algorithm, feature=feature,
                                            focus_weights=focus_weights)
 
     return clustering.network, clustering.busmap
@@ -400,12 +402,14 @@ if __name__ == "__main__":
 
     busmaps = [trafo_map, simplify_links_map, stub_map]
 
-    if snakemake.config.get('clustering', {}).get('simplify', {}).get('to_substations', False):
+    if snakemake.config.get('clustering', {}).get('simplify_network', {}).get('to_substations', False):
         n, substation_map = aggregate_to_substations(n)
         busmaps.append(substation_map)
 
     if snakemake.wildcards.simpl:
-        n, cluster_map = cluster(n, int(snakemake.wildcards.simpl))
+        n, cluster_map = cluster(n, int(snakemake.wildcards.simpl),
+                                 algorithm=snakemake.config.get('clustering', {}).get('simplify_network', {}).get('algorithm', 'hac'),
+                                 feature=snakemake.config.get('clustering', {}).get('simplify_network', {}).get('feature', None))
         busmaps.append(cluster_map)
 
     # some entries in n.buses are not updated in previous functions, therefore can be wrong. as they are not needed
diff --git a/test/config.test1.yaml b/test/config.test1.yaml
index 2986037b..0c34ea13 100755
--- a/test/config.test1.yaml
+++ b/test/config.test1.yaml
@@ -19,8 +19,13 @@ scenario:
 countries: ['DE']
 
 clustering:
-  simplify:
-    to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+  simplify_network:
+    to_substations: true # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+    algorithm: hac
+    feature: solar+onwind-time
+  cluster_network:
+    algorithm: hac # choose from: [hac, kmeans]
+    feature: solar+onwind-time # only for hac. choose from: [solar+onwind-time, solar+onwind-cap, solar-time, solar-cap, solar+offwind-cap] etc.
 
 snapshots:
   start: "2013-03-01"

From 6c4ea69e9563c257961e10f3e8d37b85d0cff7cf Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Fri, 10 Dec 2021 17:53:40 +0100
Subject: [PATCH 02/20] clustering: own config for clustering settings

---
 scripts/cluster_network.py  | 14 +++++++++-----
 scripts/simplify_network.py |  5 +++--
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 7aae3ae6..415d3820 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -170,7 +170,10 @@ def weighting_for_country(n, x):
     return (w * (100. / w.max())).clip(lower=1.).astype(int)
 
 
-def get_feature_for_hac(n, buses_i, feature=None): #buses_i = n.buses.index
+def get_feature_for_hac(n, buses_i=None, feature=None):
+
+    if buses_i is None:
+        buses_i = n.buses.index
 
     if feature is None:
         feature = "solar+onwind-time"
@@ -269,8 +272,8 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
     if algorithm == "hac":
         feature = get_feature_for_hac(n, buses_i=n.buses.index, feature=feature)
     elif feature is not None:
-        logger.info(f"keyword argument feature is only valid for algorithm 'hac'."
-                    f"given feature {feature} will be ignored.")
+        logger.warning(f"Keyword argument feature is only valid for algorithm 'hac'."
+                       f"given feature {feature} will be ignored.")
 
     n.determine_network_topology()
 
@@ -422,12 +425,13 @@ if __name__ == "__main__":
         potential_mode = consense(pd.Series([snakemake.config['renewable'][tech]['potential']
                                              for tech in renewable_carriers]))
         custom_busmap = snakemake.config["enable"].get("custom_busmap", False)
+        cluster_config = snakemake.config.get('clustering', {}).get('cluster_network', {})
         clustering = clustering_for_n_clusters(n, n_clusters, custom_busmap, aggregate_carriers,
                                                line_length_factor=line_length_factor,
                                                potential_mode=potential_mode,
                                                solver_name=snakemake.config['solving']['solver']['name'],
-                                               algorithm=snakemake.config.get('clustering', {}).get('cluster_network', {}).get('algorithm', 'kmeans'),
-                                               feature=snakemake.config.get('clustering', {}).get('cluster_network', {}).get('feature', None),
+                                               algorithm=cluster_config.get('algorithm', 'kmeans'),
+                                               feature=cluster_config.get('feature', None),
                                                extended_link_costs=hvac_overhead_cost,
                                                focus_weights=focus_weights)
 
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 8a93952c..9464c3f7 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -407,9 +407,10 @@ if __name__ == "__main__":
         busmaps.append(substation_map)
 
     if snakemake.wildcards.simpl:
+        cluster_config = snakemake.config.get('clustering', {}).get('simplify_network', {})
         n, cluster_map = cluster(n, int(snakemake.wildcards.simpl),
-                                 algorithm=snakemake.config.get('clustering', {}).get('simplify_network', {}).get('algorithm', 'hac'),
-                                 feature=snakemake.config.get('clustering', {}).get('simplify_network', {}).get('feature', None))
+                                 algorithm=cluster_config.get('algorithm', 'hac'),
+                                 feature=cluster_config.get('feature', None))
         busmaps.append(cluster_map)
 
     # some entries in n.buses are not updated in previous functions, therefore can be wrong. as they are not needed

From 256ac48b470470571d0b5751319e8f441ce39e0c Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Fri, 4 Feb 2022 16:45:00 +0100
Subject: [PATCH 03/20] resolve merging master bugs

---
 scripts/cluster_network.py  | 13 +++++++++----
 scripts/simplify_network.py |  6 +++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index d4b3b139..1c55234c 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -190,9 +190,11 @@ def get_feature_for_hac(n, buses_i=None, feature=None):
         feature_data = pd.DataFrame(index=buses_i, columns=carriers)
         for carrier in carriers:
             try:
+                # without simpl wildcard (bus names are "X X"):
                 feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
                                          .rename(index=lambda x: x.split(' ')[0]))
             except:
+                # with simpl wildcard (bus names are "X X X"):
                 feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
                                          .rename(index=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))
 
@@ -208,7 +210,7 @@ def get_feature_for_hac(n, buses_i=None, feature=None):
                 feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
                                                    .rename(columns=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))[buses_i]
         feature_data = feature_data.T
-        feature_data.columns = feature_data.columns.astype(str) # Timestamp will raise error in sklearn>=v1.2
+        feature_data.columns = feature_data.columns.astype(str) # timestamp raises error in sklearn>=v1.2
 
     feature_data = feature_data.fillna(0)
 
@@ -309,9 +311,9 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
 
 def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carriers=None,
                               line_length_factor=1.25, potential_mode='simple', solver_name="cbc",
-                              algorithm="kmeans", feature=None, extended_link_costs=0, focus_weights=None):
+                              algorithm="hac", feature=None, extended_link_costs=0, focus_weights=None):
 
-    logger.info(f"Clustering network using algorithm {algorithm} and feature {feature}...")
+    logger.info(f"Clustering network using algorithm ``{algorithm}`` and feature ``{feature}``...")
 
     if potential_mode == 'simple':
         p_nom_max_strategy = np.sum
@@ -424,10 +426,13 @@ if __name__ == "__main__":
             custom_busmap.index = custom_busmap.index.astype(str)
             logger.info(f"Imported custom busmap from {snakemake.input.custom_busmap}")
 
+        cluster_config = snakemake.config.get('clustering', {}).get('cluster_network', {})
         clustering = clustering_for_n_clusters(n, n_clusters, custom_busmap, aggregate_carriers,
                                                line_length_factor, potential_mode,
                                                snakemake.config['solving']['solver']['name'],
-                                               "kmeans", hvac_overhead_cost, focus_weights)
+                                               cluster_config.get("algorithm", "hac"),
+                                               cluster_config.get("feature", "solar+onwind-time"),
+                                               hvac_overhead_cost, focus_weights)
 
     update_p_nom_max(n)
     
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 0e9dd385..23facc79 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -353,11 +353,10 @@ def aggregate_to_substations(n, buses_i=None):
                                             line_length_factor=1.0,
                                             generator_strategies={'p_nom_max': 'sum'},
                                             scale_link_capital_costs=False)
-        
     return clustering.network, busmap
 
 
-def cluster(n, n_clusters, config, algorithm="kmeans", feature=None):
+def cluster(n, n_clusters, config, algorithm="hac", feature=None):
     logger.info(f"Clustering to {n_clusters} buses")
 
     focus_weights = config.get('focus_weights', None)
@@ -403,7 +402,8 @@ if __name__ == "__main__":
 
     busmaps = [trafo_map, simplify_links_map, stub_map]
 
-    if snakemake.config.get('clustering', {}).get('simplify_network', {}).get('to_substations', False):
+    cluster_config = snakemake.config.get('clustering', {}).get('simplify_network', {})
+    if cluster_config.get('to_substations', False):
         n, substation_map = aggregate_to_substations(n)
         busmaps.append(substation_map)
 

From b5dbf4eb3240222e3a3145bbf87b3d66a63d7169 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Fri, 4 Feb 2022 17:19:23 +0100
Subject: [PATCH 04/20] overwrite country of isolated buses

---
 scripts/cluster_network.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 1c55234c..bd49556f 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -271,11 +271,36 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
         algorithm_kwds.setdefault('max_iter', 30000)
         algorithm_kwds.setdefault('tol', 1e-6)
 
+    def fix_country_assignment_for_hac(n):
+        # overwrite country of nodes that are disconnected from their country-topology
+        for country in n.buses.country.unique():
+            m = n.copy()
+            m.buses = m.buses.query("country in @country")
+            m.lines = m.lines.query("bus0 in @m.buses.index and bus1 in @m.buses.index")
+            m.links = m.links.query("bus0 in @m.buses.index and bus1 in @m.buses.index")
+
+            _, labels = csgraph.connected_components(m.adjacency_matrix(), directed=False)
+            component = pd.Series(labels, index=m.buses.index)
+            component_sizes = component.value_counts()
+
+            if len(component_sizes)>1:
+                disconnected_bus = component[component==component_sizes[component_sizes==component_sizes.min()].index[0]].index
+                neighbor_bus = n.lines.query("bus0 in @disconnected_bus or bus1 in @disconnected_bus").iloc[0][['bus0','bus1']]
+                new_country = list(set(n.buses.loc[neighbor_bus].country)-set([country]))[0]
+
+                logger.info(f"overwriting country ``{country}`` of bus ``{disconnected_bus}`` to new country ``{new_country}``, "
+                            "because it is disconnected from its inital inter-country transmission grid.")
+                n.buses.at[disconnected_bus, "country"] = new_country
+        return n
+
     if algorithm == "hac":
+        from scipy.sparse import csgraph
+
         feature = get_feature_for_hac(n, buses_i=n.buses.index, feature=feature)
+        n = fix_country_assignment_for_hac(n)
     elif feature is not None:
         logger.warning(f"Keyword argument feature is only valid for algorithm 'hac'."
-                       f"given feature {feature} will be ignored.")
+                       f"given feature ``{feature}`` will be ignored.")
 
     n.determine_network_topology()
 

From 82a0338e9f3f8b68eb7d73aad0ceb3b26862d028 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Fri, 4 Feb 2022 20:27:18 +0100
Subject: [PATCH 05/20] treatment of outliers and small feature-bugfix

---
 scripts/cluster_network.py  | 17 +++++++++--------
 scripts/simplify_network.py | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index bd49556f..2cc406eb 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -272,6 +272,8 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
         algorithm_kwds.setdefault('tol', 1e-6)
 
     def fix_country_assignment_for_hac(n):
+        from scipy.sparse import csgraph
+
         # overwrite country of nodes that are disconnected from their country-topology
         for country in n.buses.country.unique():
             m = n.copy()
@@ -288,19 +290,18 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
                 neighbor_bus = n.lines.query("bus0 in @disconnected_bus or bus1 in @disconnected_bus").iloc[0][['bus0','bus1']]
                 new_country = list(set(n.buses.loc[neighbor_bus].country)-set([country]))[0]
 
-                logger.info(f"overwriting country ``{country}`` of bus ``{disconnected_bus}`` to new country ``{new_country}``, "
+                logger.info(f"overwriting country `{country}` of bus `{disconnected_bus}` to new country `{new_country}`, "
                             "because it is disconnected from its inital inter-country transmission grid.")
                 n.buses.at[disconnected_bus, "country"] = new_country
         return n
 
     if algorithm == "hac":
-        from scipy.sparse import csgraph
-
         feature = get_feature_for_hac(n, buses_i=n.buses.index, feature=feature)
         n = fix_country_assignment_for_hac(n)
-    elif feature is not None:
-        logger.warning(f"Keyword argument feature is only valid for algorithm 'hac'."
-                       f"given feature ``{feature}`` will be ignored.")
+
+    if (algorithm != "hac") and (feature is not None):
+        logger.warning(f"Keyword argument feature is only valid for algorithm `hac`. "
+                       f"Given feature `{feature}` will be ignored.")
 
     n.determine_network_topology()
 
@@ -338,7 +339,7 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
                               line_length_factor=1.25, potential_mode='simple', solver_name="cbc",
                               algorithm="hac", feature=None, extended_link_costs=0, focus_weights=None):
 
-    logger.info(f"Clustering network using algorithm ``{algorithm}`` and feature ``{feature}``...")
+    logger.info(f"Clustering network using algorithm `{algorithm}` and feature `{feature}`...")
 
     if potential_mode == 'simple':
         p_nom_max_strategy = np.sum
@@ -348,7 +349,7 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
         raise AttributeError(f"potential_mode should be one of 'simple' or 'conservative' but is '{potential_mode}'")
 
     if not custom_busmap:
-        busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm)
+        busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm, feature)
     else:
         busmap = custom_busmap
 
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 23facc79..b28bc4dc 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -407,7 +407,21 @@ if __name__ == "__main__":
         n, substation_map = aggregate_to_substations(n)
         busmaps.append(substation_map)
 
+    # treatment of outliers (nodes without a profile for considered carrier) for "cluster_network"
+    if snakemake.config.get("clustering", {}).get("cluster_network", {}).get("algorithm", "hac") == "hac":
+        carriers = cluster_config.get("feature", "solar+onwind-time").split('-')[0].split('+')
+        buses_i = list(set(n.buses.index)-set(n.generators.query("carrier in @carriers").bus))
+        n, busmap_hac = aggregate_to_substations(n, buses_i)
+        busmaps.append(busmap_hac)
+
     if snakemake.wildcards.simpl:
+        # treatment of outliers (nodes without a profile for a considered carrier) for "simplify"
+        if cluster_config.get("algorithm", "hac") == "hac":
+            carriers = cluster_config.get("feature", "solar+onwind-time").split('-')[0].split('+')
+            buses_i = list(set(n.buses.index)-set(n.generators.query("carrier in @carriers").bus))
+            n, busmap_hac = aggregate_to_substations(n, buses_i)
+            busmaps.append(busmap_hac)
+        # conduct clustering
         n, cluster_map = cluster(n, int(snakemake.wildcards.simpl), snakemake.config,
                                  algorithm=cluster_config.get('algorithm', 'hac'),
                                  feature=cluster_config.get('feature', None))

From f02d5fe82161f7373423c8ff6571a4319ab8c164 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 7 Mar 2022 11:43:04 +0100
Subject: [PATCH 06/20] fix clustering setup for hac according to fneum
 suggestions

---
 doc/configtables/clustering.csv | 4 ++--
 envs/environment.yaml           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/configtables/clustering.csv b/doc/configtables/clustering.csv
index e488dd39..d14ab395 100644
--- a/doc/configtables/clustering.csv
+++ b/doc/configtables/clustering.csv
@@ -1,8 +1,8 @@
 ,Unit,Values,Description
 simplify_network,,,
 -- to_substations,bool,"One of {'true','false'}","Aggregates all nodes without power injection (positive or negative, i.e. demand or generation) to electrically closest ones"
--- algorithm,str,"One of {‘kmenas’, ‘hac’}",
+-- algorithm,str,"One of {‘kmeans’, ‘hac’}",
 -- feature,str,"Str in the format ‘carrier1+carrier2+...+carrierN-X’, where CarrierI can be from {‘solar’, ‘onwind’, ‘offwind’, ‘ror’} and X is one of {‘cap’, ‘time’}.",
 cluster_network,,,
--- algorithm,str,"One of {‘kmenas’, ‘hac’}",
+-- algorithm,str,"One of {‘kmeans’, ‘hac’}",
 -- feature,str,"Str in the format ‘carrier1+carrier2+...+carrierN-X’, where CarrierI can be from {‘solar’, ‘onwind’, ‘offwind’, ‘ror’} and X is one of {‘cap’, ‘time’}.",
diff --git a/envs/environment.yaml b/envs/environment.yaml
index 1d8d9d28..c64b5fbb 100644
--- a/envs/environment.yaml
+++ b/envs/environment.yaml
@@ -10,6 +10,7 @@ dependencies:
   - python>=3.8
   - pip
 
+  - pypsa>=0.19.1
   - atlite>=0.2.5
   - dask
 
@@ -53,6 +54,5 @@ dependencies:
   - tabula-py
 
   - pip:
-    - git+https://github.com/pypsa/pypsa.git#egg=pypsa
     - vresutils>=0.3.1
     - tsam>=1.1.0

From 68f332f5cc8603ceb890fb41f01cfc02bbebb10e Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Thu, 17 Mar 2022 15:42:04 +0100
Subject: [PATCH 07/20] suggestions by coroa and fneum

---
 scripts/cluster_network.py | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index f4058e79..003442ff 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -189,28 +189,20 @@ def get_feature_for_hac(n, buses_i=None, feature=None):
     if feature.split('-')[1] == 'cap':
         feature_data = pd.DataFrame(index=buses_i, columns=carriers)
         for carrier in carriers:
-            try:
-                # without simpl wildcard (bus names are "X X"):
-                feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
-                                         .rename(index=lambda x: x.split(' ')[0]))
-            except:
-                # with simpl wildcard (bus names are "X X X"):
-                feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
-                                         .rename(index=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))
+            gen_i = n.generators.query("carrier == @carrier").index
+            attach = n.generators_t.p_max_pu[gen_i].mean().rename(index = n.generators.loc[gen_i].bus)
+            feature_data[carrier] = attach
 
     if feature.split('-')[1] == 'time':
         feature_data = pd.DataFrame(columns=buses_i)
         for carrier in carriers:
-            try:
-                # without simpl wildcard (bus names are "X X"):
-                feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
-                                                   .rename(columns=lambda x: x.split(' ')[0]))[buses_i]
-            except:
-                # with simpl wildcard (bus names are "X X X"):
-                feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
-                                                   .rename(columns=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))[buses_i]
+            gen_i = n.generators.query("carrier == @carrier").index
+            attach = n.generators_t.p_max_pu[gen_i].rename(columns = n.generators.loc[gen_i].bus)
+            feature_data = pd.concat([feature_data, attach], axis=0)[buses_i]
+
         feature_data = feature_data.T
-        feature_data.columns = feature_data.columns.astype(str) # timestamp raises error in sklearn>=v1.2
+        # timestamp raises error in sklearn >= v1.2:
+        feature_data.columns = feature_data.columns.astype(str)
 
     feature_data = feature_data.fillna(0)
 
@@ -283,16 +275,24 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
             m.links = m.links.query("bus0 in @m.buses.index and bus1 in @m.buses.index")
 
             _, labels = csgraph.connected_components(m.adjacency_matrix(), directed=False)
+
             component = pd.Series(labels, index=m.buses.index)
             component_sizes = component.value_counts()
 
             if len(component_sizes)>1:
-                disconnected_bus = component[component==component_sizes[component_sizes==component_sizes.min()].index[0]].index
-                neighbor_bus = n.lines.query("bus0 in @disconnected_bus or bus1 in @disconnected_bus").iloc[0][['bus0','bus1']]
+                disconnected_bus = component[component==component_sizes.index[-1]].index[0]
+
+                neighbor_bus = (
+                    n.lines.query("bus0 == @disconnected_bus or bus1 == @disconnected_bus")
+                    .iloc[0][['bus0', 'bus1']]
+                )
                 new_country = list(set(n.buses.loc[neighbor_bus].country)-set([country]))[0]
 
-                logger.info(f"overwriting country `{country}` of bus `{disconnected_bus}` to new country `{new_country}`, "
-                            "because it is disconnected from its inital inter-country transmission grid.")
+                logger.info(
+                    f"overwriting country `{country}` of bus `{disconnected_bus}` "
+                    f"to new country `{new_country}`, because it is disconnected "
+                    "from its inital inter-country transmission grid."
+                )
                 n.buses.at[disconnected_bus, "country"] = new_country
         return n
 

From 4c1c5e3a4e2a527792cb9a3dcc0b4d5c08d45f29 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Thu, 17 Mar 2022 17:38:30 +0100
Subject: [PATCH 08/20] suggestions by coroa and fneum

---
 scripts/cluster_network.py | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index f4058e79..003442ff 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -189,28 +189,20 @@ def get_feature_for_hac(n, buses_i=None, feature=None):
     if feature.split('-')[1] == 'cap':
         feature_data = pd.DataFrame(index=buses_i, columns=carriers)
         for carrier in carriers:
-            try:
-                # without simpl wildcard (bus names are "X X"):
-                feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
-                                         .rename(index=lambda x: x.split(' ')[0]))
-            except:
-                # with simpl wildcard (bus names are "X X X"):
-                feature_data[carrier] = (n.generators_t.p_max_pu.filter(like=carrier).mean()
-                                         .rename(index=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))
+            gen_i = n.generators.query("carrier == @carrier").index
+            attach = n.generators_t.p_max_pu[gen_i].mean().rename(index = n.generators.loc[gen_i].bus)
+            feature_data[carrier] = attach
 
     if feature.split('-')[1] == 'time':
         feature_data = pd.DataFrame(columns=buses_i)
         for carrier in carriers:
-            try:
-                # without simpl wildcard (bus names are "X X"):
-                feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
-                                                   .rename(columns=lambda x: x.split(' ')[0]))[buses_i]
-            except:
-                # with simpl wildcard (bus names are "X X X"):
-                feature_data = feature_data.append(n.generators_t.p_max_pu.filter(like=carrier)
-                                                   .rename(columns=lambda x: x.split(' ')[0] + ' ' + x.split(' ')[1]))[buses_i]
+            gen_i = n.generators.query("carrier == @carrier").index
+            attach = n.generators_t.p_max_pu[gen_i].rename(columns = n.generators.loc[gen_i].bus)
+            feature_data = pd.concat([feature_data, attach], axis=0)[buses_i]
+
         feature_data = feature_data.T
-        feature_data.columns = feature_data.columns.astype(str) # timestamp raises error in sklearn>=v1.2
+        # timestamp raises error in sklearn >= v1.2:
+        feature_data.columns = feature_data.columns.astype(str)
 
     feature_data = feature_data.fillna(0)
 
@@ -283,16 +275,24 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
             m.links = m.links.query("bus0 in @m.buses.index and bus1 in @m.buses.index")
 
             _, labels = csgraph.connected_components(m.adjacency_matrix(), directed=False)
+
             component = pd.Series(labels, index=m.buses.index)
             component_sizes = component.value_counts()
 
             if len(component_sizes)>1:
-                disconnected_bus = component[component==component_sizes[component_sizes==component_sizes.min()].index[0]].index
-                neighbor_bus = n.lines.query("bus0 in @disconnected_bus or bus1 in @disconnected_bus").iloc[0][['bus0','bus1']]
+                disconnected_bus = component[component==component_sizes.index[-1]].index[0]
+
+                neighbor_bus = (
+                    n.lines.query("bus0 == @disconnected_bus or bus1 == @disconnected_bus")
+                    .iloc[0][['bus0', 'bus1']]
+                )
                 new_country = list(set(n.buses.loc[neighbor_bus].country)-set([country]))[0]
 
-                logger.info(f"overwriting country `{country}` of bus `{disconnected_bus}` to new country `{new_country}`, "
-                            "because it is disconnected from its inital inter-country transmission grid.")
+                logger.info(
+                    f"overwriting country `{country}` of bus `{disconnected_bus}` "
+                    f"to new country `{new_country}`, because it is disconnected "
+                    "from its inital inter-country transmission grid."
+                )
                 n.buses.at[disconnected_bus, "country"] = new_country
         return n
 

From 8cb4c17930909c873d66f003ea41a2e09a924e4f Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Tue, 22 Mar 2022 16:53:05 +0100
Subject: [PATCH 09/20] unify vre treatment for hac clustering for
 simplify_network and cluster_network

---
 scripts/simplify_network.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index b28bc4dc..f9ac8ad7 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -351,7 +351,7 @@ def aggregate_to_substations(n, buses_i=None):
                                             aggregate_generators_carriers=None,
                                             aggregate_one_ports=["Load", "StorageUnit"],
                                             line_length_factor=1.0,
-                                            generator_strategies={'p_nom_max': 'sum'},
+                                            generator_strategies={'p_nom_max': 'sum', 'p_nom_min': 'sum'},
                                             scale_link_capital_costs=False)
     return clustering.network, busmap
 
@@ -407,21 +407,20 @@ if __name__ == "__main__":
         n, substation_map = aggregate_to_substations(n)
         busmaps.append(substation_map)
 
-    # treatment of outliers (nodes without a profile for considered carrier) for "cluster_network"
-    if snakemake.config.get("clustering", {}).get("cluster_network", {}).get("algorithm", "hac") == "hac":
+    # treatment of outliers (nodes without a profile for considered carrier):
+    # all nodes that have no profile of the given carrier are being aggregated to closest neighbor
+    if (
+            snakemake.config.get("clustering", {}).get("cluster_network", {}).get("algorithm", "hac") == "hac" or
+            cluster_config.get("algorithm", "hac") == "hac"
+    ):
         carriers = cluster_config.get("feature", "solar+onwind-time").split('-')[0].split('+')
-        buses_i = list(set(n.buses.index)-set(n.generators.query("carrier in @carriers").bus))
-        n, busmap_hac = aggregate_to_substations(n, buses_i)
-        busmaps.append(busmap_hac)
-
-    if snakemake.wildcards.simpl:
-        # treatment of outliers (nodes without a profile for a considered carrier) for "simplify"
-        if cluster_config.get("algorithm", "hac") == "hac":
-            carriers = cluster_config.get("feature", "solar+onwind-time").split('-')[0].split('+')
-            buses_i = list(set(n.buses.index)-set(n.generators.query("carrier in @carriers").bus))
+        for carrier in carriers:
+            buses_i = list(set(n.buses.index)-set(n.generators.query("carrier == @carrier").bus))
+            logger.info(f'clustering preparaton (hac): aggregating {len(buses_i)} buses of type {carrier}.')
             n, busmap_hac = aggregate_to_substations(n, buses_i)
             busmaps.append(busmap_hac)
-        # conduct clustering
+
+    if snakemake.wildcards.simpl:
         n, cluster_map = cluster(n, int(snakemake.wildcards.simpl), snakemake.config,
                                  algorithm=cluster_config.get('algorithm', 'hac'),
                                  feature=cluster_config.get('feature', None))

From 7f29c31abe6cf133f28f4b4558cccb7f20899d72 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Thu, 24 Mar 2022 13:17:01 +0100
Subject: [PATCH 10/20] .copy() shortcut

---
 scripts/cluster_network.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 9c6d541f..f659f8a8 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -269,10 +269,7 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
 
         # overwrite country of nodes that are disconnected from their country-topology
         for country in n.buses.country.unique():
-            m = n.copy()
-            m.buses = m.buses.query("country in @country")
-            m.lines = m.lines.query("bus0 in @m.buses.index and bus1 in @m.buses.index")
-            m.links = m.links.query("bus0 in @m.buses.index and bus1 in @m.buses.index")
+            m = n[n.buses.country ==country].copy()
 
             _, labels = csgraph.connected_components(m.adjacency_matrix(), directed=False)
 

From bef4967e84567434aa836d6d572c0316abe6f96f Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 20 Jun 2022 18:58:23 +0200
Subject: [PATCH 11/20] clustering strategies moved to configurables

---
 config.default.yaml         |  4 ++++
 config.tutorial.yaml        |  4 ++++
 scripts/cluster_network.py  | 43 +++++++++++++++--------------------
 scripts/simplify_network.py | 45 ++++++++++++++++++++-----------------
 4 files changed, 51 insertions(+), 45 deletions(-)

diff --git a/config.default.yaml b/config.default.yaml
index d2bf6159..9462719b 100755
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -22,6 +22,10 @@ countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'F
 clustering:
   simplify:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+  aggregation_strategies:
+    generators:
+      p_nom_max: "sum" # use "min" for more conservative assumptions
+      p_nom_min: "sum"
 
 snapshots:
   start: "2013-01-01"
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index 225d8f78..4dc7a94d 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -22,6 +22,10 @@ countries: ['BE']
 clustering:
   simplify:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+  aggregation_strategies:
+    generators:
+      p_nom_max: "sum" # use "min" for more conservative assumptions
+      p_nom_min: "sum"
 
 snapshots:
   start: "2013-03-01"
diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 1d5608e2..0a3d768a 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -11,11 +11,10 @@ Relevant Settings
 
 .. code:: yaml
 
-    focus_weights:
+    clustering:
+      aggregation_strategies:
 
-    renewable: (keys)
-        {technology}:
-            potential:
+    focus_weights:
 
     solving:
         solver:
@@ -259,15 +258,16 @@ def busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights=None, algori
 
 
 def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carriers=None,
-                              line_length_factor=1.25, potential_mode='simple', solver_name="cbc",
+                              line_length_factor=1.25, aggregation_strategies=dict(), solver_name="cbc",
                               algorithm="kmeans", extended_link_costs=0, focus_weights=None):
 
-    if potential_mode == 'simple':
-        p_nom_max_strategy = pd.Series.sum
-    elif potential_mode == 'conservative':
-        p_nom_max_strategy = pd.Series.min
-    else:
-        raise AttributeError(f"potential_mode should be one of 'simple' or 'conservative' but is '{potential_mode}'")
+    bus_strategies = dict(country=_make_consense("Bus", "country"))
+    bus_strategies.update(aggregation_strategies.get("buses", {}))
+    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
+
+    # this snippet supports compatibility of PyPSA and PyPSA-EUR:
+    if "p_nom_max" in generator_strategies:
+        if generator_strategies["p_nom_max"] == "min": generator_strategies["p_nom_max"] = np.min
 
     if not isinstance(custom_busmap, pd.Series):
         busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm)
@@ -276,19 +276,12 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
 
     clustering = get_clustering_from_busmap(
         n, busmap,
-        bus_strategies=dict(country=_make_consense("Bus", "country")),
+        bus_strategies=bus_strategies,
         aggregate_generators_weighted=True,
         aggregate_generators_carriers=aggregate_carriers,
         aggregate_one_ports=["Load", "StorageUnit"],
         line_length_factor=line_length_factor,
-        generator_strategies={'p_nom_max': p_nom_max_strategy, 
-                              'p_nom_min': pd.Series.sum, 
-                              'p_min_pu': pd.Series.mean, 
-                              'marginal_cost': pd.Series.mean, 
-                              'committable': np.any, 
-                              'ramp_limit_up': pd.Series.max, 
-                              'ramp_limit_down': pd.Series.max,
-                              },
+        generator_strategies=generator_strategies,
         scale_link_capital_costs=False)
 
     if not n.links.empty:
@@ -375,8 +368,8 @@ if __name__ == "__main__":
                 "The `potential` configuration option must agree for all renewable carriers, for now!"
             )
             return v
-        potential_mode = consense(pd.Series([snakemake.config['renewable'][tech]['potential']
-                                             for tech in renewable_carriers]))
+        aggregation_strategies = snakemake.config["clustering"].get("aggregation_strategies", {})
+
         custom_busmap = snakemake.config["enable"].get("custom_busmap", False)
         if custom_busmap:
             custom_busmap = pd.read_csv(snakemake.input.custom_busmap, index_col=0, squeeze=True)
@@ -384,12 +377,12 @@ if __name__ == "__main__":
             logger.info(f"Imported custom busmap from {snakemake.input.custom_busmap}")
 
         clustering = clustering_for_n_clusters(n, n_clusters, custom_busmap, aggregate_carriers,
-                                               line_length_factor, potential_mode,
+                                               line_length_factor, aggregation_strategies,
                                                snakemake.config['solving']['solver']['name'],
                                                "kmeans", hvac_overhead_cost, focus_weights)
-
-    update_p_nom_max(n)
     
+    update_p_nom_max(clustering.network)
+
     clustering.network.export_to_netcdf(snakemake.output.network)
     for attr in ('busmap', 'linemap'): #also available: linemap_positive, linemap_negative
         getattr(clustering, attr).to_csv(snakemake.output[attr])
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 287dfe32..7d51c511 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -13,6 +13,10 @@ Relevant Settings
 
 .. code:: yaml
 
+    clustering:
+      simplify:
+      aggregation_strategies:
+
     costs:
         USD2013_to_EUR2013:
         discountrate:
@@ -22,10 +26,6 @@ Relevant Settings
     electricity:
         max_hours:
 
-    renewables: (keys)
-        {technology}:
-            potential:
-
     lines:
         length_factor:
 
@@ -320,7 +320,7 @@ def remove_stubs(n, costs, config, output):
 
     return n, busmap
 
-def aggregate_to_substations(n, buses_i=None):
+def aggregate_to_substations(n, config, aggregation_strategies=dict(), buses_i=None):
     # can be used to aggregate a selection of buses to electrically closest neighbors
     # if no buses are given, nodes that are no substations or without offshore connection are aggregated
     
@@ -345,19 +345,29 @@ def aggregate_to_substations(n, buses_i=None):
     busmap = n.buses.index.to_series()
     busmap.loc[buses_i] = dist.idxmin(1)
 
+    # default aggregation strategies must be specified within the function, otherwise (when defaults are passed in
+    # the function's definition) they get lost in case custom values for different variables are specified in the config
+    bus_strategies = dict(country=_make_consense("Bus", "country"))
+    bus_strategies.update(aggregation_strategies.get("buses", {}))
+    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
+
+    # this snippet supports compatibility of PyPSA and PyPSA-EUR:
+    if "p_nom_max" in generator_strategies:
+        if generator_strategies["p_nom_max"] == "min": generator_strategies["p_nom_max"] = np.min
+
     clustering = get_clustering_from_busmap(n, busmap,
-                                            bus_strategies=dict(country=_make_consense("Bus", "country")),
+                                            bus_strategies=bus_strategies,
                                             aggregate_generators_weighted=True,
                                             aggregate_generators_carriers=None,
                                             aggregate_one_ports=["Load", "StorageUnit"],
                                             line_length_factor=1.0,
-                                            generator_strategies={'p_nom_max': 'sum'},
+                                            generator_strategies=generator_strategies,
                                             scale_link_capital_costs=False)
         
     return clustering.network, busmap
 
 
-def cluster(n, n_clusters, config):
+def cluster(n, n_clusters, config, aggregation_strategies=dict()):
     logger.info(f"Clustering to {n_clusters} buses")
 
     focus_weights = config.get('focus_weights', None)
@@ -365,16 +375,9 @@ def cluster(n, n_clusters, config):
     renewable_carriers = pd.Index([tech
                                     for tech in n.generators.carrier.unique()
                                     if tech.split('-', 2)[0] in config['renewable']])
-    def consense(x):
-        v = x.iat[0]
-        assert ((x == v).all() or x.isnull().all()), (
-            "The `potential` configuration option must agree for all renewable carriers, for now!"
-        )
-        return v
-    potential_mode = (consense(pd.Series([config['renewable'][tech]['potential']
-                                            for tech in renewable_carriers]))
-                        if len(renewable_carriers) > 0 else 'conservative')
-    clustering = clustering_for_n_clusters(n, n_clusters, custom_busmap=False, potential_mode=potential_mode,
+
+    clustering = clustering_for_n_clusters(n, n_clusters, custom_busmap=False,
+                                           aggregation_strategies=aggregation_strategies,
                                            solver_name=config['solving']['solver']['name'],
                                            focus_weights=focus_weights)
 
@@ -389,6 +392,8 @@ if __name__ == "__main__":
 
     n = pypsa.Network(snakemake.input.network)
 
+    aggregation_strategies = snakemake.config["clustering"].get("aggregation_strategies", {})
+
     n, trafo_map = simplify_network_to_380(n)
 
     Nyears = n.snapshot_weightings.objective.sum() / 8760
@@ -402,11 +407,11 @@ if __name__ == "__main__":
     busmaps = [trafo_map, simplify_links_map, stub_map]
 
     if snakemake.config.get('clustering', {}).get('simplify', {}).get('to_substations', False):
-        n, substation_map = aggregate_to_substations(n)
+        n, substation_map = aggregate_to_substations(n, snakemake.config, aggregation_strategies)
         busmaps.append(substation_map)
 
     if snakemake.wildcards.simpl:
-        n, cluster_map = cluster(n, int(snakemake.wildcards.simpl), snakemake.config)
+        n, cluster_map = cluster(n, int(snakemake.wildcards.simpl), snakemake.config, aggregation_strategies)
         busmaps.append(cluster_map)
 
     # some entries in n.buses are not updated in previous functions, therefore can be wrong. as they are not needed

From b4d8dd8ecb827c3ed450375e80fcf3717cc2aaab Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 20 Jun 2022 19:21:08 +0200
Subject: [PATCH 12/20] add changes from PR #379

---
 config.default.yaml  | 9 +++++++--
 config.tutorial.yaml | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/config.default.yaml b/config.default.yaml
index 9462719b..7162d449 100755
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -24,8 +24,13 @@ clustering:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
   aggregation_strategies:
     generators:
-      p_nom_max: "sum" # use "min" for more conservative assumptions
-      p_nom_min: "sum"
+      p_nom_max: sum # use "min" for more conservative assumptions
+      p_nom_min: sum
+      p_min_pu: mean
+      marginal_cost: mean
+      committable: any
+      ramp_limit_up: max
+      ramp_limit_down: max
 
 snapshots:
   start: "2013-01-01"
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index 4dc7a94d..f18f23f4 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -24,8 +24,13 @@ clustering:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
   aggregation_strategies:
     generators:
-      p_nom_max: "sum" # use "min" for more conservative assumptions
-      p_nom_min: "sum"
+      p_nom_max: sum # use "min" for more conservative assumptions
+      p_nom_min: sum
+      p_min_pu: mean
+      marginal_cost: mean
+      committable: any
+      ramp_limit_up: max
+      ramp_limit_down: max
 
 snapshots:
   start: "2013-03-01"

From bdd0cc3aa125fa5a721357ebfc47418b79475716 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Tue, 21 Jun 2022 18:42:49 +0200
Subject: [PATCH 13/20] clustering strats to configurables: review suggestions

---
 scripts/cluster_network.py  | 11 +++++++----
 scripts/simplify_network.py | 19 +++++++++++--------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 0a3d768a..fd66b043 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -263,11 +263,8 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
 
     bus_strategies = dict(country=_make_consense("Bus", "country"))
     bus_strategies.update(aggregation_strategies.get("buses", {}))
-    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
 
-    # this snippet supports compatibility of PyPSA and PyPSA-EUR:
-    if "p_nom_max" in generator_strategies:
-        if generator_strategies["p_nom_max"] == "min": generator_strategies["p_nom_max"] = np.min
+    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
 
     if not isinstance(custom_busmap, pd.Series):
         busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm)
@@ -369,6 +366,12 @@ if __name__ == "__main__":
             )
             return v
         aggregation_strategies = snakemake.config["clustering"].get("aggregation_strategies", {})
+        aggregation_strategies = {}
+        # translate str entries of aggregation_strategies to pd.Series functions:
+        aggregation_strategies = {
+            p: {k: getattr(pd.Series, v) for k,v in aggregation_strategies[p].items()}
+            for p in aggregation_strategies.keys()
+        }
 
         custom_busmap = snakemake.config["enable"].get("custom_busmap", False)
         if custom_busmap:
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 7d51c511..52e0c815 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -320,7 +320,7 @@ def remove_stubs(n, costs, config, output):
 
     return n, busmap
 
-def aggregate_to_substations(n, config, aggregation_strategies=dict(), buses_i=None):
+def aggregate_to_substations(n, aggregation_strategies=dict(), buses_i=None):
     # can be used to aggregate a selection of buses to electrically closest neighbors
     # if no buses are given, nodes that are no substations or without offshore connection are aggregated
     
@@ -345,15 +345,13 @@ def aggregate_to_substations(n, config, aggregation_strategies=dict(), buses_i=N
     busmap = n.buses.index.to_series()
     busmap.loc[buses_i] = dist.idxmin(1)
 
-    # default aggregation strategies must be specified within the function, otherwise (when defaults are passed in
-    # the function's definition) they get lost in case custom values for different variables are specified in the config
+    # default aggregation strategies must be specified within the function, otherwise (when defaults
+    # are passed in the function's definition) they get lost in case custom values for different
+    # variables are specified in the config.
     bus_strategies = dict(country=_make_consense("Bus", "country"))
     bus_strategies.update(aggregation_strategies.get("buses", {}))
-    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
 
-    # this snippet supports compatibility of PyPSA and PyPSA-EUR:
-    if "p_nom_max" in generator_strategies:
-        if generator_strategies["p_nom_max"] == "min": generator_strategies["p_nom_max"] = np.min
+    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
 
     clustering = get_clustering_from_busmap(n, busmap,
                                             bus_strategies=bus_strategies,
@@ -393,6 +391,11 @@ if __name__ == "__main__":
     n = pypsa.Network(snakemake.input.network)
 
     aggregation_strategies = snakemake.config["clustering"].get("aggregation_strategies", {})
+    # translate str entries of aggregation_strategies to pd.Series functions:
+    aggregation_strategies = {
+        p: {k: getattr(pd.Series, v) for k,v in aggregation_strategies[p].items()}
+        for p in aggregation_strategies.keys()
+    }
 
     n, trafo_map = simplify_network_to_380(n)
 
@@ -407,7 +410,7 @@ if __name__ == "__main__":
     busmaps = [trafo_map, simplify_links_map, stub_map]
 
     if snakemake.config.get('clustering', {}).get('simplify', {}).get('to_substations', False):
-        n, substation_map = aggregate_to_substations(n, snakemake.config, aggregation_strategies)
+        n, substation_map = aggregate_to_substations(n, aggregation_strategies)
         busmaps.append(substation_map)
 
     if snakemake.wildcards.simpl:

From 8dba48c4125a5578a65f4cf96c94f33780fb36ca Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Tue, 21 Jun 2022 19:08:22 +0200
Subject: [PATCH 14/20] clustering strats to configurables: documentation and
 testing

---
 doc/configtables/clustering.csv | 5 +++++
 doc/release_notes.rst           | 1 +
 test/config.test1.yaml          | 9 +++++++++
 3 files changed, 15 insertions(+)

diff --git a/doc/configtables/clustering.csv b/doc/configtables/clustering.csv
index 2f63f955..f178ff5c 100644
--- a/doc/configtables/clustering.csv
+++ b/doc/configtables/clustering.csv
@@ -1,3 +1,8 @@
 ,Unit,Values,Description
 simplify,,,
 -- to_substations,bool,"{'true','false'}","Aggregates all nodes without power injection (positive or negative, i.e. demand or generation) to electrically closest ones"
+-- aggregation_strategies,,,
+-- -- generators,,,
+-- -- -- {key},str,"{key} can be any of the component of the generator (str). It’s value can be any that can be converted to pandas.Series using getattr(). For example one of {min, max, sum}.","Aggregates the component according to the given strategy. For example, if sum, then all values within each cluster are summed to represent the new generator."
+-- -- buses,,,
+-- -- -- {key},str,"{key} can be any of the component of the bus (str). It’s value can be any that can be converted to pandas.Series using getattr(). For example one of {min, max, sum}.","Aggregates the component according to the given strategy. For example, if sum, then all values within each cluster are summed to represent the new bus."
diff --git a/doc/release_notes.rst b/doc/release_notes.rst
index 963a1175..c000a046 100644
--- a/doc/release_notes.rst
+++ b/doc/release_notes.rst
@@ -74,6 +74,7 @@ Upcoming Release
 
 * Update rasterio version to correctly calculate exclusion raster
 
+* Clustering strategies for generators and buses have moved from distinct scripts to configurables to unify the process and make it more transparent.
 
 PyPSA-Eur 0.4.0 (22th September 2021)
 =====================================
diff --git a/test/config.test1.yaml b/test/config.test1.yaml
index a9ce1e50..e3f39ab5 100755
--- a/test/config.test1.yaml
+++ b/test/config.test1.yaml
@@ -21,6 +21,15 @@ countries: ['BE']
 clustering:
   simplify:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
+  aggregation_strategies:
+    generators:
+      p_nom_max: sum # use "min" for more conservative assumptions
+      p_nom_min: sum
+      p_min_pu: mean
+      marginal_cost: mean
+      committable: any
+      ramp_limit_up: max
+      ramp_limit_down: max
 
 snapshots:
   start: "2013-03-01"

From c9c738e96b2240c8522f84a1941b163069c0e351 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Fri, 24 Jun 2022 20:57:53 +0200
Subject: [PATCH 15/20] clustering strats to configurables: set defaults for
 yaml-incompatible strats + include in stubaggregation

---
 config.default.yaml         |  1 +
 config.tutorial.yaml        |  1 +
 scripts/cluster_network.py  |  6 +++---
 scripts/simplify_network.py | 37 +++++++++++++++++++++++++------------
 test/config.test1.yaml      |  1 +
 5 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/config.default.yaml b/config.default.yaml
index 7162d449..144f416e 100755
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -31,6 +31,7 @@ clustering:
       committable: any
       ramp_limit_up: max
       ramp_limit_down: max
+      efficiency: mean
 
 snapshots:
   start: "2013-01-01"
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index f18f23f4..31ca7f99 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -31,6 +31,7 @@ clustering:
       committable: any
       ramp_limit_up: max
       ramp_limit_down: max
+      efficiency:mean
 
 snapshots:
   start: "2013-03-01"
diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index fd66b043..93cd89cd 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -264,7 +264,8 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
     bus_strategies = dict(country=_make_consense("Bus", "country"))
     bus_strategies.update(aggregation_strategies.get("buses", {}))
 
-    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
+    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
+    generator_strategies.update(aggregation_strategies.get("generators", {}))
 
     if not isinstance(custom_busmap, pd.Series):
         busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm)
@@ -366,7 +367,6 @@ if __name__ == "__main__":
             )
             return v
         aggregation_strategies = snakemake.config["clustering"].get("aggregation_strategies", {})
-        aggregation_strategies = {}
         # translate str entries of aggregation_strategies to pd.Series functions:
         aggregation_strategies = {
             p: {k: getattr(pd.Series, v) for k,v in aggregation_strategies[p].items()}
@@ -383,7 +383,7 @@ if __name__ == "__main__":
                                                line_length_factor, aggregation_strategies,
                                                snakemake.config['solving']['solver']['name'],
                                                "kmeans", hvac_overhead_cost, focus_weights)
-    
+
     update_p_nom_max(clustering.network)
 
     clustering.network.export_to_netcdf(snakemake.output.network)
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 52e0c815..0fda5c77 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -189,7 +189,10 @@ def _adjust_capital_costs_using_connection_costs(n, connection_costs_to_bus, out
             
 
 
-def _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output, aggregate_one_ports={"Load", "StorageUnit"}):
+def _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output,
+                                   aggregate_one_ports={"Load", "StorageUnit"},
+                                   aggregation_strategies=dict()):
+
     def replace_components(n, c, df, pnl):
         n.mremove(c, n.df(c).index)
 
@@ -200,7 +203,12 @@ def _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output, a
 
     _adjust_capital_costs_using_connection_costs(n, connection_costs_to_bus, output)
 
-    generators, generators_pnl = aggregategenerators(n, busmap, custom_strategies={'p_nom_min': np.sum})
+    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
+    generator_strategies.update(aggregation_strategies.get("generators", {}))
+
+    generators, generators_pnl = aggregategenerators(
+        n, busmap, custom_strategies=generator_strategies
+    )
     replace_components(n, "Generator", generators, generators_pnl)
 
     for one_port in aggregate_one_ports:
@@ -214,7 +222,7 @@ def _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output, a
         n.mremove(c, df.index[df.bus0.isin(buses_to_del) | df.bus1.isin(buses_to_del)])
 
 
-def simplify_links(n, costs, config, output):
+def simplify_links(n, costs, config, output, aggregation_strategies=dict()):
     ## Complex multi-node links are folded into end-points
     logger.info("Simplifying connected link components")
 
@@ -306,17 +314,19 @@ def simplify_links(n, costs, config, output):
 
     logger.debug("Collecting all components using the busmap")
 
-    _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output)
+    _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output,
+                                   aggregation_strategies=aggregation_strategies)
     return n, busmap
 
-def remove_stubs(n, costs, config, output):
+def remove_stubs(n, costs, config, output, aggregation_strategies=dict()):
     logger.info("Removing stubs")
 
     busmap = busmap_by_stubs(n) #  ['country'])
 
     connection_costs_to_bus = _compute_connection_costs_to_bus(n, busmap, costs, config)
 
-    _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output)
+    _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output,
+                                   aggregation_strategies=aggregation_strategies)
 
     return n, busmap
 
@@ -345,13 +355,14 @@ def aggregate_to_substations(n, aggregation_strategies=dict(), buses_i=None):
     busmap = n.buses.index.to_series()
     busmap.loc[buses_i] = dist.idxmin(1)
 
-    # default aggregation strategies must be specified within the function, otherwise (when defaults
-    # are passed in the function's definition) they get lost in case custom values for different
-    # variables are specified in the config.
+    # default aggregation strategies that cannot be defined in .yaml format must be specified within
+    # the function, otherwise (when defaults are passed in the function's definition) they get lost
+    # in case custom values for different variables are specified in the config.
     bus_strategies = dict(country=_make_consense("Bus", "country"))
     bus_strategies.update(aggregation_strategies.get("buses", {}))
 
-    generator_strategies = aggregation_strategies.get("generators", {"p_nom_max": "sum"})
+    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
+    generator_strategies.update(aggregation_strategies.get("generators", {}))
 
     clustering = get_clustering_from_busmap(n, busmap,
                                             bus_strategies=bus_strategies,
@@ -403,9 +414,11 @@ if __name__ == "__main__":
 
     technology_costs = load_costs(snakemake.input.tech_costs, snakemake.config['costs'], snakemake.config['electricity'], Nyears)
 
-    n, simplify_links_map = simplify_links(n, technology_costs, snakemake.config, snakemake.output)
+    n, simplify_links_map = simplify_links(n, technology_costs, snakemake.config, snakemake.output,
+                                           aggregation_strategies)
 
-    n, stub_map = remove_stubs(n, technology_costs, snakemake.config, snakemake.output)
+    n, stub_map = remove_stubs(n, technology_costs, snakemake.config, snakemake.output,
+                               aggregation_strategies=aggregation_strategies)
 
     busmaps = [trafo_map, simplify_links_map, stub_map]
 
diff --git a/test/config.test1.yaml b/test/config.test1.yaml
index e3f39ab5..6d626f7e 100755
--- a/test/config.test1.yaml
+++ b/test/config.test1.yaml
@@ -30,6 +30,7 @@ clustering:
       committable: any
       ramp_limit_up: max
       ramp_limit_down: max
+      efficiency: mean
 
 snapshots:
   start: "2013-03-01"

From 2dfa2753ba8b5de6df56d6e6f9b278aa32047d55 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 27 Jun 2022 13:59:04 +0200
Subject: [PATCH 16/20] clustering strats to configurables: config spacing

---
 config.tutorial.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index 31ca7f99..edf0091c 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -31,7 +31,7 @@ clustering:
       committable: any
       ramp_limit_up: max
       ramp_limit_down: max
-      efficiency:mean
+      efficiency: mean
 
 snapshots:
   start: "2013-03-01"

From a3af137b74d6e505e2dae32fc7cea2b2226f834c Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 27 Jun 2022 14:18:47 +0200
Subject: [PATCH 17/20] clustering strats to configurables: move duplicate code
 to _helpers script & import

---
 scripts/_helpers.py         | 16 ++++++++++++++++
 scripts/cluster_network.py  |  8 ++------
 scripts/simplify_network.py | 14 +++-----------
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/scripts/_helpers.py b/scripts/_helpers.py
index 6e47c053..3c3e6ac7 100644
--- a/scripts/_helpers.py
+++ b/scripts/_helpers.py
@@ -210,6 +210,22 @@ def progress_retrieve(url, file):
 
     urllib.request.urlretrieve(url, file, reporthook=dlProgress)
 
+def get_aggregation_strategies(aggregation_strategies):
+    # default aggregation strategies that cannot be defined in .yaml format must be specified within
+    # the function, otherwise (when defaults are passed in the function's definition) they get lost
+    # when custom values are specified in the config.
+
+    import numpy as np
+    from pypsa.networkclustering import _make_consense
+
+    bus_strategies = dict(country=_make_consense("Bus", "country"))
+    bus_strategies.update(aggregation_strategies.get("buses", {}))
+
+    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
+    generator_strategies.update(aggregation_strategies.get("generators", {}))
+
+    return bus_strategies, generator_strategies
+
 
 def mock_snakemake(rulename, **wildcards):
     """
diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index 93cd89cd..6186a00e 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -121,7 +121,7 @@ Exemplary unsolved network clustered to 37 nodes:
 """
 
 import logging
-from _helpers import configure_logging, update_p_nom_max
+from _helpers import configure_logging, update_p_nom_max, get_aggregation_strategies
 
 import pypsa
 import os
@@ -261,11 +261,7 @@ def clustering_for_n_clusters(n, n_clusters, custom_busmap=False, aggregate_carr
                               line_length_factor=1.25, aggregation_strategies=dict(), solver_name="cbc",
                               algorithm="kmeans", extended_link_costs=0, focus_weights=None):
 
-    bus_strategies = dict(country=_make_consense("Bus", "country"))
-    bus_strategies.update(aggregation_strategies.get("buses", {}))
-
-    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
-    generator_strategies.update(aggregation_strategies.get("generators", {}))
+    bus_strategies, generator_strategies = get_aggregation_strategies(aggregation_strategies)
 
     if not isinstance(custom_busmap, pd.Series):
         busmap = busmap_for_n_clusters(n, n_clusters, solver_name, focus_weights, algorithm)
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 0fda5c77..37e7e698 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -83,7 +83,7 @@ The rule :mod:`simplify_network` does up to four things:
 """
 
 import logging
-from _helpers import configure_logging, update_p_nom_max
+from _helpers import configure_logging, update_p_nom_max, get_aggregation_strategies
 
 from cluster_network import clustering_for_n_clusters, cluster_regions
 from add_electricity import load_costs
@@ -203,8 +203,7 @@ def _aggregate_and_move_components(n, busmap, connection_costs_to_bus, output,
 
     _adjust_capital_costs_using_connection_costs(n, connection_costs_to_bus, output)
 
-    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
-    generator_strategies.update(aggregation_strategies.get("generators", {}))
+    _, generator_strategies = get_aggregation_strategies(aggregation_strategies)
 
     generators, generators_pnl = aggregategenerators(
         n, busmap, custom_strategies=generator_strategies
@@ -355,14 +354,7 @@ def aggregate_to_substations(n, aggregation_strategies=dict(), buses_i=None):
     busmap = n.buses.index.to_series()
     busmap.loc[buses_i] = dist.idxmin(1)
 
-    # default aggregation strategies that cannot be defined in .yaml format must be specified within
-    # the function, otherwise (when defaults are passed in the function's definition) they get lost
-    # in case custom values for different variables are specified in the config.
-    bus_strategies = dict(country=_make_consense("Bus", "country"))
-    bus_strategies.update(aggregation_strategies.get("buses", {}))
-
-    generator_strategies = {'build_year': lambda x: 0, 'lifetime': lambda x: np.inf}
-    generator_strategies.update(aggregation_strategies.get("generators", {}))
+    bus_strategies, generator_strategies = get_aggregation_strategies(aggregation_strategies)
 
     clustering = get_clustering_from_busmap(n, busmap,
                                             bus_strategies=bus_strategies,

From 51de606aab6eddcba8d121873f779c4ce995afa8 Mon Sep 17 00:00:00 2001
From: Fabian Neumann <fabian.neumann@outlook.de>
Date: Mon, 27 Jun 2022 19:00:41 +0200
Subject: [PATCH 18/20] Revert "remove build/retrieve natura raster, directly
 use shapefile"

---
 .github/workflows/ci.yaml                    | 23 ++---
 Snakefile                                    | 27 ++++--
 config.default.yaml                          |  2 +
 config.tutorial.yaml                         | 10 ++-
 doc/configtables/toplevel.csv                |  2 +
 doc/preparation.rst                          |  2 +
 doc/preparation/build_natura_raster.rst      | 39 +++++++++
 doc/preparation/build_renewable_profiles.rst |  3 +
 doc/preparation/retrieve.rst                 | 27 ++++++
 doc/release_notes.rst                        |  5 --
 doc/tutorial.rst                             |  4 +-
 envs/environment.yaml                        |  1 -
 scripts/build_natura_raster.py               | 89 ++++++++++++++++++++
 scripts/build_renewable_profiles.py          |  4 +-
 test/config.test1.yaml                       |  2 +
 15 files changed, 200 insertions(+), 40 deletions(-)
 create mode 100644 doc/preparation/build_natura_raster.rst
 create mode 100644 scripts/build_natura_raster.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index d447a56f..c753deab 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,12 +13,13 @@ on:
     branches:
     - master
   pull_request:
+    branches:
+    - master
   schedule:
     - cron: "0 5 * * TUE"
 
 env:
-  CONDA_CACHE_NUMBER: 1 # Change this value to manually reset the environment cache
-  DATA_CACHE_NUMBER: 1
+  CACHE_NUMBER: 1 # Change this value to manually reset the environment cache
 
 jobs:
   build:
@@ -65,26 +66,16 @@ jobs:
             miniforge-version: latest
             activate-environment: pypsa-eur
             use-mamba: true
-
-      - name: Set cache dates
-        run: |
-          echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV
-          echo "WEEK=$(date +'%Y%U')" >> $GITHUB_ENV
-
-      - name: Cache data and cutouts folders
-        uses: actions/cache@v3
-        with:
-          path: |
-            data
-            cutouts
-          key: data-cutouts-${{ env.WEEK }}-${{ env.DATA_CACHE_NUMBER }}
+            
+      - name: Set cache date
+        run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV
 
       - name: Create environment cache
         uses: actions/cache@v2
         id: cache
         with:
           path: ${{ matrix.prefix }}
-          key: ${{ matrix.label }}-conda-${{ hashFiles('envs/environment.yaml') }}-${{ env.DATE }}-${{ env.CONDA_CACHE_NUMBER }}
+          key: ${{ matrix.label }}-conda-${{ hashFiles('envs/environment.yaml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
 
       - name: Update environment due to outdated or unavailable cache
         run: mamba env update -n pypsa-eur -f envs/environment.yaml
diff --git a/Snakefile b/Snakefile
index 42ecc45b..7678a401 100644
--- a/Snakefile
+++ b/Snakefile
@@ -67,12 +67,6 @@ if config['enable'].get('retrieve_databundle', True):
         script: 'scripts/retrieve_databundle.py'
 
 
-rule retrieve_natura_data:
-    input: HTTP.remote("sdi.eea.europa.eu/datashare/s/H6QGCybMdLLnywo/download", additional_request_string="?path=%2FNatura2000_end2020_gpkg&files=Natura2000_end2020.gpkg", static=True)
-    output: "data/Natura2000_end2020.gpkg"
-    run: move(input[0], output[0])
-
-
 rule retrieve_load_data:
     input: HTTP.remote("data.open-power-system-data.org/time_series/2019-06-05/time_series_60min_singleindex.csv", keep_local=True, static=True)
     output: "data/load_raw.csv"
@@ -171,13 +165,28 @@ if config['enable'].get('retrieve_cutout', True):
         run: move(input[0], output[0])
 
 
+if config['enable'].get('build_natura_raster', False):
+    rule build_natura_raster:
+        input:
+            natura="data/bundle/natura/Natura2000_end2015.shp",
+            cutouts=expand("cutouts/{cutouts}.nc", **config['atlite'])
+        output: "resources/natura.tiff"
+        log: "logs/build_natura_raster.log"
+        script: "scripts/build_natura_raster.py"
+
+
+if config['enable'].get('retrieve_natura_raster', True):
+    rule retrieve_natura_raster:
+        input: HTTP.remote("zenodo.org/record/4706686/files/natura.tiff", keep_local=True, static=True)
+        output: "resources/natura.tiff"
+        run: move(input[0], output[0])
+
+
 rule build_renewable_profiles:
     input:
         base_network="networks/base.nc",
         corine="data/bundle/corine/g250_clc06_V18_5.tif",
-        natura=lambda w: ("data/Natura2000_end2020.gpkg"
-                          if config["renewable"][w.technology]["natura"]
-                          else []),
+        natura="resources/natura.tiff",
         gebco=lambda w: ("data/bundle/GEBCO_2014_2D.nc"
                          if "max_depth" in config["renewable"][w.technology].keys()
                          else []),
diff --git a/config.default.yaml b/config.default.yaml
index 7e6a1f78..144f416e 100755
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -43,6 +43,8 @@ enable:
   retrieve_databundle: true
   build_cutout: false
   retrieve_cutout: true
+  build_natura_raster: false
+  retrieve_natura_raster: true
   custom_busmap: false
 
 electricity:
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index ba0bb01a..edf0091c 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -43,6 +43,8 @@ enable:
   retrieve_databundle: true
   build_cutout: false
   retrieve_cutout: true
+  build_natura_raster: false
+  retrieve_natura_raster: true
   custom_busmap: false
 
 electricity:
@@ -87,7 +89,7 @@ renewable:
                    24, 25, 26, 27, 28, 29, 31, 32]
       distance: 1000
       distance_grid_codes: [1, 2, 3, 4, 5, 6]
-    natura: false
+    natura: true
     potential: simple # or conservative
     clip_p_max_pu: 1.e-2
   offwind-ac:
@@ -98,7 +100,7 @@ renewable:
     capacity_per_sqkm: 3
     # correction_factor: 0.93
     corine: [44, 255]
-    natura: false
+    natura: true
     max_shore_distance: 30000
     potential: simple # or conservative
     clip_p_max_pu: 1.e-2
@@ -111,7 +113,7 @@ renewable:
     capacity_per_sqkm: 3
     # correction_factor: 0.93
     corine: [44, 255]
-    natura: false
+    natura: true
     min_shore_distance: 30000
     potential: simple # or conservative
     clip_p_max_pu: 1.e-2
@@ -133,7 +135,7 @@ renewable:
     # correction_factor: 0.854337
     corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
              14, 15, 16, 17, 18, 19, 20, 26, 31, 32]
-    natura: false
+    natura: true
     potential: simple # or conservative
     clip_p_max_pu: 1.e-2
 
diff --git a/doc/configtables/toplevel.csv b/doc/configtables/toplevel.csv
index 8965a0bc..b7f39d05 100644
--- a/doc/configtables/toplevel.csv
+++ b/doc/configtables/toplevel.csv
@@ -12,4 +12,6 @@ enable,,,
 -- retrieve_databundle,bool,"{true, false}","Switch to retrieve databundle from zenodo via the rule :mod:`retrieve_databundle` or whether to keep a custom databundle located in the corresponding folder."
 -- build_cutout,bool,"{true, false}","Switch to enable the building of cutouts via the rule :mod:`build_cutout`."
 -- retrieve_cutout,bool,"{true, false}","Switch to enable the retrieval of cutouts from zenodo with :mod:`retrieve_cutout`."
+-- build_natura_raster,bool,"{true, false}","Switch to enable the creation of the raster ``natura.tiff`` via the rule :mod:`build_natura_raster`."
+-- retrieve_natura_raster,bool,"{true, false}","Switch to enable the retrieval of ``natura.tiff`` from zenodo with :mod:`retrieve_natura_raster`."
 -- custom_busmap,bool,"{true, false}","Switch to enable the use of custom busmaps in rule :mod:`cluster_network`. If activated the rule looks for provided busmaps at ``data/custom_busmap_elec_s{simpl}_{clusters}.csv`` which should have the same format as ``resources/busmap_elec_s{simpl}_{clusters}.csv``, i.e. the index should contain the buses of ``networks/elec_s{simpl}.nc``."
diff --git a/doc/preparation.rst b/doc/preparation.rst
index 7f42190c..dba5e981 100644
--- a/doc/preparation.rst
+++ b/doc/preparation.rst
@@ -27,6 +27,7 @@ With these and the externally extracted ENTSO-E online map topology
 Then the process continues by calculating conventional power plant capacities, potentials, and per-unit availability time series for variable renewable energy carriers and hydro power plants with the following rules:
 
 - :mod:`build_powerplants` for today's thermal power plant capacities using `powerplantmatching <https://github.com/FRESNA/powerplantmatching>`_ allocating these to the closest substation for each powerplant,
+- :mod:`build_natura_raster` for rasterising NATURA2000 natural protection areas,
 - :mod:`build_renewable_profiles` for the hourly capacity factors and installation potentials constrained by land-use in each substation's Voronoi cell for PV, onshore and offshore wind, and
 - :mod:`build_hydro_profile` for the hourly per-unit hydro power availability time series.
 
@@ -40,6 +41,7 @@ together into a detailed PyPSA network stored in ``networks/elec.nc``.
    preparation/build_shapes
    preparation/build_load_data
    preparation/build_cutout
+   preparation/build_natura_raster
    preparation/prepare_links_p_nom
    preparation/base_network
    preparation/build_bus_regions
diff --git a/doc/preparation/build_natura_raster.rst b/doc/preparation/build_natura_raster.rst
new file mode 100644
index 00000000..e3ec4364
--- /dev/null
+++ b/doc/preparation/build_natura_raster.rst
@@ -0,0 +1,39 @@
+..
+  SPDX-FileCopyrightText: 2019-2020 The PyPSA-Eur Authors
+
+  SPDX-License-Identifier: CC-BY-4.0
+
+.. _natura:
+
+Rule ``build_natura_raster``
+===============================
+
+.. graphviz::
+    :align: center
+
+    digraph snakemake_dag {
+        graph [bgcolor=white,
+            margin=0,
+            size="8,5"
+        ];
+        node [fontname=sans,
+            fontsize=10,
+            penwidth=2,
+            shape=box,
+            style=rounded
+        ];
+        edge [color=grey,
+            penwidth=2
+        ];
+        9	 [color="0.22 0.6 0.85",
+            label=build_renewable_profiles];
+        12	 [color="0.31 0.6 0.85",
+            fillcolor=gray,
+            label=build_natura_raster,
+            style=filled];
+        12 -> 9;
+    }
+
+|
+
+.. automodule:: build_natura_raster
diff --git a/doc/preparation/build_renewable_profiles.rst b/doc/preparation/build_renewable_profiles.rst
index adc4d6ca..27e61583 100644
--- a/doc/preparation/build_renewable_profiles.rst
+++ b/doc/preparation/build_renewable_profiles.rst
@@ -41,6 +41,9 @@ Rule ``build_renewable_profiles``
         8	 [color="0.00 0.6 0.85",
             label=build_shapes];
         8 -> 9;
+        12	 [color="0.31 0.6 0.85",
+            label=build_natura_raster];
+        12 -> 9;
         13	 [color="0.56 0.6 0.85",
             label=build_cutout];
         13 -> 9;
diff --git a/doc/preparation/retrieve.rst b/doc/preparation/retrieve.rst
index 21187924..42479284 100644
--- a/doc/preparation/retrieve.rst
+++ b/doc/preparation/retrieve.rst
@@ -50,3 +50,30 @@ The :ref:`tutorial` uses a smaller cutout than required for the full model (30 M
 
 .. seealso::
     For details see :mod:`build_cutout` and read the `atlite documentation <https://atlite.readthedocs.io>`_.
+
+
+Rule ``retrieve_natura_raster``
+-------------------------------
+
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.4706686.svg
+   :target: https://doi.org/10.5281/zenodo.4706686
+
+This rule, as a substitute for :mod:`build_natura_raster`, downloads an already rasterized version (`natura.tiff <https://zenodo.org/record/4706686/files/natura.tiff>`_) of `Natura 2000 <https://en.wikipedia.org/wiki/Natura_2000>`_ natural protection areas to reduce computation times. The file is placed into the ``resources`` sub-directory.
+
+**Relevant Settings**
+
+.. code:: yaml
+
+    enable:
+        build_natura_raster:
+
+.. seealso::
+    Documentation of the configuration file ``config.yaml`` at
+    :ref:`toplevel_cf`
+
+**Outputs**
+
+- ``resources/natura.tiff``: Rasterized version of `Natura 2000 <https://en.wikipedia.org/wiki/Natura_2000>`_ natural protection areas to reduce computation times.
+
+.. seealso::
+    For details see :mod:`build_natura_raster`.
diff --git a/doc/release_notes.rst b/doc/release_notes.rst
index df7af28e..c000a046 100644
--- a/doc/release_notes.rst
+++ b/doc/release_notes.rst
@@ -62,11 +62,6 @@ Upcoming Release
 
 * New network topology extracted from the ENTSO-E interactive map.
 
-* Remove rules to build or retrieve rasterized NATURA 2000 dataset. Renewable potential calculation now directly uses the shapefiles.
-
-* Cache data and cutouts folders. This cache will be updated weekly.
-
-* Add rule to automatically retrieve Natura2000 natural protection areas. Switch of file format to GPKG.
 * The unused argument ``simple_hvdc_costs`` in :mod:`add_electricity` was removed.
 
 * Iterative solving with impedance updates is skipped if there are no expandable lines.
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 1c247f8e..c37abb39 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -35,8 +35,8 @@ To run the tutorial, use this as your configuration file ``config.yaml``.
 
     .../pypsa-eur % cp config.tutorial.yaml config.yaml
 
-This configuration is set to download a reduced data set via the rules :mod:`retrieve_databundle`
-and :mod:`retrieve_cutout` totalling at less than 250 MB.
+This configuration is set to download a reduced data set via the rules :mod:`retrieve_databundle`,
+:mod:`retrieve_natura_raster`, :mod:`retrieve_cutout` totalling at less than 250 MB.
 The full set of data dependencies would consume 5.3 GB.
 For more information on the data dependencies of PyPSA-Eur, continue reading :ref:`data`.
 
diff --git a/envs/environment.yaml b/envs/environment.yaml
index fd6dff49..f8060de1 100644
--- a/envs/environment.yaml
+++ b/envs/environment.yaml
@@ -45,7 +45,6 @@ dependencies:
   # GIS dependencies:
   - cartopy
   - descartes
-  - fiona # explicit for Windows
   - rasterio<=1.2.9  # 1.2.10 creates error https://github.com/PyPSA/atlite/issues/238
 
   # PyPSA-Eur-Sec Dependencies
diff --git a/scripts/build_natura_raster.py b/scripts/build_natura_raster.py
new file mode 100644
index 00000000..7fa9d544
--- /dev/null
+++ b/scripts/build_natura_raster.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: : 2017-2020 The PyPSA-Eur Authors
+#
+# SPDX-License-Identifier: MIT
+
+"""
+Rasters the vector data of the `Natura 2000 <https://en.wikipedia.org/wiki/Natura_2000>`_ natural protection areas onto all cutout regions.
+
+Relevant Settings
+-----------------
+
+.. code:: yaml
+
+    renewable:
+        {technology}:
+            cutout:
+
+.. seealso::
+    Documentation of the configuration file ``config.yaml`` at
+    :ref:`renewable_cf`
+
+Inputs
+------
+
+- ``data/bundle/natura/Natura2000_end2015.shp``: `Natura 2000 <https://en.wikipedia.org/wiki/Natura_2000>`_ natural protection areas.
+
+    .. image:: ../img/natura.png
+        :scale: 33 %
+
+Outputs
+-------
+
+- ``resources/natura.tiff``: Rasterized version of `Natura 2000 <https://en.wikipedia.org/wiki/Natura_2000>`_ natural protection areas to reduce computation times.
+
+    .. image:: ../img/natura.png
+        :scale: 33 %
+
+Description
+-----------
+
+"""
+
+import logging
+from _helpers import configure_logging
+
+import atlite
+import geopandas as gpd
+import rasterio as rio
+from rasterio.features import geometry_mask
+from rasterio.warp import transform_bounds
+
+logger = logging.getLogger(__name__)
+
+
+def determine_cutout_xXyY(cutout_name):
+    cutout = atlite.Cutout(cutout_name)
+    assert cutout.crs.to_epsg() == 4326
+    x, X, y, Y = cutout.extent
+    dx, dy = cutout.dx, cutout.dy
+    return [x - dx/2., X + dx/2., y - dy/2., Y + dy/2.]
+
+
+def get_transform_and_shape(bounds, res):
+    left, bottom = [(b // res)* res for b in bounds[:2]]
+    right, top = [(b // res + 1) * res for b in bounds[2:]]
+    shape = int((top - bottom) // res), int((right - left) / res)
+    transform = rio.Affine(res, 0, left, 0, -res, top)
+    return transform, shape
+
+
+if __name__ == "__main__":
+    if 'snakemake' not in globals():
+        from _helpers import mock_snakemake
+        snakemake = mock_snakemake('build_natura_raster')
+    configure_logging(snakemake)
+
+    cutouts = snakemake.input.cutouts
+    xs, Xs, ys, Ys = zip(*(determine_cutout_xXyY(cutout) for cutout in cutouts))
+    bounds = transform_bounds(4326, 3035, min(xs), min(ys), max(Xs), max(Ys))
+    transform, out_shape = get_transform_and_shape(bounds, res=100)
+
+    # adjusted boundaries
+    shapes = gpd.read_file(snakemake.input.natura).to_crs(3035)
+    raster = ~geometry_mask(shapes.geometry, out_shape[::-1], transform)
+    raster = raster.astype(rio.uint8)
+
+    with rio.open(snakemake.output[0], 'w', driver='GTiff', dtype=rio.uint8,
+                  count=1, transform=transform, crs=3035, compress='lzw',
+                  width=raster.shape[1], height=raster.shape[0]) as dst:
+        dst.write(raster, indexes=1)
diff --git a/scripts/build_renewable_profiles.py b/scripts/build_renewable_profiles.py
index a49df1f5..37e1e9de 100644
--- a/scripts/build_renewable_profiles.py
+++ b/scripts/build_renewable_profiles.py
@@ -227,9 +227,7 @@ if __name__ == '__main__':
     excluder = atlite.ExclusionContainer(crs=3035, res=100)
 
     if config['natura']:
-        mask = regions.to_crs(3035).buffer(0) # buffer to avoid invalid geometry
-        natura = gpd.read_file(snakemake.input.natura, mask=mask)
-        excluder.add_geometry(natura.geometry)
+        excluder.add_raster(snakemake.input.natura, nodata=0, allow_no_overlap=True)
 
     corine = config.get("corine", {})
     if "grid_codes" in corine:
diff --git a/test/config.test1.yaml b/test/config.test1.yaml
index 64b169a4..6d626f7e 100755
--- a/test/config.test1.yaml
+++ b/test/config.test1.yaml
@@ -42,6 +42,8 @@ enable:
   retrieve_databundle: true
   build_cutout: false
   retrieve_cutout: true
+  build_natura_raster: false
+  retrieve_natura_raster: true
   custom_busmap: false
 
 electricity:

From 29fe0fb7fb8ab34cde94dc01ca947cece2ed924c Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 27 Jun 2022 21:02:45 +0200
Subject: [PATCH 19/20] hierarchical clustering: account for changes from
 merging master

---
 scripts/cluster_network.py  | 1 +
 scripts/simplify_network.py | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/cluster_network.py b/scripts/cluster_network.py
index b1bf759d..59641a7e 100644
--- a/scripts/cluster_network.py
+++ b/scripts/cluster_network.py
@@ -12,6 +12,7 @@ Relevant Settings
 .. code:: yaml
 
     clustering:
+      cluster_network:
       aggregation_strategies:
 
     focus_weights:
diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py
index 7e258966..0b31e5c6 100644
--- a/scripts/simplify_network.py
+++ b/scripts/simplify_network.py
@@ -14,7 +14,8 @@ Relevant Settings
 .. code:: yaml
 
     clustering:
-      simplify:
+      simplify_network:
+      cluster_network:
       aggregation_strategies:
 
     costs:
@@ -415,7 +416,7 @@ if __name__ == "__main__":
     busmaps = [trafo_map, simplify_links_map, stub_map]
 
     cluster_config = snakemake.config.get('clustering', {}).get('simplify_network', {})
-    if cluster_config.get('clustering', {}).get('simplify', {}).get('to_substations', False):
+    if cluster_config.get('clustering', {}).get('simplify_network', {}).get('to_substations', False):
         n, substation_map = aggregate_to_substations(n, aggregation_strategies)
         busmaps.append(substation_map)
 
@@ -429,7 +430,7 @@ if __name__ == "__main__":
         for carrier in carriers:
             buses_i = list(set(n.buses.index)-set(n.generators.query("carrier == @carrier").bus))
             logger.info(f'clustering preparaton (hac): aggregating {len(buses_i)} buses of type {carrier}.')
-            n, busmap_hac = aggregate_to_substations(n, buses_i)
+            n, busmap_hac = aggregate_to_substations(n, aggregation_strategies, buses_i)
             busmaps.append(busmap_hac)
 
     if snakemake.wildcards.simpl:

From f1dedd9429fe6a5633fdfeead552abfacea48e39 Mon Sep 17 00:00:00 2001
From: martacki <martha.frysztacki@kit.edu>
Date: Mon, 27 Jun 2022 21:11:49 +0200
Subject: [PATCH 20/20] hierarchical clustering: release notes & revert to old
 default clustering algorithm (kmeans)

---
 config.default.yaml    | 8 ++++----
 config.tutorial.yaml   | 8 ++++----
 doc/release_notes.rst  | 2 ++
 test/config.test1.yaml | 8 ++++----
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/config.default.yaml b/config.default.yaml
index e699c38f..e1ac6237 100755
--- a/config.default.yaml
+++ b/config.default.yaml
@@ -22,11 +22,11 @@ countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'F
 clustering:
   simplify_network:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
-    algorithm: hac
-    feature: solar+onwind-time
-  cluster_network:
-    algorithm: hac # choose from: [hac, kmeans]
+    algorithm: kmeans # choose from: [hac, kmeans]
     feature: solar+onwind-time # only for hac. choose from: [solar+onwind-time, solar+onwind-cap, solar-time, solar-cap, solar+offwind-cap] etc.
+  cluster_network:
+    algorithm: kmeans
+    feature: solar+onwind-time
   aggregation_strategies:
     generators:
       p_nom_max: sum # use "min" for more conservative assumptions
diff --git a/config.tutorial.yaml b/config.tutorial.yaml
index 75703b8a..db20f1a4 100755
--- a/config.tutorial.yaml
+++ b/config.tutorial.yaml
@@ -22,11 +22,11 @@ countries: ['BE']
 clustering:
   simplify_network:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
-    algorithm: hac
-    feature: solar+onwind-time
-  cluster_network:
-    algorithm: hac # choose from: [hac, kmeans]
+    algorithm: kmeans # choose from: [hac, kmeans]
     feature: solar+onwind-time # only for hac. choose from: [solar+onwind-time, solar+onwind-cap, solar-time, solar-cap, solar+offwind-cap] etc.
+  cluster_network:
+    algorithm: kmeans
+    feature: solar+onwind-time
   aggregation_strategies:
     generators:
       p_nom_max: sum # use "min" for more conservative assumptions
diff --git a/doc/release_notes.rst b/doc/release_notes.rst
index c000a046..47a67970 100644
--- a/doc/release_notes.rst
+++ b/doc/release_notes.rst
@@ -76,6 +76,8 @@ Upcoming Release
 
 * Clustering strategies for generators and buses have moved from distinct scripts to configurables to unify the process and make it more transparent.
 
+* Hierarchical clustering was introduced. Distance metric is calculated from renewable potentials on hourly (feature entry ends with `-time`) or annual (feature entry in config end with `-cap`) values.
+
 PyPSA-Eur 0.4.0 (22th September 2021)
 =====================================
 
diff --git a/test/config.test1.yaml b/test/config.test1.yaml
index d8d095b8..5aecfdf2 100755
--- a/test/config.test1.yaml
+++ b/test/config.test1.yaml
@@ -21,11 +21,11 @@ countries: ['BE']
 clustering:
   simplify_network:
     to_substations: false # network is simplified to nodes with positive or negative power injection (i.e. substations or offwind connections)
-    algorithm: hac
-    feature: solar+onwind-time
-  cluster_network:
-    algorithm: hac # choose from: [hac, kmeans]
+    algorithm: kmeans # choose from: [hac, kmeans]
     feature: solar+onwind-time # only for hac. choose from: [solar+onwind-time, solar+onwind-cap, solar-time, solar-cap, solar+offwind-cap] etc.
+  cluster_network:
+    algorithm: kmeans
+    feature: solar+onwind-time
   aggregation_strategies:
     generators:
       p_nom_max: sum # use "min" for more conservative assumptions