# -*- coding: utf-8 -*- # SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors # # SPDX-License-Identifier: MIT """ This script is used to clean OpenStreetMap (OSM) data for creating a PyPSA-Eur ready network. The script performs various cleaning operations on the OSM data, including: - Cleaning voltage, circuits, cables, wires, and frequency columns - Splitting semicolon-separated cells into new rows - Distributing values to circuits based on the number of splits - Adding line endings to substations based on line data """ import json import logging import os import re import geopandas as gpd import numpy as np import pandas as pd from _helpers import configure_logging, set_scenario_config from shapely.geometry import LineString, MultiLineString, Point, Polygon from shapely.ops import linemerge, unary_union logger = logging.getLogger(__name__) def _create_linestring(row): """ Create a LineString object from the given row. Args: row (dict): A dictionary containing the row data. Returns: LineString: A LineString object representing the geometry. """ coords = [(coord["lon"], coord["lat"]) for coord in row["geometry"]] return LineString(coords) def _create_polygon(row): """ Create a Shapely Polygon from a list of coordinate dictionaries. Parameters: coords (list): List of dictionaries with 'lat' and 'lon' keys representing coordinates. Returns: shapely.geometry.Polygon: The constructed polygon object. """ # Extract coordinates as tuples point_coords = [(coord["lon"], coord["lat"]) for coord in row["geometry"]] # Ensure closure by repeating the first coordinate as the last coordinate if point_coords[0] != point_coords[-1]: point_coords.append(point_coords[0]) # Create Polygon object polygon = Polygon(point_coords) return polygon def _find_closest_polygon(gdf, point): """ Find the closest polygon in a GeoDataFrame to a given point. Parameters: gdf (GeoDataFrame): A GeoDataFrame containing polygons. point (Point): A Point object representing the target point. Returns: int: The index of the closest polygon in the GeoDataFrame. """ # Compute the distance to each polygon gdf["distance"] = gdf["geometry"].apply(lambda geom: point.distance(geom)) # Find the index of the closest polygon closest_idx = gdf["distance"].idxmin() # Get the closest polygon's row closest_polygon = gdf.loc[closest_idx] return closest_idx def _clean_voltage(column): """ Function to clean the raw voltage column: manual fixing and drop nan values Args: - column: pandas Series, the column to be cleaned Returns: - column: pandas Series, the cleaned column """ logger.info("Cleaning voltages.") column = column.copy() column = ( column.astype(str) .str.lower() .str.replace("400/220/110 kV'", "400000;220000;110000") .str.replace("400/220/110/20_kv", "400000;220000;110000;20000") .str.replace("2x25000", "25000;25000") .str.replace("é", ";") ) column = ( column.astype(str) .str.lower() .str.replace("(temp 150000)", "") .str.replace("low", "1000") .str.replace("minor", "1000") .str.replace("medium", "33000") .str.replace("med", "33000") .str.replace("m", "33000") .str.replace("high", "150000") .str.replace("23000-109000", "109000") .str.replace("380000>220000", "380000;220000") .str.replace(":", ";") .str.replace("<", ";") .str.replace(",", ";") .str.replace("kv", "000") .str.replace("kva", "000") .str.replace("/", ";") .str.replace("nan", "") .str.replace("", "") ) # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;]", "", str(x))) column.dropna(inplace=True) return column def _clean_circuits(column): """ Function to clean the raw circuits column: manual fixing and drop nan values Args: - column: pandas Series, the column to be cleaned Returns: - column: pandas Series, the cleaned column """ logger.info("Cleaning circuits.") column = column.copy() column = ( column.astype(str) .str.replace("partial", "") .str.replace("1operator=RTE operator:wikidata=Q2178795", "") .str.lower() .str.replace("1,5", "3") .str.replace("1/3", "1") .str.replace("", "") .str.replace("nan", "") ) # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) column.dropna(inplace=True) return column.astype(str) def _clean_cables(column): """ Function to clean the raw cables column: manual fixing and drop nan values Args: - column: pandas Series, the column to be cleaned Returns: - column: pandas Series, the cleaned column """ logger.info("Cleaning cables.") column = column.copy() column = ( column.astype(str) .str.lower() .str.replace("1/3", "1") .str.replace("3x2;2", "3") .str.replace("", "") .str.replace("nan", "") ) # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) column.dropna(inplace=True) return column.astype(str) def _clean_wires(column): """ Function to clean the raw wires column: manual fixing and drop nan values Args: - column: pandas Series, the column to be cleaned Returns: - column: pandas Series, the cleaned column """ logger.info("Cleaning wires.") column = column.copy() column = ( column.astype(str) .str.lower() .str.replace("?", "") .str.replace("trzyprzewodowe", "3") .str.replace("pojedyńcze", "1") .str.replace("single", "1") .str.replace("double", "2") .str.replace("triple", "3") .str.replace("quad", "4") .str.replace("fivefold", "5") .str.replace("yes", "3") .str.replace("1/3", "1") .str.replace("3x2;2", "3") .str.replace("_", "") .str.replace("", "") .str.replace("nan", "") ) # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) column.dropna(inplace=True) return column.astype(str) def _check_voltage(voltage, list_voltages): """ Check if the given voltage is present in the list of allowed voltages. Parameters: voltage (str): The voltage to check. list_voltages (list): A list of allowed voltages. Returns: bool: True if the voltage is present in the list of allowed voltages, False otherwise. """ voltages = voltage.split(";") for v in voltages: if v in list_voltages: return True return False def _clean_frequency(column): """ Function to clean the raw frequency column: manual fixing and drop nan values Args: - column: pandas Series, the column to be cleaned Returns: - column: pandas Series, the cleaned column """ logger.info("Cleaning frequencies.") column = column.copy() column = ( column.astype(str) .str.lower() .str.replace("16.67", "16.7") .str.replace("16,7", "16.7") .str.replace("?", "") .str.replace("hz", "") .str.replace(" ", "") .str.replace("", "") .str.replace("nan", "") ) # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;.]", "", x)) column.dropna(inplace=True) return column.astype(str) def _clean_rating(column): """ Function to clean and sum the rating columns: Args: - column: pandas Series, the column to be cleaned Returns: - column: pandas Series, the cleaned column """ logger.info("Cleaning ratings.") column = column.copy() column = column.astype(str).str.replace("MW", "") # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) # Sum up all ratings if there are multiple entries column = column.str.split(";").apply(lambda x: sum([int(i) for i in x])) column.dropna(inplace=True) return column.astype(str) def _split_cells(df, cols=["voltage"]): """ Split semicolon separated cells i.e. [66000;220000] and create new identical rows. Parameters ---------- df : dataframe Dataframe under analysis cols : list List of target columns over which to perform the analysis Example ------- Original data: row 1: '66000;220000', '50' After applying split_cells(): row 1, '66000', '50', 2 row 2, '220000', '50', 2 """ if df.empty: return df # Create a dictionary to store the suffix count for each original ID suffix_counts = {} # Create a dictionary to store the number of splits associated with each # original ID num_splits = {} # Split cells and create new rows x = df.assign(**{col: df[col].str.split(";") for col in cols}) x = x.explode(cols, ignore_index=True) # Count the number of splits associated with each original ID num_splits = x.groupby("id").size().to_dict() # Update the 'split_elements' column x["split_elements"] = x["id"].map(num_splits) # Function to generate the new ID with suffix and update the number of # splits def generate_new_id(row): original_id = row["id"] if row["split_elements"] == 1: return original_id else: suffix_counts[original_id] = suffix_counts.get(original_id, 0) + 1 return f"{original_id}-{suffix_counts[original_id]}" # Update the ID column with the new IDs x["id"] = x.apply(generate_new_id, axis=1) return x def _distribute_to_circuits(row): """ Distributes the number of circuits or cables to individual circuits based on the given row data. Parameters: - row: A dictionary representing a row of data containing information about circuits and cables. Returns: - single_circuit: The number of circuits to be assigned to each individual circuit. """ if row["circuits"] != "": circuits = int(row["circuits"]) else: cables = int(row["cables"]) circuits = cables / 3 single_circuit = int(max(1, np.floor_divide(circuits, row["split_elements"]))) single_circuit = str(single_circuit) return single_circuit def _add_line_endings_to_substations( df_substations, gdf_lines, path_country_shapes, path_offshore_shapes, prefix, ): """ Add line endings to substations. This function takes two pandas DataFrames, `substations` and `lines`, and adds line endings to the substations based on the information from the lines DataFrame. Parameters: - substations (pandas DataFrame): DataFrame containing information about substations. - lines (pandas DataFrame): DataFrame containing information about lines. Returns: - buses (pandas DataFrame): DataFrame containing the updated information about substations with line endings. """ if gdf_lines.empty: return df_substations logger.info("Adding line endings to substations") # extract columns from df_substations bus_s = pd.DataFrame(columns=df_substations.columns) bus_e = pd.DataFrame(columns=df_substations.columns) # TODO pypsa-eur: fix country code to contain single country code # Read information from gdf_lines bus_s[["voltage", "country"]] = gdf_lines[["voltage", "country"]] bus_s.loc[:, "geometry"] = gdf_lines.geometry.boundary.map( lambda p: p.geoms[0] if len(p.geoms) >= 2 else None ) bus_s.loc[:, "lon"] = bus_s["geometry"].map(lambda p: p.x if p != None else None) bus_s.loc[:, "lat"] = bus_s["geometry"].map(lambda p: p.y if p != None else None) bus_s.loc[:, "dc"] = gdf_lines["dc"] bus_e[["voltage", "country"]] = gdf_lines[["voltage", "country"]] bus_e.loc[:, "geometry"] = gdf_lines.geometry.boundary.map( lambda p: p.geoms[1] if len(p.geoms) >= 2 else None ) bus_e.loc[:, "lon"] = bus_e["geometry"].map(lambda p: p.x if p != None else None) bus_e.loc[:, "lat"] = bus_e["geometry"].map(lambda p: p.y if p != None else None) bus_e.loc[:, "dc"] = gdf_lines["dc"] bus_all = pd.concat([bus_s, bus_e], ignore_index=True) # Group gdf_substations by voltage and and geometry (dropping duplicates) bus_all = bus_all.groupby(["voltage", "lon", "lat", "dc"]).first().reset_index() bus_all = bus_all[df_substations.columns] bus_all.loc[:, "bus_id"] = bus_all.apply( lambda row: f"{prefix}/{row.name + 1}", axis=1 ) # Initialize default values bus_all["station_id"] = None # Assuming substations completed for installed lines bus_all["under_construction"] = False bus_all["tag_area"] = None bus_all["symbol"] = "substation" # TODO: this tag may be improved, maybe depending on voltage levels bus_all["tag_substation"] = "transmission" bus_all["tag_source"] = prefix buses = pd.concat([df_substations, bus_all], ignore_index=True) buses.set_index("bus_id", inplace=True) # Fix country codes # TODO pypsa-eur: Temporary solution as long as the shapes have a low, # incomplete resolution (cf. 2500 meters for buffering) bool_multiple_countries = buses["country"].str.contains(";") gdf_offshore = gpd.read_file(path_offshore_shapes).set_index("name")["geometry"] gdf_offshore = gpd.GeoDataFrame( gdf_offshore, geometry=gdf_offshore, crs=gdf_offshore.crs ) gdf_countries = gpd.read_file(path_country_shapes).set_index("name")["geometry"] # reproject to enable buffer gdf_countries = gpd.GeoDataFrame(geometry=gdf_countries, crs=gdf_countries.crs) gdf_union = gdf_countries.merge( gdf_offshore, how="outer", left_index=True, right_index=True ) gdf_union["geometry"] = gdf_union.apply( lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]).union_all(), axis=1, ) gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs=crs) gdf_buses_tofix = gpd.GeoDataFrame( buses[bool_multiple_countries], geometry="geometry", crs=crs ) joined = gpd.sjoin( gdf_buses_tofix, gdf_union.reset_index(), how="left", predicate="within" ) # For all remaining rows where the country/index_right column is NaN, find # find the closest polygon index joined.loc[joined["name"].isna(), "name"] = joined.loc[ joined["name"].isna(), "geometry" ].apply(lambda x: _find_closest_polygon(gdf_union, x)) joined.reset_index(inplace=True) joined = joined.drop_duplicates(subset="bus_id") joined.set_index("bus_id", inplace=True) buses.loc[bool_multiple_countries, "country"] = joined.loc[ bool_multiple_countries, "name" ] return buses.reset_index() def _import_lines_and_cables(path_lines): """ Import lines and cables from the given input paths. Parameters: - path_lines (dict): A dictionary containing the input paths for lines and cables data. Returns: - df_lines (DataFrame): A DataFrame containing the imported lines and cables data. """ columns = [ "id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", "wires", ] df_lines = pd.DataFrame(columns=columns) logger.info("Importing lines and cables") for key in path_lines: logger.info(f"Processing {key}...") for idx, ip in enumerate(path_lines[key]): if ( os.path.exists(ip) and os.path.getsize(ip) > 400 ): # unpopulated OSM json is about 51 bytes country = os.path.basename(os.path.dirname(path_lines[key][idx])) logger.info( f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_lines[key])).zfill(2)}: {ip}" ) with open(ip, "r") as f: data = json.load(f) df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) df["country"] = country col_tags = [ "power", "cables", "circuits", "frequency", "voltage", "wires", ] tags = pd.json_normalize(df["tags"]).map( lambda x: str(x) if pd.notnull(x) else x ) for ct in col_tags: if ct not in tags.columns: tags[ct] = pd.NA tags = tags.loc[:, col_tags] df = pd.concat([df, tags], axis="columns") df.drop(columns=["type", "tags"], inplace=True) df_lines = pd.concat([df_lines, df], axis="rows") else: logger.info( f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_lines[key])).zfill(2)} (empty): {ip}" ) continue logger.info("---") return df_lines def _import_links(path_links): """ Import links from the given input paths. Parameters: - path_links (dict): A dictionary containing the input paths for links. Returns: - df_links (DataFrame): A DataFrame containing the imported links data. """ columns = [ "id", "bounds", "nodes", "geometry", "country", "circuits", "frequency", "rating", "voltage", ] df_links = pd.DataFrame(columns=columns) logger.info("Importing links") for key in path_links: logger.info(f"Processing {key}...") for idx, ip in enumerate(path_links[key]): if ( os.path.exists(ip) and os.path.getsize(ip) > 400 ): # unpopulated OSM json is about 51 bytes country = os.path.basename(os.path.dirname(path_links[key][idx])) logger.info( f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_links[key])).zfill(2)}: {ip}" ) with open(ip, "r") as f: data = json.load(f) df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) df["id"] = df["id"].apply(lambda x: (f"relation/{x}")) df["country"] = country col_tags = [ "circuits", "frequency", "rating", "voltage", ] tags = pd.json_normalize(df["tags"]).map( lambda x: str(x) if pd.notnull(x) else x ) for ct in col_tags: if ct not in tags.columns: tags[ct] = pd.NA tags = tags.loc[:, col_tags] df = pd.concat([df, tags], axis="columns") df.drop(columns=["type", "tags"], inplace=True) df_links = pd.concat([df_links, df], axis="rows") else: logger.info( f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_links[key])).zfill(2)} (empty): {ip}" ) continue logger.info("---") logger.info("Dropping lines without rating.") len_before = len(df_links) df_links = df_links.dropna(subset=["rating"]) len_after = len(df_links) logger.info( f"Dropped {len_before-len_after} elements without rating. " + f"Imported {len_after} elements." ) return df_links def _create_single_link(row): """ Create a single link from multiple rows within a OSM link relation. Parameters: - row: A row of OSM data containing information about the link. Returns: - single_link: A single LineString representing the link. This function takes a row of OSM data and extracts the relevant information to create a single link. It filters out elements (substations, electrodes) with invalid roles and finds the longest link based on its endpoints. If the longest link is a MultiLineString, it extracts the longest linestring from it. The resulting single link is returned. """ valid_roles = ["line", "cable"] df = pd.json_normalize(row["members"]) df = df[df["role"].isin(valid_roles)] df.loc[:, "geometry"] = df.apply(_create_linestring, axis=1) df.loc[:, "length"] = df["geometry"].apply(lambda x: x.length) list_endpoints = [] for idx, row in df.iterrows(): tuple = sorted([row["geometry"].coords[0], row["geometry"].coords[-1]]) # round tuple to 3 decimals tuple = ( round(tuple[0][0], 3), round(tuple[0][1], 3), round(tuple[1][0], 3), round(tuple[1][1], 3), ) list_endpoints.append(tuple) df.loc[:, "endpoints"] = list_endpoints df_longest = df.loc[df.groupby("endpoints")["length"].idxmin()] single_link = linemerge(df_longest["geometry"].values.tolist()) # If the longest component is a MultiLineString, extract the longest linestring from it if isinstance(single_link, MultiLineString): # Find connected components components = list(single_link.geoms) # Find the longest connected linestring single_link = max(components, key=lambda x: x.length) return single_link def _drop_duplicate_lines(df_lines): """ Drop duplicate lines from the given dataframe. Duplicates are usually lines cross-border lines or slightly outside the country border of focus. Parameters: - df_lines (pandas.DataFrame): The dataframe containing lines data. Returns: - df_lines (pandas.DataFrame): The dataframe with duplicate lines removed and cleaned data. This function drops duplicate lines from the given dataframe based on the 'id' column. It groups the duplicate rows by 'id' and aggregates the 'country' column to a string split by semicolon, as they appear in multiple country datasets. One example of the duplicates is kept, accordingly. Finally, the updated dataframe without multiple duplicates is returned. """ logger.info("Dropping duplicate lines.") duplicate_rows = df_lines[df_lines.duplicated(subset=["id"], keep=False)].copy() # Group rows by id and aggregate the country column to a string split by semicolon grouped_duplicates = ( duplicate_rows.groupby("id")["country"].agg(lambda x: ";".join(x)).reset_index() ) duplicate_rows.drop_duplicates(subset="id", inplace=True) duplicate_rows.drop(columns=["country"], inplace=True) duplicate_rows = duplicate_rows.join( grouped_duplicates.set_index("id"), on="id", how="left" ) len_before = len(df_lines) # Drop duplicates and update the df_lines dataframe with the cleaned data df_lines = df_lines[~df_lines["id"].isin(duplicate_rows["id"])] df_lines = pd.concat([df_lines, duplicate_rows], axis="rows") len_after = len(df_lines) logger.info( f"Dropped {len_before - len_after} duplicate elements. " + f"Keeping {len_after} elements." ) return df_lines def _filter_by_voltage(df, min_voltage=200000): """ Filter rows in the DataFrame based on the voltage in V. Parameters: - df (pandas.DataFrame): The DataFrame containing the substations or lines data. - min_voltage (int, optional): The minimum voltage value to filter the rows. Defaults to 200000 [unit: V]. Returns: - filtered df (pandas.DataFrame): The filtered DataFrame containing the lines or substations above min_voltage. - list_voltages (list): A list of unique voltage values above min_voltage. The type of the list elements is string. """ if df.empty: return df, [] logger.info( f"Filtering dataframe by voltage. Only keeping rows above and including {min_voltage} V." ) list_voltages = df["voltage"].str.split(";").explode().unique().astype(str) # Keep numeric strings list_voltages = list_voltages[np.vectorize(str.isnumeric)(list_voltages)] list_voltages = list_voltages.astype(int) list_voltages = list_voltages[list_voltages >= int(min_voltage)] list_voltages = list_voltages.astype(str) bool_voltages = df["voltage"].apply(_check_voltage, list_voltages=list_voltages) len_before = len(df) df = df[bool_voltages] len_after = len(df) logger.info( f"Dropped {len_before - len_after} elements with voltage below {min_voltage}. " + f"Keeping {len_after} elements." ) return df, list_voltages def _clean_substations(df_substations, list_voltages): """ Clean the substation data by performing the following steps: - Split cells in the dataframe. - Filter substation data based on specified voltages. - Update the frequency values based on the split count. - Split cells in the 'frequency' column. - Set remaining invalid frequency values that are not in ['0', '50'] to '50'. Parameters: - df_substations (pandas.DataFrame): The input dataframe containing substation data. - list_voltages (list): A list of voltages above min_voltage to filter the substation data. Returns: - df_substations (pandas.DataFrame): The cleaned substation dataframe. """ df_substations = df_substations.copy() df_substations = _split_cells(df_substations) bool_voltages = df_substations["voltage"].apply( _check_voltage, list_voltages=list_voltages ) df_substations = df_substations[bool_voltages] df_substations.loc[:, "split_count"] = df_substations["id"].apply( lambda x: x.split("-")[1] if "-" in x else "0" ) df_substations.loc[:, "split_count"] = df_substations["split_count"].astype(int) bool_split = df_substations["split_elements"] > 1 bool_frequency_len = ( df_substations["frequency"].apply(lambda x: len(x.split(";"))) == df_substations["split_elements"] ) op_freq = lambda row: row["frequency"].split(";")[row["split_count"] - 1] df_substations.loc[bool_frequency_len & bool_split, "frequency"] = ( df_substations.loc[bool_frequency_len & bool_split,].apply(op_freq, axis=1) ) df_substations = _split_cells(df_substations, cols=["frequency"]) bool_invalid_frequency = df_substations["frequency"].apply( lambda x: x not in ["50", "0"] ) df_substations.loc[bool_invalid_frequency, "frequency"] = "50" return df_substations def _clean_lines(df_lines, list_voltages): """ Cleans and processes the `df_lines` DataFrame heuristically based on the information available per respective line and cable. Further checks to ensure data consistency and completeness. Parameters ---------- df_lines : pandas.DataFrame The input DataFrame containing line information with columns such as 'voltage', 'circuits', 'frequency', 'cables', 'split_elements', 'id', etc. list_voltages : list A list of unique voltage values above a certain threshold. (type: str) Returns ------- df_lines : pandas.DataFrame The cleaned DataFrame with updated columns 'circuits', 'frequency', and 'cleaned' to reflect the applied transformations. Description ----------- This function performs the following operations: - Initializes a 'cleaned' column with False, step-wise updates to True following the respective cleaning step. - Splits the voltage cells in the DataFrame at semicolons using a helper function `_split_cells`. - Filters the DataFrame to only include rows with valid voltages. - Sets circuits of remaining lines without any applicable heuristic equal to 1. The function ensures that the resulting DataFrame has consistent and complete information for further processing or analysis while maintaining the data of the original OSM data set wherever possible. """ logger.info("Cleaning lines and determining circuits.") # Initiate boolean with False, only set to true if all cleaning steps are # passed df_lines = df_lines.copy() df_lines["cleaned"] = False df_lines["voltage_original"] = df_lines["voltage"] df_lines["circuits_original"] = df_lines["circuits"] df_lines = _split_cells(df_lines) bool_voltages = df_lines["voltage"].apply( _check_voltage, list_voltages=list_voltages ) df_lines = df_lines[bool_voltages] bool_ac = df_lines["frequency"] != "0" bool_dc = ~bool_ac valid_frequency = ["50", "0"] bool_invalid_frequency = df_lines["frequency"].apply( lambda x: x not in valid_frequency ) bool_noinfo = (df_lines["cables"] == "") & (df_lines["circuits"] == "") # Fill in all values where cables info and circuits does not exist. Assuming 1 circuit df_lines.loc[bool_noinfo, "circuits"] = "1" df_lines.loc[bool_noinfo & bool_invalid_frequency, "frequency"] = "50" df_lines.loc[bool_noinfo, "cleaned"] = True # Fill in all values where cables info exists and split_elements == 1 bool_cables_ac = ( (df_lines["cables"] != "") & (df_lines["split_elements"] == 1) & (df_lines["cables"] != "0") & (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & (df_lines["circuits"] == "") & (df_lines["cleaned"] == False) & bool_ac ) df_lines.loc[bool_cables_ac, "circuits"] = df_lines.loc[ bool_cables_ac, "cables" ].apply(lambda x: str(int(max(1, np.floor_divide(int(x), 3))))) df_lines.loc[bool_cables_ac, "frequency"] = "50" df_lines.loc[bool_cables_ac, "cleaned"] = True bool_cables_dc = ( (df_lines["cables"] != "") & (df_lines["split_elements"] == 1) & (df_lines["cables"] != "0") & (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & (df_lines["circuits"] == "") & (df_lines["cleaned"] == False) & bool_dc ) df_lines.loc[bool_cables_dc, "circuits"] = df_lines.loc[ bool_cables_dc, "cables" ].apply(lambda x: str(int(max(1, np.floor_divide(int(x), 2))))) df_lines.loc[bool_cables_dc, "frequency"] = "0" df_lines.loc[bool_cables_dc, "cleaned"] = True # Fill in all values where circuits info exists and split_elements == 1 bool_lines = ( (df_lines["circuits"] != "") & (df_lines["split_elements"] == 1) & (df_lines["circuits"] != "0") & (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & (df_lines["cleaned"] == False) ) df_lines.loc[bool_lines & bool_ac, "frequency"] = "50" df_lines.loc[bool_lines & bool_dc, "frequency"] = "0" df_lines.loc[bool_lines, "cleaned"] = True # Clean those values where number of voltages split by semicolon is larger # than no cables or no circuits bool_cables = ( (df_lines["voltage_original"].apply(lambda x: len(x.split(";")) > 1)) & (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & (df_lines["cleaned"] == False) ) df_lines.loc[bool_cables, "circuits"] = df_lines[bool_cables].apply( _distribute_to_circuits, axis=1 ) df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True # Clean those values where multiple circuit values are present, divided by # semicolon has_multiple_circuits = df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1) circuits_match_split_elements = df_lines.apply( lambda row: len(row["circuits"].split(";")) == row["split_elements"], axis=1, ) is_not_cleaned = df_lines["cleaned"] == False bool_cables = has_multiple_circuits & circuits_match_split_elements & is_not_cleaned df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables].apply( lambda row: str(row["circuits"].split(";")[int(row["id"].split("-")[-1]) - 1]), axis=1, ) df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True # Clean those values where multiple cables values are present, divided by # semicolon has_multiple_cables = df_lines["cables"].apply(lambda x: len(x.split(";")) > 1) cables_match_split_elements = df_lines.apply( lambda row: len(row["cables"].split(";")) == row["split_elements"], axis=1, ) is_not_cleaned = df_lines["cleaned"] == False bool_cables = has_multiple_cables & cables_match_split_elements & is_not_cleaned df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables].apply( lambda row: str( max( 1, np.floor_divide( int(row["cables"].split(";")[int(row["id"].split("-")[-1]) - 1]), 3 ), ) ), axis=1, ) df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True # All remaining lines to circuits == 1 bool_leftover = df_lines["cleaned"] == False if sum(bool_leftover) > 0: str_id = "; ".join(str(id) for id in df_lines.loc[bool_leftover, "id"]) logger.info(f"Setting circuits of remaining {sum(bool_leftover)} lines to 1...") logger.info(f"Lines affected: {str_id}") df_lines.loc[bool_leftover, "circuits"] = "1" df_lines.loc[bool_leftover & bool_ac, "frequency"] = "50" df_lines.loc[bool_leftover & bool_dc, "frequency"] = "0" df_lines.loc[bool_leftover, "cleaned"] = True return df_lines def _create_substations_geometry(df_substations): """ Creates geometries. Parameters: df_substations (DataFrame): The input DataFrame containing the substations data. Returns: df_substations (DataFrame): A new DataFrame with the polygons ["polygon"] of the substations geometries. """ logger.info("Creating substations geometry.") df_substations = df_substations.copy() # Create centroids from geometries and keep the original polygons df_substations.loc[:, "polygon"] = df_substations["geometry"] return df_substations def _create_substations_centroid(df_substations): """ Creates centroids from geometries and keeps the original polygons. Parameters: df_substations (DataFrame): The input DataFrame containing the substations data. Returns: df_substations (DataFrame): A new DataFrame with the centroids ["geometry"] and polygons ["polygon"] of the substations geometries. """ logger.info("Creating substations geometry.") df_substations = df_substations.copy() df_substations.loc[:, "geometry"] = df_substations["polygon"].apply( lambda x: x.centroid ) df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) return df_substations def _create_lines_geometry(df_lines): """ Create line geometry for the given DataFrame of lines. Parameters: - df_lines (pandas.DataFrame): DataFrame containing lines data. Returns: - df_lines (pandas.DataFrame): DataFrame with transformed 'geometry' column (type: shapely LineString). Notes: - This function transforms 'geometry' column in the input DataFrame by applying the '_create_linestring' function to each row. - It then drops rows where the geometry has equal start and end points, as these are usually not lines but outlines of areas. """ logger.info("Creating lines geometry.") df_lines = df_lines.copy() df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1) bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) df_lines = df_lines[~bool_circle] return df_lines def _add_bus_centroid_to_line(linestring, point): """ Adds the centroid of a substation to a linestring by extending the linestring with a new segment. Parameters: linestring (LineString): The original linestring to extend. point (Point): The centroid of the bus. Returns: merged (LineString): The extended linestring with the new segment. """ start = linestring.coords[0] end = linestring.coords[-1] dist_to_start = point.distance(Point(start)) dist_to_end = point.distance(Point(end)) if dist_to_start < dist_to_end: new_segment = LineString([point.coords[0], start]) else: new_segment = LineString([point.coords[0], end]) merged = linemerge([linestring, new_segment]) return merged def _finalise_substations(df_substations): """ Finalises the substations column types. Args: df_substations (pandas.DataFrame): The input DataFrame containing substations data. Returns: df_substations (pandas.DataFrame(): The DataFrame with finalised column types and transformed data. """ logger.info("Finalising substations column types.") df_substations = df_substations.copy() # rename columns df_substations.rename( columns={ "id": "bus_id", "power": "symbol", "substation": "tag_substation", }, inplace=True, ) # Initiate new columns for subsequent build_osm_network step df_substations.loc[:, "symbol"] = "substation" df_substations.loc[:, "tag_substation"] = "transmission" df_substations.loc[:, "dc"] = False df_substations.loc[df_substations["frequency"] == "0", "dc"] = True df_substations.loc[:, "under_construction"] = False df_substations.loc[:, "station_id"] = None df_substations.loc[:, "tag_area"] = None df_substations.loc[:, "tag_source"] = df_substations["bus_id"] # Only included needed columns df_substations = df_substations[ [ "bus_id", "symbol", "tag_substation", "voltage", "lon", "lat", "dc", "under_construction", "station_id", "tag_area", "country", "geometry", "polygon", "tag_source", ] ] # Substation data types df_substations["voltage"] = df_substations["voltage"].astype(int) return df_substations def _finalise_lines(df_lines): """ Finalises the lines column types. Args: df_lines (pandas.DataFrame): The input DataFrame containing lines data. Returns: df_lines (pandas.DataFrame(): The DataFrame with finalised column types and transformed data. """ logger.info("Finalising lines column types.") df_lines = df_lines.copy() # Rename columns df_lines.rename( columns={ "id": "line_id", "power": "tag_type", "frequency": "tag_frequency", }, inplace=True, ) # Initiate new columns for subsequent build_osm_network step df_lines.loc[:, "bus0"] = None df_lines.loc[:, "bus1"] = None df_lines.loc[:, "length"] = None df_lines.loc[:, "underground"] = False df_lines.loc[df_lines["tag_type"] == "line", "underground"] = False df_lines.loc[df_lines["tag_type"] == "cable", "underground"] = True df_lines.loc[:, "under_construction"] = False df_lines.loc[:, "dc"] = False df_lines.loc[df_lines["tag_frequency"] == "50", "dc"] = False df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True # Only include needed columns df_lines = df_lines[ [ "line_id", "circuits", "tag_type", "voltage", "tag_frequency", "bus0", "bus1", "length", "underground", "under_construction", "dc", "country", "geometry", ] ] df_lines["circuits"] = df_lines["circuits"].astype(int) df_lines["voltage"] = df_lines["voltage"].astype(int) df_lines["tag_frequency"] = df_lines["tag_frequency"].astype(int) return df_lines def _finalise_links(df_links): """ Finalises the links column types. Args: df_links (pandas.DataFrame): The input DataFrame containing links data. Returns: df_links (pandas.DataFrame(): The DataFrame with finalised column types and transformed data. """ logger.info("Finalising links column types.") df_links = df_links.copy() # Rename columns df_links.rename( columns={ "id": "link_id", "rating": "p_nom", }, inplace=True, ) # Initiate new columns for subsequent build_osm_network step df_links["bus0"] = None df_links["bus1"] = None df_links["length"] = None df_links["underground"] = True df_links["under_construction"] = False df_links["dc"] = True # Only include needed columns df_links = df_links[ [ "link_id", "voltage", "p_nom", "bus0", "bus1", "length", "underground", "under_construction", "dc", "country", "geometry", ] ] df_links["p_nom"] = df_links["p_nom"].astype(int) df_links["voltage"] = df_links["voltage"].astype(int) return df_links def _import_substations(path_substations): """ Import substations from the given input paths. This function imports both substations from OSM ways as well as relations that contain nested information on the substations shape and electrical parameters. Ways and relations are subsequently concatenated to form a single DataFrame containing unique bus ids. Args: path_substations (dict): A dictionary containing input paths for substations. Returns: pd.DataFrame: A DataFrame containing the imported substations data. """ cols_substations_way = [ "id", "geometry", "country", "power", "substation", "voltage", "frequency", ] cols_substations_relation = [ "id", "country", "power", "substation", "voltage", "frequency", ] df_substations_way = pd.DataFrame(columns=cols_substations_way) df_substations_relation = pd.DataFrame(columns=cols_substations_relation) logger.info("Importing substations") for key in path_substations: logger.info(f"Processing {key}...") for idx, ip in enumerate(path_substations[key]): if ( os.path.exists(ip) and os.path.getsize(ip) > 400 ): # unpopulated OSM json is about 51 bytes country = os.path.basename(os.path.dirname(path_substations[key][idx])) logger.info( f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_substations[key])).zfill(2)}: {ip}" ) with open(ip, "r") as f: data = json.load(f) df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) # new string that adds "way/" to id df["id"] = df["id"].apply( lambda x: ( f"way/{x}" if key == "substations_way" else f"relation/{x}" ) ) df["country"] = country col_tags = ["power", "substation", "voltage", "frequency"] tags = pd.json_normalize(df["tags"]).map( lambda x: str(x) if pd.notnull(x) else x ) for ct in col_tags: if ct not in tags.columns: tags[ct] = pd.NA tags = tags.loc[:, col_tags] df = pd.concat([df, tags], axis="columns") if key == "substations_way": df.drop(columns=["type", "tags", "bounds", "nodes"], inplace=True) df_substations_way = pd.concat( [df_substations_way, df], axis="rows" ) elif key == "substations_relation": df.drop(columns=["type", "tags", "bounds"], inplace=True) df_substations_relation = pd.concat( [df_substations_relation, df], axis="rows" ) else: logger.info( f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_substations[key])).zfill(2)} (empty): {ip}" ) continue logger.info("---") df_substations_way.drop_duplicates(subset="id", keep="first", inplace=True) df_substations_relation.drop_duplicates(subset="id", keep="first", inplace=True) df_substations_way["geometry"] = df_substations_way.apply(_create_polygon, axis=1) # Normalise the members column of df_substations_relation cols_members = ["id", "type", "ref", "role", "geometry"] df_substations_relation_members = pd.DataFrame(columns=cols_members) for index, row in df_substations_relation.iterrows(): col_members = ["type", "ref", "role", "geometry"] df = pd.json_normalize(row["members"]) for cm in col_members: if cm not in df.columns: df[cm] = pd.NA df = df.loc[:, col_members] df["id"] = str(row["id"]) df["ref"] = df["ref"].astype(str) df = df[df["type"] != "node"] df = df.dropna(subset=["geometry"]) df = df[~df["role"].isin(["", "incoming_line", "substation", "inner"])] df_substations_relation_members = pd.concat( [df_substations_relation_members, df], axis="rows" ) df_substations_relation_members.reset_index(inplace=True) df_substations_relation_members["linestring"] = ( df_substations_relation_members.apply(_create_linestring, axis=1) ) df_substations_relation_members_grouped = ( df_substations_relation_members.groupby("id")["linestring"] .apply(lambda x: linemerge(x.tolist())) .reset_index() ) df_substations_relation_members_grouped["geometry"] = ( df_substations_relation_members_grouped["linestring"].apply( lambda x: x.convex_hull ) ) df_substations_relation = ( df_substations_relation.join( df_substations_relation_members_grouped.set_index("id"), on="id", how="left" ) .drop(columns=["members", "linestring"]) .dropna(subset=["geometry"]) ) # reorder columns and concatenate df_substations_relation = df_substations_relation[cols_substations_way] df_substations = pd.concat( [df_substations_way, df_substations_relation], axis="rows" ) return df_substations def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): """ Removes lines that are within substation polygons from the given GeoDataFrame of lines. These are not needed to create network (e.g. bus bars, switchgear, etc.) Parameters: - gdf_lines (GeoDataFrame): A GeoDataFrame containing lines with 'line_id' and 'geometry' columns. - gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing substation polygons. Returns: GeoDataFrame: A new GeoDataFrame without lines within substation polygons. """ logger.info("Identifying and removing lines within substation polygons...") gdf = gpd.sjoin( gdf_lines[["line_id", "geometry"]], gdf_substations_polygon, how="inner", predicate="within", )["line_id"] logger.info( f"Removed {len(gdf)} lines within substations of original {len(gdf_lines)} lines." ) gdf_lines = gdf_lines[~gdf_lines["line_id"].isin(gdf)] return gdf_lines def _merge_touching_polygons(df): """ Merge touching polygons in a GeoDataFrame. Parameters: - df: pandas.DataFrame or geopandas.GeoDataFrame The input DataFrame containing the polygons to be merged. Returns: - gdf: geopandas.GeoDataFrame The GeoDataFrame with merged polygons. """ gdf = gpd.GeoDataFrame(df, geometry="polygon", crs=crs) combined_polygons = unary_union(gdf.geometry) if combined_polygons.geom_type == "MultiPolygon": gdf_combined = gpd.GeoDataFrame( geometry=[poly for poly in combined_polygons.geoms], crs=crs ) else: gdf_combined = gpd.GeoDataFrame(geometry=[combined_polygons], crs=crs) gdf.reset_index(drop=True, inplace=True) for i, combined_geom in gdf_combined.iterrows(): mask = gdf.intersects(combined_geom.geometry) gdf.loc[mask, "polygon_merged"] = combined_geom.geometry gdf.drop(columns=["polygon"], inplace=True) gdf.rename(columns={"polygon_merged": "polygon"}, inplace=True) return gdf def _add_endpoints_to_line(linestring, polygon_dict): """ Adds endpoints to a line by removing any overlapping areas with polygons. Parameters: linestring (LineString): The original line to add endpoints to. polygon_dict (dict): A dictionary of polygons, where the keys are bus IDs and the values are the corresponding polygons. Returns: LineString: The modified line with added endpoints. """ if not polygon_dict: return linestring polygon_centroids = { bus_id: polygon.centroid for bus_id, polygon in polygon_dict.items() } polygon_unary = polygons = unary_union(list(polygon_dict.values())) # difference with polygon linestring_new = linestring.difference(polygon_unary) if type(linestring_new) == MultiLineString: # keep the longest line in the multilinestring linestring_new = max(linestring_new.geoms, key=lambda x: x.length) for p in polygon_centroids: linestring_new = _add_bus_centroid_to_line(linestring_new, polygon_centroids[p]) return linestring_new def _get_polygons_at_endpoints(linestring, polygon_dict): """ Get the polygons that contain the endpoints of a given linestring. Parameters: linestring (LineString): The linestring for which to find the polygons at the endpoints. polygon_dict (dict): A dictionary containing polygons as values, with bus_ids as keys. Returns: dict: A dictionary containing bus_ids as keys and polygons as values, where the polygons contain the endpoints of the linestring. """ # Get the endpoints of the linestring start_point = Point(linestring.coords[0]) end_point = Point(linestring.coords[-1]) # Initialize dictionary to store bus_ids as keys and polygons as values bus_id_polygon_dict = {} for bus_id, polygon in polygon_dict.items(): if polygon.contains(start_point) or polygon.contains(end_point): bus_id_polygon_dict[bus_id] = polygon return bus_id_polygon_dict def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): """ Extends the lines in the given GeoDataFrame `gdf_lines` to the centroid of the nearest substations represented by the polygons in the `gdf_substations_polygon` GeoDataFrame. Parameters: gdf_lines (GeoDataFrame): A GeoDataFrame containing the lines to be extended. gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing the polygons representing substations. Returns: GeoDataFrame: A new GeoDataFrame with the lines extended to the substations. """ gdf = gpd.sjoin( gdf_lines, gdf_substations_polygon.drop_duplicates(subset="polygon", inplace=False), how="left", lsuffix="line", rsuffix="bus", predicate="intersects", ).drop(columns="index_bus") # Group by 'line_id' and create a dictionary mapping 'bus_id' to 'geometry_bus', excluding the grouping columns gdf = ( gdf.groupby("line_id") .apply( lambda x: x[["bus_id", "geometry_bus"]] .dropna() .set_index("bus_id")["geometry_bus"] .to_dict(), include_groups=False, ) .reset_index() ) gdf.columns = ["line_id", "bus_dict"] gdf["intersects_bus"] = gdf.apply(lambda row: len(row["bus_dict"]) > 0, axis=1) gdf.loc[:, "line_geometry"] = gdf.join( gdf_lines.set_index("line_id")["geometry"], on="line_id" )["geometry"] # Polygons at the endpoints of the linestring gdf["bus_endpoints"] = gdf.apply( lambda row: _get_polygons_at_endpoints(row["line_geometry"], row["bus_dict"]), axis=1, ) gdf.loc[:, "line_geometry_new"] = gdf.apply( lambda row: _add_endpoints_to_line(row["line_geometry"], row["bus_endpoints"]), axis=1, ) gdf.set_index("line_id", inplace=True) gdf_lines.set_index("line_id", inplace=True) gdf_lines.loc[:, "geometry"] = gdf["line_geometry_new"] return gdf_lines # Function to bridge gaps between all lines if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake snakemake = mock_snakemake("clean_osm_data") configure_logging(snakemake) set_scenario_config(snakemake) # Parameters crs = "EPSG:4326" # Correct crs for OSM data min_voltage_ac = 200000 # [unit: V] Minimum voltage value to filter AC lines. min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. logger.info("---") logger.info("SUBSTATIONS") # Input path_substations = { "substations_way": snakemake.input.substations_way, "substations_relation": snakemake.input.substations_relation, } # Cleaning process df_substations = _import_substations(path_substations) df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) df_substations, list_voltages = _filter_by_voltage( df_substations, min_voltage=min_voltage_ac ) df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) df_substations = _clean_substations(df_substations, list_voltages) df_substations = _create_substations_geometry(df_substations) # Merge touching polygons df_substations = _merge_touching_polygons(df_substations) df_substations = _create_substations_centroid(df_substations) df_substations = _finalise_substations(df_substations) # Create polygon GeoDataFrame to remove lines within substations gdf_substations_polygon = gpd.GeoDataFrame( df_substations[["bus_id", "polygon", "voltage"]], geometry="polygon", crs=crs, ) gdf_substations_polygon["geometry"] = gdf_substations_polygon.polygon.copy() logger.info("---") logger.info("LINES AND CABLES") path_lines = { "lines": snakemake.input.lines_way, "cables": snakemake.input.cables_way, } # Cleaning process df_lines = _import_lines_and_cables(path_lines) df_lines = _drop_duplicate_lines(df_lines) df_lines.loc[:, "voltage"] = _clean_voltage(df_lines["voltage"]) df_lines, list_voltages = _filter_by_voltage(df_lines, min_voltage=min_voltage_ac) df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) df_lines = _clean_lines(df_lines, list_voltages) # Drop DC lines, will be added through relations later len_before = len(df_lines) df_lines = df_lines[df_lines["frequency"] == "50"] len_after = len(df_lines) logger.info( f"Dropped {len_before - len_after} DC lines. Keeping {len_after} AC lines." ) df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) # Create GeoDataFrame gdf_lines = gpd.GeoDataFrame(df_lines, geometry="geometry", crs=crs) gdf_lines = _remove_lines_within_substations(gdf_lines, gdf_substations_polygon) gdf_lines = _extend_lines_to_substations(gdf_lines, gdf_substations_polygon) logger.info("---") logger.info("HVDC LINKS") path_links = { "links": snakemake.input.links_relation, } df_links = _import_links(path_links) df_links = _drop_duplicate_lines(df_links) df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) df_links, list_voltages = _filter_by_voltage(df_links, min_voltage=min_voltage_dc) # Keep only highest voltage of split string df_links.loc[:, "voltage"] = df_links["voltage"].apply( lambda x: str(max(map(int, x.split(";")))) ) df_links.loc[:, "frequency"] = _clean_frequency(df_links["frequency"]) df_links.loc[:, "rating"] = _clean_rating(df_links["rating"]) df_links.loc[:, "geometry"] = df_links.apply(_create_single_link, axis=1) df_links = _finalise_links(df_links) gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs).set_index( "link_id" ) # Add line endings to substations path_country_shapes = snakemake.input.country_shapes path_offshore_shapes = snakemake.input.offshore_shapes df_substations = _add_line_endings_to_substations( df_substations, gdf_lines, path_country_shapes, path_offshore_shapes, prefix="line-end", ) df_substations = _add_line_endings_to_substations( df_substations, gdf_links, path_country_shapes, path_offshore_shapes, prefix="link-end", ) # Drop polygons and create GDF gdf_substations = gpd.GeoDataFrame( df_substations.drop(columns=["polygon"]), geometry="geometry", crs=crs ) output_substations_polygon = snakemake.output["substations_polygon"] output_substations = snakemake.output["substations"] output_lines = snakemake.output["lines"] output_links = snakemake.output["links"] logger.info( f"Exporting clean substations with polygon shapes to {output_substations_polygon}" ) gdf_substations_polygon.drop(columns=["geometry"]).to_file( output_substations_polygon, driver="GeoJSON" ) logger.info(f"Exporting clean substations to {output_substations}") gdf_substations.to_file(output_substations, driver="GeoJSON") logger.info(f"Exporting clean lines to {output_lines}") gdf_lines.to_file(output_lines, driver="GeoJSON") logger.info(f"Exporting clean links to {output_links}") gdf_links.to_file(output_links, driver="GeoJSON") logger.info("Cleaning OSM data completed.")