From 4cd964b3bdd84a7b2a60ebbfbcf24c5c7ceaf0bc Mon Sep 17 00:00:00 2001
From: Fabian Neumann <fabian.neumann@outlook.de>
Date: Sat, 13 Nov 2021 16:48:08 +0100
Subject: [PATCH] gas_network: use IGGIELGN scigrid dataset

---
 Snakefile                         |   8 +-
 scripts/build_gas_network.py      | 149 ++++++++++++++----------------
 scripts/cluster_gas_network.py    |   4 +-
 scripts/prepare_sector_network.py |   4 +-
 4 files changed, 78 insertions(+), 87 deletions(-)

diff --git a/Snakefile b/Snakefile
index 68b281c4..ce5be51d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -85,20 +85,24 @@ if config["sector"]["gas_network"]:
         "IGGIELGN_LNGs.geojson",
         "IGGIELGN_BorderPoints.geojson",
         "IGGIELGN_Productions.geojson",
+        "IGGIELGN_PipeSegments.geojson",
     ]
 
+
     rule retrieve_gas_infrastructure_data:
         output: expand("data/gas_network/scigrid-gas/data/{files}", files=datafiles)
         script: 'scripts/retrieve_gas_infrastructure_data.py'
 
+
     rule build_gas_network:
         input:
-            gas_network="data/gas_network/gas_network_dataset.csv"
+            gas_network="data/gas_network/scigrid-gas/data/IGGIELGN_PipeSegments.geojson"
         output:
             cleaned_gas_network="resources/gas_network.csv"
         resources: mem_mb=4000
         script: "scripts/build_gas_network.py"
 
+
     rule build_gas_input_locations:
         input:
             lng="data/gas_network/scigrid-gas/data/IGGIELGN_LNGs.geojson",
@@ -112,6 +116,7 @@ if config["sector"]["gas_network"]:
         resources: mem_mb=2000,
         script: "scripts/build_gas_input_locations.py"
 
+
     rule cluster_gas_network:
         input:
             cleaned_gas_network="resources/gas_network.csv",
@@ -122,6 +127,7 @@ if config["sector"]["gas_network"]:
         resources: mem_mb=4000
         script: "scripts/cluster_gas_network.py"
 
+
     gas_infrastructure = {**rules.cluster_gas_network.output, **rules.build_gas_input_locations.output}
 else:
     gas_infrastructure = {}
diff --git a/scripts/build_gas_network.py b/scripts/build_gas_network.py
index cf4a86a1..3d4bcb2a 100644
--- a/scripts/build_gas_network.py
+++ b/scripts/build_gas_network.py
@@ -1,38 +1,15 @@
-"""
-Preprocess gas network based on data from:
-
-    [1] the SciGRID Gas project
-        (https://www.gas.scigrid.de/)
-
-    [2] ENTSOG capacity map
-        (https://www.entsog.eu/sites/default/files/2019-10/Capacities%20for%20Transmission%20Capacity%20Map%20RTS008_NS%20-%20DWH_final.xlsx)
-"""
+"""Preprocess gas network based on data from the SciGRID Gas project (https://www.gas.scigrid.de/)."""
 
 import logging
 logger = logging.getLogger(__name__)
 
-import re
-import json
-
 import pandas as pd
+import geopandas as gpd
 from shapely.geometry import Point
 from pypsa.geo import haversine_pts
 
 
-def string2list(string, with_none=True):
-    """Convert string format to a list."""
-
-    if with_none:
-        p2 = re.compile('None')
-        string = p2.sub('\"None\"', string)
-    else:
-        p = re.compile('(?<!\\\\)\'')
-        string = p.sub('\"', string)
-
-    return json.loads(string)
-
-
-def diameter2capacity(pipe_diameter_mm):
+def diameter_to_capacity(pipe_diameter_mm):
     """Calculate pipe capacity in MW based on diameter in mm.
 
     20 inch (500 mm)  50 bar -> 1.5   GW CH4 pipe capacity (LHV)
@@ -65,75 +42,81 @@ def diameter2capacity(pipe_diameter_mm):
         return a3 + m3 * pipe_diameter_mm
 
 
-def find_terminal_points(df):
-    
-    latlon = []
-
-    for attr in ["lat", "long"]:
-    
-        s = df[attr].apply(string2list)
-
-        s = s.apply(lambda x: [x[0], x[-1]])
-
-        latlon.append(pd.DataFrame(s.to_list(),
-            columns=[f"{attr}0", f"{attr}1"]
-        ))
-    
-    latlon = pd.concat(latlon, axis=1)
-    
-    points = latlon.apply(
-        lambda x: {
-            "point0": Point(x.long0, x.lat0),
-            "point1": Point(x.long1, x.lat1)
-        },
-        axis=1,
-        result_type='expand'
-    )
-    
-    return pd.concat([df, points], axis=1)
-
-
-def process_gas_network_data(fn):
-
-    df = pd.read_csv(fn, sep=',')
-
-    df = find_terminal_points(df)
-
-    to_drop = ["name", "source_id", "country_code", "node_id",
-               "long", "lat", "lat_mean", "long_mean", "num_compressor"]
+def load_dataset(fn):
+    df = gpd.read_file(fn)
+    param = df.param.apply(pd.Series)
+    method = df.method.apply(pd.Series)[["diameter_mm", "max_cap_M_m3_per_d"]]
+    method.columns = method.columns + "_method"
+    df = pd.concat([df, param, method], axis=1)
+    to_drop = ["param", "uncertainty", "method", "tags"]
+    to_drop = df.columns.intersection(to_drop)
     df.drop(to_drop, axis=1, inplace=True)
+    return df
 
+
+def prepare_dataset(
+    df,
+    length_factor=1.5,
+    correction_threshold_length=4,
+    correction_threshold_p_nom=8,
+    bidirectional_below=10
+):
+
+    # extract start and end from LineString
+    df["point0"] = df.geometry.apply(lambda x: Point(x.coords[0]))
+    df["point1"] = df.geometry.apply(lambda x: Point(x.coords[-1]))
+
+    conversion_factor = 437.5 # MCM/day to MWh/h
+    df["p_nom"] = df.max_cap_M_m3_per_d * conversion_factor
+
+    # for inferred diameters, assume 500 mm rather than 900 mm (more conservative)
+    df.loc[df.diameter_mm_method != 'raw', "diameter_mm"] = 500.
+
+    keep = ["name", "diameter_mm", "is_H_gas", "is_bothDirection",
+            "length_km", "p_nom", "max_pressure_bar",
+            "start_year", "point0", "point1", "geometry"]
     to_rename = {
         "is_bothDirection": "bidirectional",
+        "is_H_gas": "H_gas",
         "start_year": "build_year",
         "length_km": "length",
-        "Capacity_GWh_h": "p_nom_data",
-        "id": "tags",
     }
-    df.rename(columns=to_rename, inplace=True)
-    
+    df = df[keep].rename(columns=to_rename)
+
     df.bidirectional = df.bidirectional.astype(bool)
+    df.H_gas = df.H_gas.astype(bool)
 
-    # convert from GWh/h to MW
-    df.p_nom_data *= 1e3
+    # short lines below 10 km are assumed to be bidirectional
+    short_lines = df["length"] < bidirectional_below
+    df.loc[short_lines, "bidirectional"] = True
 
-    # for pipes with missing diameter, assume 500 mm
-    df.loc[df.diameter_mm.isna(), "diameter_mm"] = 500.
-
-    # for nord stream and small pipelines take original capacity data
-    # otherwise inferred values from pipe diameter
-    df["p_nom"] = df.diameter_mm.map(diameter2capacity)
-    df.p_nom.update(
-        df.p_nom_data.where((df.diameter_mm < 500) | (df.max_pressure_bar == 220))
-    )
+    # replace all capacities that deviate from the diameter-based capacities by
+    # more than a factor of correction_threshold_p_nom, unless they are NordStream
+    # pipelines; all capacities up to 0.5 GW are replaced by diameter-based ones
+    df["p_nom_diameter"] = df.diameter_mm.apply(diameter_to_capacity)
+    ratio = df.p_nom / df.p_nom_diameter
+    not_nordstream = df.max_pressure_bar < 220
+    df.p_nom.update(df.p_nom_diameter.where(
+        (df.p_nom <= 500) |
+        ((ratio > correction_threshold_p_nom) & not_nordstream) |
+        ((ratio < 1 / correction_threshold_p_nom) & not_nordstream)
+    ))
 
+    # lines shorter than 20 km or with way too discrepant line lengths
+    # get assigned haversine length * length factor
     df["length_haversine"] = df.apply(
-        lambda p: 1.5 * haversine_pts([p.point0.x, p.point1.y], [p.point1.x, p.point1.y]),
-        axis=1
+        lambda p: length_factor * haversine_pts(
+            [p.point0.x, p.point0.y],  # start point (x, y) of the pipe
+            [p.point1.x, p.point1.y]  # end point (x, y) of the pipe
+        ), axis=1
     )
+    ratio = df.eval("length / length_haversine")
+    df["length"].update(df.length_haversine.where(
+        (df["length"] < 20) |
+        (ratio > correction_threshold_length) |
+        (ratio < 1 / correction_threshold_length)
+    ))
 
-    df.length.update(df.length_haversine.where(df.length.isna()))
-    
     return df
 
 
@@ -145,6 +128,8 @@ if __name__ == "__main__":
 
     logging.basicConfig(level=snakemake.config['logging_level'])
 
-    gas_network = process_gas_network_data(snakemake.input.gas_network)
+    gas_network = load_dataset(snakemake.input.gas_network)
+
+    gas_network = prepare_dataset(gas_network)
 
     gas_network.to_csv(snakemake.output.cleaned_gas_network)
\ No newline at end of file
diff --git a/scripts/cluster_gas_network.py b/scripts/cluster_gas_network.py
index 9f192a92..f6d15af0 100755
--- a/scripts/cluster_gas_network.py
+++ b/scripts/cluster_gas_network.py
@@ -70,12 +70,12 @@ def aggregate_parallel_pipes(df):
         'bus0': 'first',
         'bus1': 'first',
         "p_nom": 'sum',
-        "p_nom_data": 'sum',
+        "p_nom_diameter": 'sum',
         "max_pressure_bar": "mean",
         "build_year": "mean",
         "diameter_mm": "mean",
         "length": 'mean',
-        'tags': ' '.join,
+        'name': ' '.join,
         "p_min_pu": 'min',
     }
     return df.groupby(df.index).agg(strategies)
diff --git a/scripts/prepare_sector_network.py b/scripts/prepare_sector_network.py
index 35728a8b..2f10443d 100644
--- a/scripts/prepare_sector_network.py
+++ b/scripts/prepare_sector_network.py
@@ -1115,7 +1115,7 @@ def add_storage_and_grids(n, costs):
             p_nom_min=gas_pipes.p_nom_min,
             length=gas_pipes.length,
             capital_cost=gas_pipes.capital_cost,
-            tags=gas_pipes.tags,
+            tags=gas_pipes.name,
             carrier="gas pipeline",
             lifetime=costs.at['CH4 (g) pipeline', 'lifetime']
         )
@@ -1190,7 +1190,7 @@ def add_storage_and_grids(n, costs):
             p_nom_extendable=True,
             length=h2_pipes.length,
             capital_cost=costs.at['H2 (g) pipeline repurposed', 'fixed'] * h2_pipes.length,
-            tags=h2_pipes.tags,
+            tags=h2_pipes.name,
             carrier="H2 pipeline retrofitted",
             lifetime=costs.at['H2 (g) pipeline repurposed', 'lifetime']
         )