From 9792069ab3304f64ecc6566b54eb7bab07401a2e Mon Sep 17 00:00:00 2001 From: FabianHofmann Date: Thu, 3 Dec 2020 12:49:04 +0100 Subject: [PATCH] Update load processing (#211) * build_load_data * Add documentation * updating load data import * Update Config files * Update load.csv * Update add_electricity.py * change log file name * Update scripts/add_electricity.py Co-authored-by: FabianHofmann * Update scripts/build_load_data.py Co-authored-by: FabianHofmann * Update scripts/build_load_data.py Co-authored-by: FabianHofmann * Update scripts/build_load_data.py Co-authored-by: FabianHofmann * Update build_load_data.py * Update build_load_data.py * Update scripts/build_load_data.py Co-authored-by: FabianHofmann * update gap handling in build_load_data * Update build_load_data.py * Update config.test1.yaml * update test.config * Update config.tutorial.yaml * update load csv function for load data * Update build_load_data.py * Update config.test1.yaml * Update add_electricity.py * Update build_load_data.py * Added error messages if load data contains gaps after modifications * general adjustments: - reduce data source to only entsoe statistics - sanitize code - adjust logging messages - adjust docstrings * update Snakefile config and docs * update release notes rename build_load -> build_load_data in config * small follow up * - reintroduce choice between powerstatistics and transparency - remove load_ timeseries from databundle - always build load_data - reinsert scaling factor in config - fix url to 2019 version * update doc: configtable, release notes update config.yaml * follow up Co-authored-by: Jan Frederick Co-authored-by: JanFrederickUnnewehr <50404069+JanFrederickUnnewehr@users.noreply.github.com> --- Snakefile | 10 +- config.default.yaml | 5 + config.tutorial.yaml | 5 + doc/configtables/load.csv | 7 +- doc/configuration.rst | 10 +- doc/preparation.rst | 1 + doc/preparation/build_load_data.rst | 12 ++ doc/release_notes.rst | 3 + scripts/add_electricity.py | 24 
+-- scripts/build_load_data.py | 224 ++++++++++++++++++++++++++++ test/config.test1.yaml | 5 + 11 files changed, 281 insertions(+), 25 deletions(-) create mode 100644 doc/preparation/build_load_data.rst create mode 100755 scripts/build_load_data.py diff --git a/Snakefile b/Snakefile index c043e57a..133f7843 100644 --- a/Snakefile +++ b/Snakefile @@ -53,8 +53,7 @@ datafiles = ['ch_cantons.csv', 'je-e-21.03.02.xls', 'eez/World_EEZ_v8_2014.shp', 'EIA_hydro_generation_2000_2014.csv', 'hydro_capacities.csv', 'naturalearth/ne_10m_admin_0_countries.shp', 'NUTS_2013_60M_SH/data/NUTS_RG_60M_2013.shp', 'nama_10r_3popgdp.tsv.gz', - 'nama_10r_3gdp.tsv.gz', 'time_series_60min_singleindex_filtered.csv', - 'corine/g250_clc06_V18_5.tif'] + 'nama_10r_3gdp.tsv.gz', 'corine/g250_clc06_V18_5.tif'] if not config.get('tutorial', False): datafiles.extend(["natura/Natura2000_end2015.shp", "GEBCO_2014_2D.nc"]) @@ -65,6 +64,11 @@ if config['enable'].get('retrieve_databundle', True): log: "logs/retrieve_databundle.log" script: 'scripts/retrieve_databundle.py' +rule build_load_data: + output: "resources/load.csv" + log: "logs/build_load_data.log" + script: 'scripts/build_load_data.py' + rule build_powerplants: input: base_network="networks/base.nc", @@ -204,7 +208,7 @@ rule add_electricity: powerplants='resources/powerplants.csv', hydro_capacities='data/bundle/hydro_capacities.csv', geth_hydro_capacities='data/geth2015_hydro_capacities.csv', - opsd_load='data/bundle/time_series_60min_singleindex_filtered.csv', + load='resources/load.csv', nuts3_shapes='resources/nuts3_shapes.geojson', **{'profile_' + t: "resources/profile_" + t + ".nc" for t in config['renewable']} diff --git a/config.default.yaml b/config.default.yaml index ff7a503f..98f8ed67 100755 --- a/config.default.yaml +++ b/config.default.yaml @@ -168,6 +168,11 @@ transformers: type: '' load: + url: https://data.open-power-system-data.org/time_series/2019-06-05/time_series_60min_singleindex.csv + power_statistics: True # only for 
files from <2019; set false in order to get ENTSOE transparency data + interpolate_limit: 3 # data gaps up until this size are interpolated linearly + time_shift_for_large_gaps: 1w # period used for copying time-slices in order to fill large gaps of nans + manual_adjustments: true # false scaling_factor: 1.0 costs: diff --git a/config.tutorial.yaml b/config.tutorial.yaml index a51c2202..aed8693e 100755 --- a/config.tutorial.yaml +++ b/config.tutorial.yaml @@ -146,6 +146,11 @@ transformers: type: '' load: + url: https://data.open-power-system-data.org/time_series/2019-06-05/time_series_60min_singleindex.csv + power_statistics: True # only for files from <2019; set false in order to get ENTSOE transparency data + interpolate_limit: 3 # data gaps up until this size are interpolated linearly + time_shift_for_large_gaps: 1w # period used for copying time-slices in order to fill large gaps of nans + manual_adjustments: true # false scaling_factor: 1.0 costs: diff --git a/doc/configtables/load.csv b/doc/configtables/load.csv index 035b27a1..66f3b994 100644 --- a/doc/configtables/load.csv +++ b/doc/configtables/load.csv @@ -1,2 +1,7 @@ ,Unit,Values,Description -scaling_factor,--,float,"Global correction factor for the load time series." \ No newline at end of file +url,--,string,"Link to open power system data time series data." +power_statistics,bool,"{true, false}",Whether to load the electricity consumption data of the ENTSOE power statistics (only for files from 2019 and before) or from the ENTSOE transparency data (only has load data from 2015 onwards). +interpolate_limit,hours,integer,"Maximum gap size (consecutive nans) which is interpolated linearly." +time_shift_for_large_gaps,string,string,"Periods which are used for copying time-slices in order to fill large gaps of nans. Have to be valid ``pandas`` period strings." +manual_adjustments,bool,"{true, false}","Whether to adjust the load data manually according to the function in :func:`manual_adjustment`." 
+scaling_factor,--,float,"Global correction factor for the load time series." diff --git a/doc/configuration.rst b/doc/configuration.rst index bf276c06..f0e1717b 100644 --- a/doc/configuration.rst +++ b/doc/configuration.rst @@ -218,7 +218,7 @@ Specifies the temporal range to build an energy system model for as arguments to .. literalinclude:: ../config.default.yaml :language: yaml - :lines: 170-171 + :lines: 170-174 .. csv-table:: :header-rows: 1 @@ -232,7 +232,7 @@ Specifies the temporal range to build an energy system model for as arguments to .. literalinclude:: ../config.default.yaml :language: yaml - :lines: 173-185 + :lines: 175-188 .. csv-table:: :header-rows: 1 @@ -254,7 +254,7 @@ Specifies the temporal range to build an energy system model for as arguments to .. literalinclude:: ../config.default.yaml :language: yaml - :lines: 187-197 + :lines: 190-200 .. csv-table:: :header-rows: 1 @@ -266,7 +266,7 @@ Specifies the temporal range to build an energy system model for as arguments to .. literalinclude:: ../config.default.yaml :language: yaml - :lines: 187,198-214 + :lines: 190,201-217 .. csv-table:: :header-rows: 1 @@ -280,7 +280,7 @@ Specifies the temporal range to build an energy system model for as arguments to .. literalinclude:: ../config.default.yaml :language: yaml - :lines: 216-355 + :lines: 219-358 .. csv-table:: :header-rows: 1 diff --git a/doc/preparation.rst b/doc/preparation.rst index b2749a41..9e986580 100644 --- a/doc/preparation.rst +++ b/doc/preparation.rst @@ -39,6 +39,7 @@ together into a detailed PyPSA network stored in ``networks/elec.nc``. preparation/retrieve preparation/build_shapes + preparation/build_load_data preparation/build_cutout preparation/build_natura_raster preparation/prepare_links_p_nom diff --git a/doc/preparation/build_load_data.rst b/doc/preparation/build_load_data.rst new file mode 100644 index 00000000..03535981 --- /dev/null +++ b/doc/preparation/build_load_data.rst @@ -0,0 +1,12 @@ +.. 
+ SPDX-FileCopyrightText: 2020-2021 The PyPSA-Eur Authors + + SPDX-License-Identifier: CC-BY-4.0 + +.. _load_data: + +Rule ``build_load_data`` +============================= + + +.. automodule:: build_load_data diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 7efea8be..065310c6 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -49,6 +49,9 @@ Upcoming Release * Modelling hydrogen and battery storage with Store and Link components is now the default, rather than using StorageUnit components with fixed power-to-energy ratio (`#205 `_). +* Electricity consumption data is now directly retrieved from the `OPSD website `_ using the rule ``build_load_data``. The user can decide whether to take the ENTSOE power statistics data (default) or the ENTSOE transparency data. + + PyPSA-Eur 0.2.0 (8th June 2020) ================================== diff --git a/scripts/add_electricity.py b/scripts/add_electricity.py index 936ca1b5..80904b7a 100755 --- a/scripts/add_electricity.py +++ b/scripts/add_electricity.py @@ -53,14 +53,9 @@ Inputs :scale: 34 % - ``data/geth2015_hydro_capacities.csv``: alternative to capacities above; NOT CURRENTLY USED! -- ``data/bundle/time_series_60min_singleindex_filtered.csv``: Hourly per-country load profiles since 2010 from the `ENTSO-E statistical database `_ - .. image:: ../img/load-box.png - :scale: 33 % - - .. image:: ../img/load-ts.png - :scale: 33 % +- ``resources/opsd_load.csv`` Hourly per-country load profiles. 
- ``resources/regions_onshore.geojson``: confer :ref:`busregions` - ``resources/nuts3_shapes.geojson``: confer :ref:`shapes` - ``resources/powerplants.csv``: confer :ref:`powerplants` @@ -91,7 +86,6 @@ It further adds extendable ``generators`` with **zero** capacity for """ from vresutils.costdata import annuity -from vresutils.load import timeseries_opsd from vresutils import transfer as vtransfer import logging @@ -200,7 +194,6 @@ def load_powerplants(ppl_fn=None): .rename(columns=str.lower).drop(columns=['efficiency']) .replace({'carrier': carrier_dict})) - # ============================================================================= # Attach components # ============================================================================= @@ -211,17 +204,15 @@ def attach_load(n): substation_lv_i = n.buses.index[n.buses['substation_lv']] regions = (gpd.read_file(snakemake.input.regions).set_index('name') .reindex(substation_lv_i)) - opsd_load = (timeseries_opsd(slice(*n.snapshots[[0,-1]].year.astype(str)), - snakemake.input.opsd_load) * - snakemake.config.get('load', {}).get('scaling_factor', 1.0)) + opsd_load = (pd.read_csv(snakemake.input.load, index_col=0, parse_dates=True) .filter(items=snakemake.config['countries'])) - # Convert to naive UTC (has to be explicit since pandas 0.24) - opsd_load.index = opsd_load.index.tz_localize(None) + scaling = snakemake.config.get('load', {}).get('scaling_factor', 1.0) + logger.info(f"Load data scaled with scaling factor {scaling}.") + opsd_load *= scaling nuts3 = gpd.read_file(snakemake.input.nuts3_shapes).set_index('index') - def normed(x): return x.divide(x.sum()) - def upsample(cntry, group): l = opsd_load[cntry] if len(group) == 1: @@ -236,7 +227,8 @@ def attach_load(n): index=group.index) # relative factors 0.6 and 0.4 have been determined from a linear - # regression on the country to continent load data + # regression on the country to continent load data + # (refer to 
vresutils.load._upsampling_weights) factors = normed(0.6 * normed(gdp_n) + 0.4 * normed(pop_n)) return pd.DataFrame(factors.values * l.values[:,np.newaxis], index=l.index, columns=factors.index) diff --git a/scripts/build_load_data.py b/scripts/build_load_data.py new file mode 100755 index 00000000..0b781dd3 --- /dev/null +++ b/scripts/build_load_data.py @@ -0,0 +1,224 @@ +# coding: utf-8 +""" + +This rule downloads the load data from `Open Power System Data Time series `_. For all countries in the network, the per country load timeseries with suffix ``_load_actual_entsoe_transparency`` are extracted from the dataset. After filling small gaps linearly and large gaps by copying the time-slice of a given period, the load data is exported to a ``.csv`` file. + +Relevant Settings +----------------- + +.. code:: yaml + + snapshots: + + load: + url: + interpolate_limit: + time_shift_for_large_gaps: + manual_adjustments: true + + +.. seealso:: + Documentation of the configuration file ``config.yaml`` at + :ref:`load_cf` + +Inputs +------ + + +Outputs +------- + +- ``resources/load.csv``: + + +""" + +import logging +logger = logging.getLogger(__name__) +from _helpers import configure_logging + +import pandas as pd +import numpy as np +import dateutil +from pandas import Timedelta as Delta + + +def load_timeseries(fn, years, countries, powerstatistics=True): + """ + Read load data from OPSD time-series package version 2020-10-06. + + Parameters + ---------- + years : None or slice() + Years for which to read load data (defaults to + slice("2018","2019")) + fn : str + File name or url location (file format .csv) + countries : listlike + Countries for which to read load data. + powerstatistics: bool + Whether the electricity consumption data of the ENTSOE power + statistics (if true) or of the ENTSOE transparency map (if false) + should be parsed. 
+ + Returns + ------- + load : pd.DataFrame + Load time-series with UTC timestamps x ISO-2 countries + """ + logger.info(f"Retrieving load data from '{fn}'.") + + pattern = 'power_statistics' if powerstatistics else '_transparency' + pattern = f'_load_actual_entsoe_{pattern}' + rename = lambda s: s[:-len(pattern)] + date_parser = lambda x: dateutil.parser.parse(x, ignoretz=True) + return (pd.read_csv(fn, index_col=0, parse_dates=[0], date_parser=date_parser) + .filter(like=pattern) + .rename(columns=rename) + .dropna(how="all", axis=0) + .rename(columns={'GB_UKM' : 'GB'}) + .filter(items=countries) + .loc[years]) + + +def consecutive_nans(ds): + return (ds.isnull().astype(int) + .groupby(ds.notnull().astype(int).cumsum()[ds.isnull()]) + .transform('sum').fillna(0)) + + +def fill_large_gaps(ds, shift): + """ + Fill up large gaps with load data from the previous week. + + This function fills gaps ranging from 3 to 168 hours (one week). + """ + shift = Delta(shift) + nhours = shift / np.timedelta64(1, 'h') + if (consecutive_nans(ds) > nhours).any(): + logger.warning('There exist gaps larger than the time shift used for ' + 'copying time slices.') + time_shift = pd.Series(ds.values, ds.index + shift) + return ds.where(ds.notnull(), time_shift.reindex_like(ds)) + + +def nan_statistics(df): + def max_consecutive_nans(ds): + return (ds.isnull().astype(int) + .groupby(ds.notnull().astype(int).cumsum()) + .sum().max()) + consecutive = df.apply(max_consecutive_nans) + total = df.isnull().sum() + max_total_per_month = df.isnull().resample('m').sum().max() + return pd.concat([total, consecutive, max_total_per_month], + keys=['total', 'consecutive', 'max_total_per_month'], axis=1) + + +def copy_timeslice(load, cntry, start, stop, delta): + start = pd.Timestamp(start) + stop = pd.Timestamp(stop) + if start-delta in load.index and stop in load.index and cntry in load: + load.loc[start:stop, cntry] = load.loc[start-delta:stop-delta, cntry].values + + +def manual_adjustment(load, 
powerstatistics): + """ + Adjust gaps manually for load data from OPSD time-series package. + + 1. For the ENTSOE power statistics load data (if powerstatistics is True) + + Kosovo (KV) and Albania (AL) do not exist in the data set. Kosovo gets the + same load curve as Serbia and Albania the same as Macedonia, both scaled + by the corresponding ratio of total energy consumptions reported by + IEA Data browser [0] for the year 2013. + + 2. For the ENTSOE transparency load data (if powerstatistics is False) + + Albania (AL) and Macedonia (MK) do not exist in the data set. Both get the + same load curve as Montenegro, scaled by the corresponding ratio of total energy + consumptions reported by IEA Data browser [0] for the year 2016. + + [0] https://www.iea.org/data-and-statistics?country=WORLD&fuel=Electricity%20and%20heat&indicator=TotElecCons + + + Parameters + ---------- + load : pd.DataFrame + Load time-series with UTC timestamps x ISO-2 countries + powerstatistics: bool + Whether argument load comprises the electricity consumption data of + the ENTSOE power statistics or of the ENTSOE transparency map + + Returns + ------- + load : pd.DataFrame + Manually adjusted and interpolated load time-series with UTC + timestamps x ISO-2 countries + """ + + if powerstatistics: + if 'MK' in load.columns: + if 'AL' not in load.columns or load.AL.isnull().values.all(): + load['AL'] = load['MK'] * (4.1 / 7.4) + if 'RS' in load.columns: + if 'KV' not in load.columns or load.KV.isnull().values.all(): + load['KV'] = load['RS'] * (4.8 / 27.) 
+ + copy_timeslice(load, 'GR', '2015-08-11 21:00', '2015-08-15 20:00', Delta(weeks=1)) + copy_timeslice(load, 'AT', '2018-12-31 22:00', '2019-01-01 22:00', Delta(days=2)) + copy_timeslice(load, 'CH', '2010-01-19 07:00', '2010-01-19 22:00', Delta(days=1)) + copy_timeslice(load, 'CH', '2010-03-28 00:00', '2010-03-28 21:00', Delta(days=1)) + # is a WE, so take WE before + copy_timeslice(load, 'CH', '2010-10-08 13:00', '2010-10-10 21:00', Delta(weeks=1)) + copy_timeslice(load, 'CH', '2010-11-04 04:00', '2010-11-04 22:00', Delta(days=1)) + copy_timeslice(load, 'NO', '2010-12-09 11:00', '2010-12-09 18:00', Delta(days=1)) + # whole january missing + copy_timeslice(load, 'GB', '2009-12-31 23:00', '2010-01-31 23:00', Delta(days=-364)) + + else: + if 'ME' in load: + if 'AL' not in load and 'AL' in countries: + load['AL'] = load.ME * (5.7/2.9) + if 'MK' not in load and 'MK' in countries: + load['MK'] = load.ME * (6.7/2.9) + copy_timeslice(load, 'BG', '2018-10-27 21:00', '2018-10-28 22:00', Delta(weeks=1)) + + return load + + +if __name__ == "__main__": + + if 'snakemake' not in globals(): + from _helpers import mock_snakemake + snakemake = mock_snakemake('build_load_data') + + configure_logging(snakemake) + + config = snakemake.config + powerstatistics = config['load']['power_statistics'] + url = config['load']['url'] + interpolate_limit = config['load']['interpolate_limit'] + countries = config['countries'] + snapshots = pd.date_range(freq='h', **config['snapshots']) + years = slice(snapshots[0], snapshots[-1]) + time_shift = config['load']['time_shift_for_large_gaps'] + + load = load_timeseries(url, years, countries, powerstatistics) + + if config['load']['manual_adjustments']: + load = manual_adjustment(load, powerstatistics) + + logger.info(f"Linearly interpolate gaps of size {interpolate_limit} and less.") + load = load.interpolate(method='linear', limit=interpolate_limit) + + logger.info("Filling larger gaps by copying time-slices of period " + f"'{time_shift}'.") + 
load = load.apply(fill_large_gaps, shift=time_shift) + + assert not load.isna().any().any(), ( + 'Load data contains nans. Adjust the parameters ' + '`time_shift_for_large_gaps` or modify the `manual_adjustment` function ' + 'for implementing the needed load data modifications.') + + load.to_csv(snakemake.output[0]) + diff --git a/test/config.test1.yaml b/test/config.test1.yaml index 2efdaecb..e8f17758 100755 --- a/test/config.test1.yaml +++ b/test/config.test1.yaml @@ -146,6 +146,11 @@ transformers: type: '' load: + url: https://data.open-power-system-data.org/time_series/2019-06-05/time_series_60min_singleindex.csv + power_statistics: True # only for files from <2019; set false in order to get ENTSOE transparency data + interpolate_limit: 3 # data gaps up until this size are interpolated linearly + time_shift_for_large_gaps: 1w # period used for copying time-slices in order to fill large gaps of nans + manual_adjustments: true # false scaling_factor: 1.0 costs: