From 1b26c3c737f6fe93699fe3378b12d4d7b192a141 Mon Sep 17 00:00:00 2001 From: Tom Brown Date: Tue, 16 Apr 2019 16:03:51 +0200 Subject: [PATCH] Copy data preparation scripts from pypsa-eur sector branch --- .gitignore | 32 ++ Snakefile | 126 +++++ config.yaml | 12 + scripts/build_biomass_potentials.py | 53 ++ scripts/build_clustered_population_layouts.py | 32 ++ scripts/build_cop_profiles.py | 25 + scripts/build_energy_totals.py | 460 ++++++++++++++++++ scripts/build_heat_demand.py | 40 ++ scripts/build_industrial_demand.py | 35 ++ scripts/build_population_layouts.py | 104 ++++ scripts/build_solar_thermal_profiles.py | 55 +++ scripts/build_temperature_profiles.py | 48 ++ 12 files changed, 1022 insertions(+) create mode 100644 .gitignore create mode 100644 Snakefile create mode 100644 config.yaml create mode 100644 scripts/build_biomass_potentials.py create mode 100644 scripts/build_clustered_population_layouts.py create mode 100644 scripts/build_cop_profiles.py create mode 100644 scripts/build_energy_totals.py create mode 100644 scripts/build_heat_demand.py create mode 100644 scripts/build_industrial_demand.py create mode 100644 scripts/build_population_layouts.py create mode 100644 scripts/build_solar_thermal_profiles.py create mode 100644 scripts/build_temperature_profiles.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..4682a1b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +.snakemake* +.ipynb_checkpoints +__pycache__ +gurobi.log + +/bak +/resources +/results +/networks +/benchmarks +/logs +/notebooks +/data +/data/links_p_nom.csv +/data/*totals.csv +/data/emobility/ +/data/heating/ +/data/eurostat* +/data/odyssee/ +/data/transport_data.csv +/data/switzerland* +/data/.nfs* + +*.org + +*.nc + +*~ +/scripts/old + +*.pyc +/cutouts diff --git a/Snakefile b/Snakefile new file mode 100644 index 00000000..f4c23848 --- /dev/null +++ b/Snakefile @@ -0,0 +1,126 @@ + +configfile: "config.yaml" + +subworkflow pypsaeur: + workdir: "../pypsa-eur" + snakefile: "../pypsa-eur/Snakefile" + configfile: "../pypsa-eur/config.yaml" + + +rule prepare_sector_networks: + input: + pypsaeur(expand(config['results_dir'] + config['run'] + "/prenetworks/elec_s{simpl}_{clusters}_lv{lv}_{opts}.nc", + **config['scenario'])) + + + +rule build_population_layouts: + input: + nuts3_shapes='resources/nuts3_shapes.geojson', + urban_percent="data/urban_percent.csv" + output: + pop_layout_total="resources/pop_layout_total.nc", + pop_layout_urban="resources/pop_layout_urban.nc", + pop_layout_rural="resources/pop_layout_rural.nc" + script: "scripts/build_population_layouts.py" + + +rule build_clustered_population_layouts: + input: + pop_layout_total="resources/pop_layout_total.nc", + pop_layout_urban="resources/pop_layout_urban.nc", + pop_layout_rural="resources/pop_layout_rural.nc", + regions_onshore="resources/regions_onshore_{network}_s{simpl}_{clusters}.geojson" + output: + clustered_pop_layout="resources/pop_layout_{network}_s{simpl}_{clusters}.csv" + script: "scripts/build_clustered_population_layouts.py" + + +rule build_heat_demand: + input: + pop_layout_total="resources/pop_layout_total.nc", + pop_layout_urban="resources/pop_layout_urban.nc", + pop_layout_rural="resources/pop_layout_rural.nc", + regions_onshore="resources/regions_onshore_{network}_s{simpl}_{clusters}.geojson" + output: + heat_demand_urban="resources/heat_demand_urban_{network}_s{simpl}_{clusters}.nc", + heat_demand_rural="resources/heat_demand_rural_{network}_s{simpl}_{clusters}.nc", + 
heat_demand_total="resources/heat_demand_total_{network}_s{simpl}_{clusters}.nc" + script: "scripts/build_heat_demand.py" + +rule build_temperature_profiles: + input: + pop_layout_total="resources/pop_layout_total.nc", + pop_layout_urban="resources/pop_layout_urban.nc", + pop_layout_rural="resources/pop_layout_rural.nc", + regions_onshore="resources/regions_onshore_{network}_s{simpl}_{clusters}.geojson" + output: + temp_soil_total="resources/temp_soil_total_{network}_s{simpl}_{clusters}.nc", + temp_soil_rural="resources/temp_soil_rural_{network}_s{simpl}_{clusters}.nc", + temp_soil_urban="resources/temp_soil_urban_{network}_s{simpl}_{clusters}.nc", + temp_air_total="resources/temp_air_total_{network}_s{simpl}_{clusters}.nc", + temp_air_rural="resources/temp_air_rural_{network}_s{simpl}_{clusters}.nc", + temp_air_urban="resources/temp_air_urban_{network}_s{simpl}_{clusters}.nc" + script: "scripts/build_temperature_profiles.py" + + +rule build_cop_profiles: + input: + temp_soil_total="resources/temp_soil_total_{network}_s{simpl}_{clusters}.nc", + temp_soil_rural="resources/temp_soil_rural_{network}_s{simpl}_{clusters}.nc", + temp_soil_urban="resources/temp_soil_urban_{network}_s{simpl}_{clusters}.nc", + temp_air_total="resources/temp_air_total_{network}_s{simpl}_{clusters}.nc", + temp_air_rural="resources/temp_air_rural_{network}_s{simpl}_{clusters}.nc", + temp_air_urban="resources/temp_air_urban_{network}_s{simpl}_{clusters}.nc" + output: + cop_soil_total="resources/cop_soil_total_{network}_s{simpl}_{clusters}.nc", + cop_soil_rural="resources/cop_soil_rural_{network}_s{simpl}_{clusters}.nc", + cop_soil_urban="resources/cop_soil_urban_{network}_s{simpl}_{clusters}.nc", + cop_air_total="resources/cop_air_total_{network}_s{simpl}_{clusters}.nc", + cop_air_rural="resources/cop_air_rural_{network}_s{simpl}_{clusters}.nc", + cop_air_urban="resources/cop_air_urban_{network}_s{simpl}_{clusters}.nc" + script: "scripts/build_cop_profiles.py" + + +rule build_solar_thermal_profiles: + input: + pop_layout_total="resources/pop_layout_total.nc", + pop_layout_urban="resources/pop_layout_urban.nc", + pop_layout_rural="resources/pop_layout_rural.nc", + regions_onshore="resources/regions_onshore_{network}_s{simpl}_{clusters}.geojson" + output: + solar_thermal_total="resources/solar_thermal_total_{network}_s{simpl}_{clusters}.nc", + solar_thermal_urban="resources/solar_thermal_urban_{network}_s{simpl}_{clusters}.nc", + solar_thermal_rural="resources/solar_thermal_rural_{network}_s{simpl}_{clusters}.nc" + script: "scripts/build_solar_thermal_profiles.py" + + + +rule build_energy_totals: + input: + nuts3_shapes='resources/nuts3_shapes.geojson' + output: + energy_name='data/energy_totals.csv', + co2_name='data/co2_totals.csv', + transport_name='data/transport_data.csv' + threads: 1 + resources: mem_mb=10000 + script: 'scripts/build_energy_totals.py' + +rule build_biomass_potentials: + input: + jrc_potentials="data/biomass/JRC Biomass Potentials.xlsx" + output: + biomass_potentials='data/biomass_potentials.csv' + threads: 1 + resources: mem_mb=1000 + script: 'scripts/build_biomass_potentials.py' + +rule build_industrial_demand: + input: + clustered_pop_layout="resources/pop_layout_{network}_s{simpl}_{clusters}.csv" + output: + industrial_demand="resources/industrial_demand_{network}_s{simpl}_{clusters}.csv" + threads: 1 + resources: mem_mb=1000 + script: 'scripts/build_industrial_demand.py' diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..f8c3b984 --- /dev/null +++ b/config.yaml @@ -0,0 
+1,12 @@ + +results_dir: 'results/' +summary_dir: results +run: '190416-modular-test' + +scenario: + sectors: [E] # ,E+EV,E+BEV,E+BEV+V2G] # [ E+EV, E+BEV, E+BEV+V2G ] + simpl: [''] + lv: [1.0]#[1.0, 1.125, 1.25, 1.5, 2.0, opt]# or opt + clusters: [128] #[90, 128, 181] #[45, 64, 90, 128, 181, 256] #, 362] # (2**np.r_[5.5:9:.5]).astype(int) minimum is 37 + opts: [Co2L0-3H-T-H-B-I,Co2L0-3H-T-H-B-I-onwind0,Co2L0-3H-T-H-B-I-onwind0p1]#,Co2L0p05-3H-T-H-B-I,Co2L0p10-3H-T-H-B-I,Co2L0p20-3H-T-H-B-I,Co2L0p30-3H-T-H-B-I,Co2L0p50-3H-T-H-B-I]#[Co2L-3H-T-H,Co2L0p10-3H-T-H,Co2L0-3H-T-H,Co2L0p20-3H-T-H] #Co2L-3H-T-H,Co2L0p10-3H-T-H,Co2L0p20-3H-T-HCo2L-3H-T-H,Co2L0p10-3H-T-H,Co2L0p30-3H-T-H,Co2L0p50-3H-T-H] #Co2L-3H,Co2L-3H-T,, LC-FL, LC-T, Ep-T, Co2L-T] + # Co2L will give default (5%); Co2L0p25 will give 25% CO2 emissions; Co2Lm0p05 will give 5% negative emissions diff --git a/scripts/build_biomass_potentials.py b/scripts/build_biomass_potentials.py new file mode 100644 index 00000000..72a5a714 --- /dev/null +++ b/scripts/build_biomass_potentials.py @@ -0,0 +1,53 @@ + +import pandas as pd + +idx = pd.IndexSlice + +def build_biomass_potentials(): + + #delete empty column C from this sheet first before reading it in + df = pd.read_excel(snakemake.input.jrc_potentials, + "Potentials (PJ)") + + df.rename(columns={"Unnamed: 16":"Municipal waste"},inplace=True) + df.drop(columns="Total",inplace=True) + df.replace("-",0.,inplace=True) + + df_dict = {} + + for i in range(36): + df_dict[df.iloc[i*16,1]] = df.iloc[1+i*16:(i+1)*16].astype(float) + + df_new = pd.concat(df_dict) + + us_type = pd.Series(index=df_new.columns) + us_type.iloc[0:7] = "not included" + us_type.iloc[7:8] = "biogas" + us_type.iloc[8:9] = "solid biomass" + us_type.iloc[9:11] = "not included" + us_type.iloc[11:16] = "solid biomass" + us_type.iloc[16:17] = "biogas" + + + #convert from PJ to MWh + biomass_potentials = df_new.loc[idx[:,snakemake.config['biomass']['year'],snakemake.config['biomass']['scenario']],:].groupby(us_type,axis=1).sum().groupby(level=0).sum().rename({"UK" : "GB", "BH" : "BA"})/3.6*1e6 + biomass_potentials.to_csv(snakemake.output.biomass_potentials) + + + +if __name__ == "__main__": + + + # Detect running outside of snakemake and mock snakemake for testing + if 'snakemake' not in globals(): + from vresutils import Dict + import yaml + snakemake = Dict() + snakemake.input = Dict() + snakemake.input['jrc_potentials'] = "data/biomass/JRC Biomass Potentials.xlsx" + snakemake.output = Dict() + snakemake.output['biomass_potentials'] = 'data/biomass_potentials.csv' + with open('config.yaml') as f: + snakemake.config = yaml.load(f) + + build_biomass_potentials() diff --git a/scripts/build_clustered_population_layouts.py b/scripts/build_clustered_population_layouts.py new file mode 100644 index 00000000..de98d51d --- /dev/null +++ b/scripts/build_clustered_population_layouts.py @@ -0,0 +1,32 @@ + +import geopandas as gpd +import xarray as xr +import pandas as pd +import atlite + + + +cutout = atlite.Cutout(snakemake.config['renewable']['onwind']['cutout'], + cutout_dir=snakemake.config['atlite']['cutout_dir']) + + +clustered_busregions_as_geopd = gpd.read_file(snakemake.input.regions_onshore).set_index('name', drop=True) + +clustered_busregions = pd.Series(clustered_busregions_as_geopd.geometry, index=clustered_busregions_as_geopd.index) + + + +I = cutout.indicatormatrix(clustered_busregions) + + +items = ["total","urban","rural"] + +pop = pd.DataFrame(columns=items, + index=clustered_busregions.index) + + +for item in items: + 
pop_layout = xr.open_dataarray(snakemake.input['pop_layout_'+item]) + pop[item] = I.dot(pop_layout.stack(spatial=('y', 'x'))) + +pop.to_csv(snakemake.output.clustered_pop_layout) diff --git a/scripts/build_cop_profiles.py b/scripts/build_cop_profiles.py new file mode 100644 index 00000000..d469489c --- /dev/null +++ b/scripts/build_cop_profiles.py @@ -0,0 +1,25 @@ + +import xarray as xr + +#quadratic regression based on Staffell et al. (2012) +#https://doi.org/10.1039/C2EE22653G + +# COP is function of temp difference source to sink + +cop_f = {"air" : lambda d_t: 6.81 -0.121*d_t + 0.000630*d_t**2, + "soil" : lambda d_t: 8.77 -0.150*d_t + 0.000734*d_t**2} + +sink_T = 55. # Based on DTU / large area radiators + + + +for area in ["total", "urban", "rural"]: + for source in ["air", "soil"]: + + source_T = xr.open_dataarray(snakemake.input["temp_{}_{}".format(source,area)]) + + delta_T = sink_T - source_T + + cop = cop_f[source](delta_T) + + cop.to_netcdf(snakemake.output["cop_{}_{}".format(source,area)]) diff --git a/scripts/build_energy_totals.py b/scripts/build_energy_totals.py new file mode 100644 index 00000000..05551807 --- /dev/null +++ b/scripts/build_energy_totals.py @@ -0,0 +1,460 @@ + +import pandas as pd +import geopandas as gpd + +idx = pd.IndexSlice + +#translations for Eurostat +country_to_code = { +'EU28' : 'EU', +'EA19' : 'EA', +'Belgium' : 'BE', +'Bulgaria' : 'BG', +'Czech Republic' : 'CZ', +'Denmark' : 'DK', +'Germany' : 'DE', +'Estonia' : 'EE', +'Ireland' : 'IE', +'Greece' : 'GR', +'Spain' : 'ES', +'France' : 'FR', +'Croatia' : 'HR', +'Italy' : 'IT', +'Cyprus' : 'CY', +'Latvia' : 'LV', +'Lithuania' : 'LT', +'Luxembourg' : 'LU', +'Hungary' : 'HU', +'Malta' : 'MA', +'Netherlands' : 'NL', +'Austria' : 'AT', +'Poland' : 'PL', +'Portugal' : 'PT', +'Romania' : 'RO', +'Slovenia' : 'SI', +'Slovakia' : 'SK', +'Finland' : 'FI', +'Sweden' : 'SE', +'United Kingdom' : 'GB', +'Iceland' : 'IS', +'Norway' : 'NO', +'Montenegro' : 'ME', +'FYR of Macedonia' : 'MK', +'Albania' : 'AL', +'Serbia' : 'RS', +'Turkey' : 'TU', +'Bosnia and Herzegovina' : 'BA', +'Kosovo\n(UNSCR 1244/99)' : 'KO', #2017 version +'Kosovo\n(under United Nations Security Council Resolution 1244/99)' : 'KO', #2016 version +'Moldova' : 'MO', +'Ukraine' : 'UK', +'Switzerland' : 'CH', +} + + +non_EU = ['NO', 'CH', 'ME', 'MK', 'RS', 'BA', 'AL'] + +rename = {"GR" : "EL", + "GB" : "UK"} + +eu28 = ['FR', 'DE', 'GB', 'IT', 'ES', 'PL', 'SE', 'NL', 'BE', 'FI', 'CZ', + 'DK', 'PT', 'RO', 'AT', 'BG', 'EE', 'GR', 'LV', + 'HU', 'IE', 'SK', 'LT', 'HR', 'LU', 'SI'] + ['CY','MT'] + +eu28_eea = eu28[:] +eu28_eea.remove("GB") +eu28_eea.append("UK") + + +def build_eurostat(year): + """Return multi-index for all countries' energy data in TWh/a.""" + + stats_from_year = 2016 + + fns = {2016: "data/eurostat-energy_balances-june_2016_edition/{year}-Energy-Balances-June2016edition.xlsx", + 2017: "data/eurostat-energy_balances-june_2017_edition/{year}-ENERGY-BALANCES-June2017edition.xlsx"} + + #2016 includes BA, 2017 doesn't + + #with sheet as None, an ordered dictionary of all sheets is returned + dfs = pd.read_excel(fns[stats_from_year].format(year=year), + None, + skiprows=1, + index_col=list(range(4))) + + + #sorted_index necessary for slicing + df = pd.concat({country_to_code[df.columns[0]] : df for ct,df in dfs.items()},sort=True).sort_index() + + #drop non-numeric columns; convert ktoe/a to TWh/a + return df.drop(df.columns[df.dtypes != float],axis=1)*11.63/1e3 + + +def build_swiss(year): + + fn = 
"data/switzerland-sfoe/switzerland-new_format.csv" + + #convert PJ/a to TWh/a + return (pd.read_csv(fn,index_col=list(range(2)))/3.6).loc["CH",str(year)] + + + + +def build_idees(year): + base_dir = "data/jrc-idees-2015" + + totals = pd.DataFrame() + + #convert ktoe/a to TWh/a + factor = 11.63/1e3 + + for ct in population.index: + + if ct in non_EU: + print("When reading IDEES, skipping non-EU28 country",ct) + continue + + #RESIDENTIAL + + filename = "{}/JRC-IDEES-2015_Residential_{}.xlsx".format(base_dir,rename.get(ct,ct)) + df = pd.read_excel(filename,"RES_hh_fec") + + assert df.iloc[2,0] == "Space heating" + totals.loc[ct,"total residential space"] = df.loc[2,year] + + assert df.iloc[10,0] == "Advanced electric heating" + assert df.iloc[11,0] == "Conventional electric heating" + totals.loc[ct,"electricity residential space"] = df.loc[[10,11],year].sum() + + assert df.iloc[15,0] == "Water heating" + totals.loc[ct,"total residential water"] = df.loc[15,year] + assert df.iloc[23,0] == "Electricity" + totals.loc[ct,"electricity residential water"] = df.loc[23,year] + + assert df.iloc[25,0] == "Cooking" + totals.loc[ct,"total residential cooking"] = df.loc[25,year] + assert df.iloc[30,0] == "Electricity" + totals.loc[ct,"electricity residential cooking"] = df.loc[30,year] + + df = pd.read_excel(filename,"RES_summary") + + assert df.iloc[34,0] == "Energy consumption by fuel - Eurostat structure (ktoe)" + totals.loc[ct,"total residential"] = df.loc[34,year] + + assert df.iloc[47,0] == "Electricity" + totals.loc[ct,"electricity residential"] = df.loc[47,year] + + + #SERVICES + + filename = "{}/JRC-IDEES-2015_Tertiary_{}.xlsx".format(base_dir,rename.get(ct,ct)) + df = pd.read_excel(filename,"SER_hh_fec") + + assert df.iloc[2,0] == "Space heating" + totals.loc[ct,"total services space"] = df.loc[2,year] + + assert df.iloc[11,0] == "Advanced electric heating" + assert df.iloc[12,0] == "Conventional electric heating" + totals.loc[ct,"electricity services space"] = df.loc[[11,12],year].sum() + + assert df.iloc[17,0] == "Hot water" + totals.loc[ct,"total services water"] = df.loc[17,year] + assert df.iloc[24,0] == "Electricity" + totals.loc[ct,"electricity services water"] = df.loc[24,year] + + assert df.iloc[27,0] == "Catering" + totals.loc[ct,"total services cooking"] = df.loc[27,year] + assert df.iloc[31,0] == "Electricity" + totals.loc[ct,"electricity services cooking"] = df.loc[31,year] + + df = pd.read_excel(filename,"SER_summary") + + assert df.iloc[37,0] == "Energy consumption by fuel - Eurostat structure (ktoe)" + totals.loc[ct,"total services"] = df.loc[37,year] + + assert df.iloc[50,0] == "Electricity" + totals.loc[ct,"electricity services"] = df.loc[50,year] + + + # TRANSPORT + + filename = "{}/JRC-IDEES-2015_Transport_{}.xlsx".format(base_dir,rename.get(ct,ct)) + df = pd.read_excel(filename,"TrRoad_ene") + + assert df.iloc[2,0] == "by fuel (EUROSTAT DATA)" + totals.loc[ct,"total road"] = df.loc[2,year] + assert df.iloc[13,0] == "Electricity" + totals.loc[ct,"electricity road"] = df.loc[13,year] + + assert df.iloc[61,0] == "Passenger cars" + totals.loc[ct,"passenger car efficiency"] = df.loc[61,year] + + df = pd.read_excel(filename,"TrRail_ene") + + assert df.iloc[2,0] == "by fuel (EUROSTAT DATA)" + totals.loc[ct,"total rail"] = df.loc[2,year] + assert df.iloc[12,0] == "Electricity" + totals.loc[ct,"electricity rail"] = df.loc[12,year] + + df = pd.read_excel(filename,"TrRoad_act") + + assert df.iloc[85,0] == "Passenger cars" + totals.loc[ct,"passenger cars"] = df.loc[85,year] + + totals = 
totals*factor + + totals["passenger cars"] = totals["passenger cars"]/factor + + #convert ktoe/100km to kWh per km + totals["passenger car efficiency"] = 10*totals["passenger car efficiency"] + + return totals + + +def build_energy_totals(): + + clean_df = idees.reindex(population.index).drop(["passenger cars","passenger car efficiency"],axis=1) + + clean_df.loc["CH"] = swiss + + #get values for missing countries based on Eurostat EnergyBalances + #divide cooking/space/water according to averages in EU28 + + missing = clean_df.index[clean_df["total residential"].isnull()] + missing_in_eurostat = missing.intersection(eurostat.index.levels[0]) + uses = ["space","cooking","water"] + + for sector,eurostat_sector in [("residential","Residential"),("services","Services"), + ("road","Road"),("rail","Rail")]: + for fuel,eurostat_fuel in [("electricity","Electricity"),("total","Total all products")]: + clean_df.loc[missing_in_eurostat,"{} {}".format(fuel,sector)] = eurostat.loc[idx[missing_in_eurostat,:,:,eurostat_sector],eurostat_fuel].groupby(level=0).sum() + + if sector in ["road","rail"]: + continue + + fuel = "electricity" + for use in uses: + avg = (clean_df["{} {} {}".format(fuel,sector,use)]/clean_df["{} {}".format(fuel,sector)]).mean() + print("{}: average fraction of {} for {} is {}".format(sector,fuel,use,avg)) + clean_df.loc[missing_in_eurostat,"{} {} {}".format(fuel,sector,use)] = avg*clean_df.loc[missing_in_eurostat,"{} {}".format(fuel,sector)] + + fuel = "total" + for use in uses: + avg = ((clean_df["{} {} {}".format("total",sector,use)]-clean_df["{} {} {}".format("electricity",sector,use)])/ + (clean_df["{} {}".format("total",sector)]-clean_df["{} {}".format("electricity",sector)])).mean() + print("{}: average fraction of non-electric for {} is {}".format(sector,use,avg)) + clean_df.loc[missing_in_eurostat,"{} {} {}".format(fuel,sector,use)] = \ + clean_df.loc[missing_in_eurostat,"{} {} {}".format("electricity",sector,use)] \ + + avg*(clean_df.loc[missing_in_eurostat,"{} {}".format("total",sector)] - clean_df.loc[missing_in_eurostat,"{} {}".format("electricity",sector)]) + + + + #Fix Norway space and water heating fractions + #http://www.ssb.no/en/energi-og-industri/statistikker/husenergi/hvert-3-aar/2014-07-14 + #The main heating source for about 73 per cent of the households is based on electricity + #=> 26% is non-electric + elec_fraction = 0.73 + + without_norway = clean_df.drop("NO") + + for sector in ["residential","services"]: + + #assume non-electric is heating + total_heating = (clean_df.loc["NO","{} {}".format("total",sector)]-clean_df.loc["NO","{} {}".format("electricity",sector)])/(1-elec_fraction) + + for use in uses: + fraction = ((without_norway["{} {} {}".format("total",sector,use)]-without_norway["{} {} {}".format("electricity",sector,use)])/ + (without_norway["{} {}".format("total",sector)]-without_norway["{} {}".format("electricity",sector)])).mean() + clean_df.loc["NO","{} {} {}".format("total",sector,use)] = total_heating*fraction + clean_df.loc["NO","{} {} {}".format("electricity",sector,use)] = total_heating*fraction*elec_fraction + + #fix missing data for BA (services and road energy data) + missing = (clean_df.loc["BA"] == 0.) 
+ + #add back in proportional to RS with ratio of total residential demand + clean_df.loc["BA",missing] = clean_df.loc["BA","total residential"]/clean_df.loc["RS","total residential"]*clean_df.loc["RS",missing] + + clean_df.to_csv(snakemake.output.energy_name) + + return clean_df + + +def build_eea_co2(): + # see ../notebooks/compute_1990_Europe_emissions_for_targets.ipynb + + #https://www.eea.europa.eu/data-and-maps/data/national-emissions-reported-to-the-unfccc-and-to-the-eu-greenhouse-gas-monitoring-mechanism-14 + #downloaded 190222 (modified by EEA last on 181130) + fn = "data/eea/UNFCCC_v21.csv" + df = pd.read_csv(fn, encoding="latin-1") + df.loc[df["Year"] == "1985-1987","Year"] = 1986 + df["Year"] = df["Year"].astype(int) + df = df.set_index(['Country_code', 'Pollutant_name', 'Year', 'Sector_name']).sort_index() + + e = pd.Series() + e["electricity"] = '1.A.1.a - Public Electricity and Heat Production' + e['residential non-elec'] = '1.A.4.b - Residential' + e['services non-elec'] = '1.A.4.a - Commercial/Institutional' + e['rail non-elec'] = "1.A.3.c - Railways" + e["road non-elec"] = '1.A.3.b - Road Transportation' + e["domestic navigation"] = "1.A.3.d - Domestic Navigation" + e['international navigation'] = '1.D.1.b - International Navigation' + e["domestic aviation"] = '1.A.3.a - Domestic Aviation' + e["international aviation"] = '1.D.1.a - International Aviation' + e['total energy'] = '1 - Energy' + e['industrial processes'] = '2 - Industrial Processes and Product Use' + e['agriculture'] = '3 - Agriculture' + e['LULUCF'] = '4 - Land Use, Land-Use Change and Forestry' + e['waste management'] = '5 - Waste management' + e['other'] = '6 - Other Sector' + e['indirect'] = 'ind_CO2 - Indirect CO2' + e["total wL"] = "Total (with LULUCF, with indirect CO2)" + e["total woL"] = "Total (without LULUCF, with indirect CO2)" + + + pol = "CO2" #["All greenhouse gases - (CO2 equivalent)","CO2"] + + cts = ["CH","EUA","NO"] + eu28_eea + + year = 1990 + + emissions = df.loc[idx[cts,pol,year,e.values],"emissions"].unstack("Sector_name").rename(columns=pd.Series(e.index,e.values)).rename(index={"All greenhouse gases - (CO2 equivalent)" : "GHG"},level=1) + + #only take level 0, since level 1 (pol) and level 2 (year) are trivial + emissions = emissions.groupby(level=0,axis=0).sum() + + emissions.rename(index={"EUA" : "EU28", "UK" : "GB"},inplace=True) + + emissions['industrial non-elec'] = emissions['total energy'] - emissions[['electricity', 'services non-elec','residential non-elec', 'road non-elec', + 'rail non-elec', 'domestic aviation', 'international aviation', 'domestic navigation', + 'international navigation']].sum(axis=1) + + emissions.drop(columns=["total energy", "total wL", "total woL"],inplace=True) + + return emissions/1e3 + + +def build_eurostat_co2(year=1990): + + eurostat_for_co2 = build_eurostat(year) + + se = pd.Series(index=eurostat_for_co2.columns,dtype=float) + + #emissions in tCO2_equiv per MWh_th + se["Solid fuels"] = 0.36 #Approximates coal + se["Oil (total)"] = 0.285 #Average of distillate and residue + se["Gas"] = 0.2 #For natural gas + + #oil values from https://www.eia.gov/tools/faqs/faq.cfm?id=74&t=11 + #Distillate oil (No. 2) 0.276 + #Residual oil (No. 
6) 0.298 + #https://www.eia.gov/electricity/annual/html/epa_a_03.html + + + + eurostat_co2 = eurostat_for_co2.multiply(se).sum(axis=1) + + return eurostat_co2 + + +def build_co2_totals(year=1990): + + co2 = eea_co2.reindex(["EU28","NO","CH","BA","RS","AL","ME","MK"] + eu28) + + for ct in ["BA","RS","AL","ME","MK"]: + co2.loc[ct,"electricity"] = eurostat_co2[ct,"+","Conventional Thermal Power Stations","of which From Coal"].sum() + co2.loc[ct,"residential non-elec"] = eurostat_co2[ct,"+","+","Residential"].sum() + co2.loc[ct,"services non-elec"] = eurostat_co2[ct,"+","+","Services"].sum() + co2.loc[ct,"road non-elec"] = eurostat_co2[ct,"+","+","Road"].sum() + co2.loc[ct,"rail non-elec"] = eurostat_co2[ct,"+","+","Rail"].sum() + co2.loc[ct,"domestic navigation"] = eurostat_co2[ct,"+","+","Domestic Navigation"].sum() + co2.loc[ct,'international navigation'] = eurostat_co2[ct,"-","Bunkers"].sum() + co2.loc[ct,"domestic aviation"] = eurostat_co2[ct,"+","+","Domestic aviation"].sum() + co2.loc[ct,"international aviation"] = eurostat_co2[ct,"+","+","International aviation"].sum() + #doesn't include industrial process emissions or fuel processing/refining + co2.loc[ct,'industrial non-elec'] = eurostat_co2[ct,"+","Industry"].sum() + #doesn't include non-energy emissions + co2.loc[ct,'agriculture'] = eurostat_co2[ct,"+","+","Agriculture / Forestry"].sum() + + + + co2.to_csv(snakemake.output.co2_name) + + return co2 + + +def build_transport_data(): + + transport_data = pd.DataFrame(columns=["number cars","average fuel efficiency"], + index=population.index) + + ## collect number of cars + + transport_data["number cars"] = idees["passenger cars"] + + #CH from http://ec.europa.eu/eurostat/statistics-explained/index.php/Passenger_cars_in_the_EU#Luxembourg_has_the_highest_number_of_passenger_cars_per_inhabitant + transport_data.loc["CH","number cars"] = 4.136e6 + + missing = transport_data.index[transport_data["number cars"].isnull()] + + print("Missing data on cars from:") + + print(missing) + + cars_pp = transport_data["number cars"]/population + + transport_data.loc[missing,"number cars"] = cars_pp.mean()*population + + + ## collect average fuel efficiency in kWh/km + + transport_data["average fuel efficiency"] = idees["passenger car efficiency"] + + missing = transport_data.index[transport_data["average fuel efficiency"].isnull()] + + print("Missing data on fuel efficiency from:") + + print(missing) + + transport_data.loc[missing,"average fuel efficiency"] = transport_data["average fuel efficiency"].mean() + + transport_data.to_csv(snakemake.output.transport_name) + + return transport_data + + + +if __name__ == "__main__": + + + # Detect running outside of snakemake and mock snakemake for testing + if 'snakemake' not in globals(): + from vresutils import Dict + snakemake = Dict() + snakemake.output = Dict() + snakemake.output['energy_name'] = "data/energy_totals.csv" + snakemake.output['co2_name'] = "data/co2_totals.csv" + snakemake.output['transport_name'] = "data/transport_data.csv" + + snakemake.input = Dict() + snakemake.input['nuts3_shapes'] = 'resources/nuts3_shapes.geojson' + + nuts3 = gpd.read_file(snakemake.input.nuts3_shapes).set_index('index') + population = nuts3['pop'].groupby(nuts3.country).sum() + + year = 2011 + + eurostat = build_eurostat(year) + + swiss = build_swiss(year) + + idees = build_idees(year) + + build_energy_totals() + + eea_co2 = build_eea_co2() + + eurostat_co2 = build_eurostat_co2() + + build_co2_totals() + + build_transport_data() diff --git 
a/scripts/build_heat_demand.py b/scripts/build_heat_demand.py new file mode 100644 index 00000000..a1c793e1 --- /dev/null +++ b/scripts/build_heat_demand.py @@ -0,0 +1,40 @@ + +import geopandas as gpd +import atlite +import pandas as pd +import xarray as xr +import scipy as sp + + +if 'snakemake' not in globals(): + from vresutils import Dict + import yaml + snakemake = Dict() + with open('config.yaml') as f: + snakemake.config = yaml.load(f) + snakemake.input = Dict() + snakemake.output = Dict() + +time = pd.date_range(freq='m', **snakemake.config['snapshots']) +params = dict(years=slice(*time.year[[0, -1]]), months=slice(*time.month[[0, -1]])) + +cutout = atlite.Cutout(snakemake.config['renewable']['onwind']['cutout'], + cutout_dir=snakemake.config['atlite']['cutout_dir'], + **params) + +clustered_busregions_as_geopd = gpd.read_file(snakemake.input.regions_onshore).set_index('name', drop=True) + +clustered_busregions = pd.Series(clustered_busregions_as_geopd.geometry, index=clustered_busregions_as_geopd.index) + +I = cutout.indicatormatrix(clustered_busregions) + + +for item in ["rural","urban","total"]: + + pop_layout = xr.open_dataarray(snakemake.input['pop_layout_'+item]) + + M = I.T.dot(sp.diag(I.dot(pop_layout.stack(spatial=('y', 'x'))))) + + heat_demand = cutout.heat_demand(matrix=M.T,index=clustered_busregions.index) + + heat_demand.to_netcdf(snakemake.output["heat_demand_"+item]) diff --git a/scripts/build_industrial_demand.py b/scripts/build_industrial_demand.py new file mode 100644 index 00000000..e841da71 --- /dev/null +++ b/scripts/build_industrial_demand.py @@ -0,0 +1,35 @@ + +import pandas as pd + +idx = pd.IndexSlice + +def build_industrial_demand(): + + population = pd.read_csv(snakemake.input.clustered_pop_layout, + index_col=0) + + totals = pd.Series(data=[1100.,1814.,586.,400.,580.,186.], + index=["industry new electricity","industry process heat", + "naptha feedstock","shipping H2","aviation kerosene","process emissions"]) + + industrial_demand = pd.DataFrame({i : population["total"]*totals[i]*1e6/population["total"].sum() for i in totals.index }) + + industrial_demand.to_csv(snakemake.output.industrial_demand) + + + +if __name__ == "__main__": + + # Detect running outside of snakemake and mock snakemake for testing + if 'snakemake' not in globals(): + from vresutils import Dict + import yaml + snakemake = Dict() + snakemake.input = Dict() + snakemake.input['clustered_pop_layout'] = "resources/pop_layout_elec_s_128.csv" + snakemake.output = Dict() + snakemake.output['industrial_demand'] = "resources/industrial_demand_elec_s_128.csv" + with open('config.yaml') as f: + snakemake.config = yaml.load(f) + + build_industrial_demand() diff --git a/scripts/build_population_layouts.py b/scripts/build_population_layouts.py new file mode 100644 index 00000000..f4273a91 --- /dev/null +++ b/scripts/build_population_layouts.py @@ -0,0 +1,104 @@ + +# Build mapping between grid cells and population (total, urban, rural) + +import atlite +import pandas as pd +import xarray as xr + +from vresutils import shapes as vshapes + +import geopandas as gpd + + +if 'snakemake' not in globals(): + from vresutils import Dict + import yaml + snakemake = Dict() + with open('config.yaml') as f: + snakemake.config = yaml.load(f) + snakemake.input = Dict() + snakemake.output = Dict() + + snakemake.input["urban_percent"] = "data/urban_percent.csv" + +cutout = atlite.Cutout(snakemake.config['renewable']['onwind']['cutout'], + cutout_dir=snakemake.config['atlite']['cutout_dir']) + +grid_cells = 
cutout.grid_cells() + +#nuts3 has columns country, gdp, pop, geometry +#population is given in dimensions of 1e3=k +nuts3 = gpd.read_file(snakemake.input.nuts3_shapes).set_index('index') + + +# Indicator matrix NUTS3 -> grid cells +I = atlite.cutout.compute_indicatormatrix(nuts3.geometry, grid_cells) + +# Indicator matrix grid_cells -> NUTS3; inprinciple Iinv*I is identity +# but imprecisions mean not perfect +Iinv = cutout.indicatormatrix(nuts3.geometry) + +countries = nuts3.country.value_counts().index.sort_values() + +urban_fraction = pd.read_csv(snakemake.input.urban_percent, + header=None,index_col=0,squeeze=True)/100. + +#fill missing Balkans values +missing = ["AL","ME","MK"] +reference = ["RS","BA"] +urban_fraction = urban_fraction.reindex(urban_fraction.index|missing) +urban_fraction.loc[missing] = urban_fraction[reference].mean() + + +#population in each grid cell +pop_cells = pd.Series(I.dot(nuts3['pop'])) + +#in km^2 +cell_areas = pd.Series(cutout.grid_cells()).map(vshapes.area)/1e6 + +#pop per km^2 +density_cells = pop_cells/cell_areas + + +#rural or urban population in grid cell +pop_rural = pd.Series(0.,density_cells.index) +pop_urban = pd.Series(0.,density_cells.index) + +for ct in countries: + print(ct,urban_fraction[ct]) + + indicator_nuts3_ct = pd.Series(0.,nuts3.index) + indicator_nuts3_ct[nuts3.index[nuts3.country==ct]] = 1. + + indicator_cells_ct = pd.Series(Iinv.T.dot(indicator_nuts3_ct)) + + density_cells_ct = indicator_cells_ct*density_cells + + pop_cells_ct = indicator_cells_ct*pop_cells + + #correct for imprecision of Iinv*I + pop_ct = nuts3['pop'][indicator_nuts3_ct.index[indicator_nuts3_ct == 1.]].sum() + pop_cells_ct = pop_cells_ct*pop_ct/pop_cells_ct.sum() + + # The first low density grid cells to reach rural fraction are rural + index_from_low_d_to_high_d = density_cells_ct.sort_values().index + pop_ct_rural_b = pop_cells_ct[index_from_low_d_to_high_d].cumsum()/pop_cells_ct.sum() < (1-urban_fraction[ct]) + pop_ct_urban_b = ~pop_ct_rural_b + + pop_ct_rural_b[indicator_cells_ct==0.] = False + pop_ct_urban_b[indicator_cells_ct==0.] = False + + pop_rural += pop_cells_ct.where(pop_ct_rural_b,0.) + pop_urban += pop_cells_ct.where(pop_ct_urban_b,0.) 
+ +pop_cells = {"total" : pop_cells} + +pop_cells["rural"] = pop_rural +pop_cells["urban"] = pop_urban + +for key in pop_cells.keys(): + layout = xr.DataArray(pop_cells[key].values.reshape(cutout.shape), + [('y', cutout.coords['y']), ('x', cutout.coords['x'])]) + + layout.to_netcdf(snakemake.output["pop_layout_"+key]) + diff --git a/scripts/build_solar_thermal_profiles.py b/scripts/build_solar_thermal_profiles.py new file mode 100644 index 00000000..eaa65da3 --- /dev/null +++ b/scripts/build_solar_thermal_profiles.py @@ -0,0 +1,55 @@ + +import sys + +#override atlite +sys.path = ["/home/vres/data/tom/lib/atlite"] + sys.path + +import geopandas as gpd +import atlite +import pandas as pd +import xarray as xr +import scipy as sp + + +if 'snakemake' not in globals(): + from vresutils import Dict + import yaml + snakemake = Dict() + with open('config.yaml') as f: + snakemake.config = yaml.load(f) + snakemake.input = Dict() + snakemake.output = Dict() + +time = pd.date_range(freq='m', **snakemake.config['snapshots']) +params = dict(years=slice(*time.year[[0, -1]]), months=slice(*time.month[[0, -1]])) + + + +cutout = atlite.Cutout(snakemake.config['renewable']['onwind']['cutout'], + cutout_dir=snakemake.config['atlite']['cutout_dir'], + **params) + +clustered_busregions_as_geopd = gpd.read_file(snakemake.input.regions_onshore).set_index('name', drop=True) + +clustered_busregions = pd.Series(clustered_busregions_as_geopd.geometry, index=clustered_busregions_as_geopd.index) + +I = cutout.indicatormatrix(clustered_busregions) + + +for item in ["total","rural","urban"]: + + pop_layout = xr.open_dataarray(snakemake.input['pop_layout_'+item]) + + M = I.T.dot(sp.diag(I.dot(pop_layout.stack(spatial=('y', 'x'))))) + nonzero_sum = M.sum(axis=0, keepdims=True) + nonzero_sum[nonzero_sum == 0.] = 1. + M_tilde = M/nonzero_sum + + solar_thermal_angle = 45. + #should clearsky_model be "simple" or "enhanced"? 
+ solar_thermal = cutout.solar_thermal(clearsky_model="simple", + orientation={'slope': solar_thermal_angle, 'azimuth': 180.}, + matrix = M_tilde.T, + index=clustered_busregions.index) + + solar_thermal.to_netcdf(snakemake.output["solar_thermal_"+item]) diff --git a/scripts/build_temperature_profiles.py b/scripts/build_temperature_profiles.py new file mode 100644 index 00000000..94e1fa26 --- /dev/null +++ b/scripts/build_temperature_profiles.py @@ -0,0 +1,48 @@ + +import geopandas as gpd +import atlite +import pandas as pd +import xarray as xr +import scipy as sp + + +if 'snakemake' not in globals(): + from vresutils import Dict + import yaml + snakemake = Dict() + with open('config.yaml') as f: + snakemake.config = yaml.load(f) + snakemake.input = Dict() + snakemake.output = Dict() + +time = pd.date_range(freq='m', **snakemake.config['snapshots']) +params = dict(years=slice(*time.year[[0, -1]]), months=slice(*time.month[[0, -1]])) + + +cutout = atlite.Cutout(snakemake.config['renewable']['onwind']['cutout'], + cutout_dir=snakemake.config['atlite']['cutout_dir'], + **params) + +clustered_busregions_as_geopd = gpd.read_file(snakemake.input.regions_onshore).set_index('name', drop=True) + +clustered_busregions = pd.Series(clustered_busregions_as_geopd.geometry, index=clustered_busregions_as_geopd.index) + +I = cutout.indicatormatrix(clustered_busregions) + + +for item in ["total","rural","urban"]: + + pop_layout = xr.open_dataarray(snakemake.input['pop_layout_'+item]) + + M = I.T.dot(sp.diag(I.dot(pop_layout.stack(spatial=('y', 'x'))))) + nonzero_sum = M.sum(axis=0, keepdims=True) + nonzero_sum[nonzero_sum == 0.] = 1. + M_tilde = M/nonzero_sum + + temp_air = cutout.temperature(matrix=M_tilde.T,index=clustered_busregions.index) + + temp_air.to_netcdf(snakemake.output["temp_air_"+item]) + + temp_soil = cutout.soil_temperature(matrix=M_tilde.T,index=clustered_busregions.index) + + temp_soil.to_netcdf(snakemake.output["temp_soil_"+item])
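
As a plausibility check of the quadratic COP regression used in scripts/build_cop_profiles.py (Staffell et al. 2012), the fits can be evaluated by hand. A minimal sketch: the coefficients and the 55 C sink temperature are copied from the patch above, while the sample source temperatures are made-up values for illustration.

    import numpy as np

    # quadratic fits from scripts/build_cop_profiles.py (Staffell et al. 2012)
    cop_air = lambda d_t: 6.81 - 0.121*d_t + 0.000630*d_t**2
    cop_soil = lambda d_t: 8.77 - 0.150*d_t + 0.000734*d_t**2

    sink_T = 55.                            # radiator sink temperature [C], as in the patch
    source_T = np.array([-10., 0., 10.])    # illustrative ambient temperatures [C]

    delta_T = sink_T - source_T             # temperature lift seen by the heat pump
    print(np.round(cop_air(delta_T), 2))    # roughly [1.61 2.06 2.64]
    print(np.round(cop_soil(delta_T), 2))   # roughly [2.12 2.74 3.51]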
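
The urban/rural split in scripts/build_population_layouts.py sorts each country's grid cells from low to high population density and marks cells as rural until the cumulative population share reaches 1 - urban_fraction; everything denser counts as urban. A toy sketch of that cumulative-sum step, using five hypothetical cells and an assumed 40% urban share (all numbers invented for illustration):

    import pandas as pd

    pop_cells_ct = pd.Series([10., 30., 5., 40., 15.])     # population per cell (invented)
    density_cells_ct = pd.Series([1., 3., 0.5, 8., 2.])    # population density per cell (invented)
    urban_fraction_ct = 0.4                                 # assumed urban population share

    # same cumulative-sum logic as in build_population_layouts.py
    index_from_low_d_to_high_d = density_cells_ct.sort_values().index
    pop_ct_rural_b = (pop_cells_ct[index_from_low_d_to_high_d].cumsum()
                      / pop_cells_ct.sum()) < (1 - urban_fraction_ct)

    # cumulative shares along the sorted order are 0.05, 0.15, 0.30, 0.60, 1.00,
    # so the three least dense cells (2, 0, 4) end up rural, cells 1 and 3 urban
    print(pop_ct_rural_b.sort_index())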
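
The unit conversions used throughout (ktoe/a to TWh/a via 11.63/1e3 in build_eurostat and build_idees, PJ/a to TWh/a via division by 3.6 in build_swiss, PJ to MWh via 1e6/3.6 in build_biomass_potentials) follow from 1 toe = 41.868 GJ and 1 MWh = 3.6 GJ; a quick arithmetic cross-check:

    print(41.868e9 / 3.6e6 * 1e3 / 1e9)   # 1 ktoe = 0.01163 TWh, i.e. factor 11.63/1e3
    print(1e15 / 3.6e9 / 1e3)             # 1 PJ  = 0.2778 TWh,  i.e. divide by 3.6
    print(1e15 / 3.6e9)                   # 1 PJ  = 277778 MWh,  i.e. factor 1e6/3.6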