
"""
Datasets
--------

A few popular time series datasets
"""

import os
from pathlib import Path
from typing import List, Literal, Optional

import numpy as np
import pandas as pd

from darts import TimeSeries
from darts.logging import get_logger, raise_if_not
from darts.utils.utils import _build_tqdm_iterator

from .dataset_loaders import DatasetLoaderCSV, DatasetLoaderMetadata

"""
    Overall usage of this package:
    from darts.datasets import AirPassengersDataset
    ts: TimeSeries = AirPassengersDataset().load()
"""

logger = get_logger(__name__)

_DEFAULT_PATH = "https://raw.githubusercontent.com/unit8co/darts/master/datasets"
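

# Illustrative usage (a minimal sketch, not part of the original module): every
# loader below follows the same pattern -- instantiate the dataset class, then
# call ``load()`` to download and cache the CSV and obtain a ``TimeSeries``.
# The 0.75 split ratio is an arbitrary illustration.
def _example_basic_usage():
    from darts.datasets import AirPassengersDataset

    series = AirPassengersDataset().load()  # downloads on first call, then cached
    train, val = series.split_before(0.75)  # split into train/validation parts
    return train, val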


class AirPassengersDataset(DatasetLoaderCSV):
    """
    Monthly Air Passengers Dataset, from 1949 to 1960.
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "air_passengers.csv",
                uri=_DEFAULT_PATH + "/AirPassengers.csv",
                hash="167ffa96204a2b47339c21eea25baf32",
                header_time="Month",
            )
        )


class AusBeerDataset(DatasetLoaderCSV):
    """
    Total quarterly beer production in Australia (in megalitres) from 1956:Q1 to 2008:Q3 [1]_.

    References
    ----------
    .. [1] https://rdrr.io/cran/fpp/man/ausbeer.html
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ausbeer.csv",
                uri=_DEFAULT_PATH + "/ausbeer.csv",
                hash="1f4028a570a20939411cc04de7364bbd",
                header_time="date",
                format_time="%Y-%m-%d",
            )
        )


class AustralianTourismDataset(DatasetLoaderCSV):
    """
    A single multivariate TimeSeries, containing monthly tourism numbers over 36 months in Australia.
    The numbers are broken down per region ("NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT"),
    reason ("Hol", "VFR", "Bus", "Oth"), (region, reason) pairs, and (region, reason, <city>) tuples,
    where <city> can be either "city" or "noncity".

    This is an augmented version of the Australian tourism dataset available in [1]_, where we
    pre-computed the groupings per region (not available in the original dataset).

    References
    ----------
    .. [1] https://robjhyndman.com/publications/hierarchical-tourism/
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "australian_tourism.csv",
                uri=_DEFAULT_PATH + "/australian_tourism.csv",
                hash="6eeea6b56e16e01123f303b492d9901c",
                header_time=None,
                format_time=None,
            )
        )


class EnergyDataset(DatasetLoaderCSV):
    """
    Hourly energy dataset coming from [1]_.

    Contains a time series with 28 hourly components between 2014-12-31 23:00:00 and 2018-12-31 22:00:00.

    References
    ----------
    .. [1] https://www.kaggle.com/nicholasjhana/energy-consumption-generation-prices-and-weather
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "energy.csv",
                uri=_DEFAULT_PATH + "/energy_dataset.csv",
                hash="f564ef18e01574734a0fa20806d1c7ee",
                header_time="time",
                format_time="%Y-%m-%d %H:%M:%S",
            )
        )


class GasRateCO2Dataset(DatasetLoaderCSV):
    """
    Gas Rate CO2 dataset.
    Two components, length 296 (integer time index).
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "gasrate_co2.csv",
                uri=_DEFAULT_PATH + "/gasrate_co2.csv",
                hash="77bf383715a9cf81459f81fe17baf3b0",
                header_time=None,
                format_time=None,
            )
        )


class HeartRateDataset(DatasetLoaderCSV):
    """
    The series contains 1800 evenly-spaced measurements of instantaneous heart rate from a single subject.
    The measurements (in units of beats per minute) occur at 0.5 second intervals, so that the length
    of the series is exactly 15 minutes.

    This is series1 in [1]_. It uses an integer time index.

    References
    ----------
    .. [1] http://ecg.mit.edu/time-series/
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "heart_rate.csv",
                uri=_DEFAULT_PATH + "/heart_rate.csv",
                hash="3c4a108e1116867cf056dc5be2c95386",
                header_time=None,
                format_time=None,
            )
        )


class IceCreamHeaterDataset(DatasetLoaderCSV):
    """
    Monthly sales of heaters and ice cream between January 2004 and June 2020.
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ice_cream_heater.csv",
                uri=_DEFAULT_PATH + "/ice_cream_heater.csv",
                hash="62031c7b5cdc9339fe7cf389173ef1c3",
                header_time="Month",
                format_time="%Y-%m",
            )
        )


class MonthlyMilkDataset(DatasetLoaderCSV):
    """
    Monthly production of milk (in pounds per cow) between January 1962 and December 1975.
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "monthly_milk.csv",
                uri=_DEFAULT_PATH + "/monthly-milk.csv",
                hash="4784443e696da45d7082e76a67687b93",
                header_time="Month",
                format_time="%Y-%m",
            )
        )


class MonthlyMilkIncompleteDataset(DatasetLoaderCSV):
    """
    Monthly production of milk (in pounds per cow) between January 1962 and December 1975.
    Has some missing values.
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "monthly_milk-incomplete.csv",
                uri=_DEFAULT_PATH + "/monthly-milk-incomplete.csv",
                hash="49b275c7e2f8f28a6a05224be1a049a4",
                header_time="Month",
                format_time="%Y-%m",
                freq="MS",
            )
        )
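

# Hedged example (illustrative, not part of the library source): the incomplete
# milk series contains NaN gaps that can be filled with darts' standard
# ``fill_missing_values`` utility before modeling.
def _example_fill_missing_milk_values():
    from darts.datasets import MonthlyMilkIncompleteDataset
    from darts.utils.missing_values import fill_missing_values

    series = MonthlyMilkIncompleteDataset().load()
    filled = fill_missing_values(series)  # interpolates the NaN gaps by default
    return filled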


class SunspotsDataset(DatasetLoaderCSV):
    """
    Monthly Sunspot Numbers, 1749 - 1983.

    Monthly mean relative sunspot numbers from 1749 to 1983. Collected at Swiss Federal Observatory,
    Zurich until 1960, then Tokyo Astronomical Observatory.
    Source: [1]_

    References
    ----------
    .. [1] https://www.rdocumentation.org/packages/datasets/versions/3.6.1/topics/sunspots
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "sunspots.csv",
                uri=_DEFAULT_PATH + "/monthly-sunspots.csv",
                hash="4d27019c43d9c256d528f1bd6c5f40e0",
                header_time="Month",
                format_time="%Y-%m",
            )
        )


class TaylorDataset(DatasetLoaderCSV):
    """
    Half-hourly electricity demand in England and Wales from Monday 5 June 2000 to Sunday 27 August 2000.
    Discussed in Taylor (2003) [1]_, and kindly provided by James W. Taylor [2]_.

    Units: Megawatts (uses an integer time index).

    References
    ----------
    .. [1] Taylor, J.W. (2003) Short-term electricity demand forecasting using double seasonal
           exponential smoothing. Journal of the Operational Research Society, 54, 799-805.
    .. [2] https://www.rdocumentation.org/packages/forecast/versions/8.13/topics/taylor
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "taylor.csv",
                uri=_DEFAULT_PATH + "/taylor.csv",
                hash="1ea355c90e8214cb177788a674801a22",
                header_time=None,
                format_time=None,
            )
        )


class TemperatureDataset(DatasetLoaderCSV):
    """
    Daily temperature in Melbourne between 1981 and 1990.
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "temperatures.csv",
                uri=_DEFAULT_PATH + "/temps.csv",
                hash="ce5b5e4929793ec8b6a54711110acebf",
                header_time="Date",
                format_time="%m/%d/%Y",
                freq="D",
            )
        )


class USGasolineDataset(DatasetLoaderCSV):
    """
    Weekly U.S. Product Supplied of Finished Motor Gasoline between 1991-02-08 and 2021-04-30.
    Obtained from [1]_.

    References
    ----------
    .. [1] https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=wgfupus2&f=W
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "us_gasoline.csv",
                uri=_DEFAULT_PATH + "/us_gasoline.csv",
                hash="25d440337a06cbf83423e81d0337a1ce",
                header_time="Week",
                format_time="%m/%d/%Y",
            )
        )


class WineDataset(DatasetLoaderCSV):
    """
    Australian total wine sales by wine makers in bottles <= 1 litre.
    Monthly between Jan 1980 and Aug 1994.
    Source: [1]_

    References
    ----------
    .. [1] https://www.rdocumentation.org/packages/forecast/versions/8.1/topics/wineind
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "wine.csv",
                uri=_DEFAULT_PATH + "/wineind.csv",
                hash="b68971d7e709ad0b7e6300cab977e3cd",
                header_time="date",
                format_time="%Y-%m-%d",
            )
        )


class WoolyDataset(DatasetLoaderCSV):
    """
    Quarterly production of woollen yarn in Australia: tonnes. Mar 1965 -- Sep 1994.
    Source: [1]_

    References
    ----------
    .. [1] https://www.rdocumentation.org/packages/forecast/versions/8.1/topics/woolyrnq
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "wooly.csv",
                uri=_DEFAULT_PATH + "/woolyrnq.csv",
                hash="4be8b12314db94c8fd76f5c674454bf0",
                header_time="date",
                format_time="%Y-%m-%d",
            )
        )


class ETTh1Dataset(DatasetLoaderCSV):
    """
    Data from one electricity transformer at one station, including load and oil temperature.
    The dataset ranges from 2016/07 to 2018/07, sampled hourly.
    Source: [1]_ [2]_

    Field Descriptions:

    * date: The recorded date
    * HUFL: High UseFul Load
    * HULL: High UseLess Load
    * MUFL: Medium UseFul Load
    * MULL: Medium UseLess Load
    * LUFL: Low UseFul Load
    * LULL: Low UseLess Load
    * OT: Oil Temperature (Target)

    References
    ----------
    .. [1] https://github.com/zhouhaoyi/ETDataset
    .. [2] https://arxiv.org/abs/2012.07436
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ETTh1.csv",
                uri=_DEFAULT_PATH + "/ETTh1.csv",
                hash="8381763947c85f4be6ac456c508460d6",
                header_time="date",
                format_time="%Y-%m-%d %H:%M:%S",
            )
        )
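

# Hedged example (illustrative, not part of the library source): the ETT
# datasets are commonly used with "OT" (oil temperature) as the forecasting
# target and the six load columns as covariates. ``series["OT"]`` and
# ``drop_columns`` are standard darts TimeSeries operations.
def _example_ett_target_and_covariates():
    from darts.datasets import ETTh1Dataset

    series = ETTh1Dataset().load()
    target = series["OT"]  # univariate target component
    covariates = series.drop_columns(["OT"])  # the remaining load components
    return target, covariates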


class ETTh2Dataset(DatasetLoaderCSV):
    """
    Data from one electricity transformer at one station, including load and oil temperature.
    The dataset ranges from 2016/07 to 2018/07, sampled hourly.
    Source: [1]_ [2]_

    Field Descriptions:

    * date: The recorded date
    * HUFL: High UseFul Load
    * HULL: High UseLess Load
    * MUFL: Medium UseFul Load
    * MULL: Medium UseLess Load
    * LUFL: Low UseFul Load
    * LULL: Low UseLess Load
    * OT: Oil Temperature (Target)

    References
    ----------
    .. [1] https://github.com/zhouhaoyi/ETDataset
    .. [2] https://arxiv.org/abs/2012.07436
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ETTh2.csv",
                uri=_DEFAULT_PATH + "/ETTh2.csv",
                hash="51a229a3fc13579dd939364fefe9c7ab",
                header_time="date",
                format_time="%Y-%m-%d %H:%M:%S",
            )
        )


class ETTm1Dataset(DatasetLoaderCSV):
    """
    Data from one electricity transformer at one station, including load and oil temperature.
    The dataset ranges from 2016/07 to 2018/07, recorded every 15 minutes.
    Source: [1]_ [2]_

    Field Descriptions:

    * date: The recorded date
    * HUFL: High UseFul Load
    * HULL: High UseLess Load
    * MUFL: Medium UseFul Load
    * MULL: Medium UseLess Load
    * LUFL: Low UseFul Load
    * LULL: Low UseLess Load
    * OT: Oil Temperature (Target)

    References
    ----------
    .. [1] https://github.com/zhouhaoyi/ETDataset
    .. [2] https://arxiv.org/abs/2012.07436
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ETTm1.csv",
                uri=_DEFAULT_PATH + "/ETTm1.csv",
                hash="82d6bd89109c63d075d99c1077b33f38",
                header_time="date",
                format_time="%Y-%m-%d %H:%M:%S",
            )
        )


class ETTm2Dataset(DatasetLoaderCSV):
    """
    Data from one electricity transformer at one station, including load and oil temperature.
    The dataset ranges from 2016/07 to 2018/07, recorded every 15 minutes.
    Source: [1]_ [2]_

    Field Descriptions:

    * date: The recorded date
    * HUFL: High UseFul Load
    * HULL: High UseLess Load
    * MUFL: Medium UseFul Load
    * MULL: Medium UseLess Load
    * LUFL: Low UseFul Load
    * LULL: Low UseLess Load
    * OT: Oil Temperature (Target)

    References
    ----------
    .. [1] https://github.com/zhouhaoyi/ETDataset
    .. [2] https://arxiv.org/abs/2012.07436
    """

    def __init__(self):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ETTm2.csv",
                uri=_DEFAULT_PATH + "/ETTm2.csv",
                hash="7687e47825335860bf58bccb31be0c56",
                header_time="date",
                format_time="%Y-%m-%d %H:%M:%S",
            )
        )


class ElectricityDataset(DatasetLoaderCSV):
    """
    Measurements of the electric power consumption of 370 clients, recorded in kW with a
    15-minute sampling rate.
    Source: [1]_

    Loading this dataset will provide a multivariate timeseries with 370 columns, one for each
    household. The following code can be used to convert the dataset to a list of univariate
    timeseries, one for each household.

    References
    ----------
    .. [1] https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014
    """

    def __init__(self, multivariate: bool = True):
        """
        Parameters
        ----------
        multivariate: bool
            Whether to return a single multivariate timeseries - if False returns a list of
            univariate TimeSeries. Default is True.
        """

        def pre_process_fn(extracted_dir, dataset_path):
            with open(Path(extracted_dir, "LD2011_2014.txt")) as fin:
                with open(dataset_path, "w", newline="\n") as fout:
                    for line in fin:
                        fout.write(line.replace(",", ".").replace(";", ","))

        super().__init__(
            metadata=DatasetLoaderMetadata(
                "Electricity.csv",
                uri="https://archive.ics.uci.edu/static/public/321/electricityloaddiagrams20112014.zip",
                hash="acfe6783eea43905e510f537add940fd",
                header_time="Unnamed: 0",
                format_time="%Y-%m-%d %H:%M:%S",
                pre_process_zipped_csv_fn=pre_process_fn,
                multivariate=multivariate,
            )
        )

    def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
        """
        Load the electricity dataset as a list of univariate series, one for each household.
        """
        ts_list = []  # list of timeseries
        for label in _build_tqdm_iterator(
            series, verbose=True, total=len(series.columns)
        ):
            srs = series[label]

            # filter the column down to the period of recording
            srs = srs.replace(0.0, np.nan)
            start_date = min(srs.ffill().dropna().index)
            end_date = max(srs.bfill().dropna().index)
            active_range = (srs.index >= start_date) & (srs.index <= end_date)
            srs = srs[active_range].fillna(0.0)

            # convert to timeseries
            tmp = pd.DataFrame({"power_usage": srs})
            tmp["date"] = tmp.index
            ts = TimeSeries.from_dataframe(tmp, "date", ["power_usage"])
            ts_list.append(ts)
        return ts_list
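

# Hedged example (illustrative, not part of the library source): with
# ``multivariate=False`` the loader applies ``_to_multi_series`` and returns
# one univariate TimeSeries per household rather than a single 370-component
# series. Note that the first call downloads the full UCI archive.
def _example_electricity_per_household():
    from darts.datasets import ElectricityDataset

    series_list = ElectricityDataset(multivariate=False).load()
    first_household = series_list[0]  # univariate "power_usage" series
    return first_household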


class UberTLCDataset(DatasetLoaderCSV):
    """
    14.3 million Uber pickups from January to June 2015. The data is resampled to an hourly or
    daily frequency, based on `sample_freq`, using the locationID as the target.
    Source: [1]_

    Loading this dataset will provide a multivariate timeseries with 262 columns, one for each
    locationID. The following code can be used to convert the dataset to a list of univariate
    timeseries, one for each locationID.

    References
    ----------
    .. [1] https://github.com/fivethirtyeight/uber-tlc-foil-response
    """

    def __init__(self, sample_freq: str = "hourly", multivariate: bool = True):
        """
        Parameters
        ----------
        sample_freq: str
            The sampling frequency of the data. Can be "hourly" or "daily". Default is "hourly".
        multivariate: bool
            Whether to return a single multivariate timeseries - if False returns a list of
            univariate TimeSeries. Default is True.
        """
        valid_sample_freq = ["daily", "hourly"]
        raise_if_not(
            sample_freq in valid_sample_freq,
            f"sample_freq must be one of {valid_sample_freq}",
            logger,
        )

        def pre_process_fn(extracted_dir, dataset_path):
            df = pd.read_csv(
                Path(extracted_dir, "uber-raw-data-janjune-15.csv"),
                header=0,
                usecols=["Pickup_date", "locationID"],
                index_col=0,
            )
            output_dict = {}
            freq_setting = "1H" if "hourly" in str(dataset_path) else "1D"
            time_series_of_locations = list(df.groupby(by="locationID"))
            for locationID, loc_df in time_series_of_locations:
                loc_df.index = pd.to_datetime(loc_df.index)
                loc_df = loc_df.sort_index()
                count_series = loc_df.resample(rule=freq_setting).size()
                output_dict[locationID] = count_series
            output_df = pd.DataFrame(output_dict)
            output_df.to_csv(dataset_path)

        super().__init__(
            metadata=DatasetLoaderMetadata(
                f"uber_tlc_{sample_freq}.csv",
                uri="https://github.com/fivethirtyeight/uber-tlc-foil-response/raw/"
                "63bb878b76f47f69b4527d50af57aac26dead983/"
                "uber-trip-data/uber-raw-data-janjune-15.csv.zip",
                hash=(
                    "9ed84ebe0df4bc664748724b633b3fe6"
                    if sample_freq == "hourly"
                    else "24f9fd67e4b9e53f0214a90268cd9bee"
                ),
                header_time="Pickup_date",
                format_time="%Y-%m-%d %H:%M:%S",
                pre_process_zipped_csv_fn=pre_process_fn,
                multivariate=multivariate,
            )
        )

    def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
        """
        Load the Uber TLC dataset as a list of univariate timeseries, one for each locationID.
        """
        ts_list = []  # list of timeseries
        for label in _build_tqdm_iterator(
            series, verbose=True, total=len(series.columns)
        ):
            srs = series[label]

            # filter the column down to the period of recording
            start_date = min(srs.ffill().dropna().index)
            end_date = max(srs.bfill().dropna().index)
            active_range = (srs.index >= start_date) & (srs.index <= end_date)
            srs = srs[active_range]

            # convert to timeseries
            tmp = pd.DataFrame({"locationID": srs})
            tmp["date"] = tmp.index
            ts = TimeSeries.from_dataframe(tmp, "date", ["locationID"])
            ts_list.append(ts)
        return ts_list
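

# Hedged example (illustrative, not part of the library source): ``sample_freq``
# selects which pre-aggregated CSV is built from the raw trip records; "daily"
# yields one pickup count per locationID per day.
def _example_uber_daily_counts():
    from darts.datasets import UberTLCDataset

    daily = UberTLCDataset(sample_freq="daily", multivariate=True).load()
    return daily  # multivariate series, one component per locationID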


class ILINetDataset(DatasetLoaderCSV):
    """
    ILI describes the number of patients seen with influenza-like illness and the total number
    of patients. It includes weekly data from the Centers for Disease Control and Prevention of
    the United States from 1997 to 2022.
    Source: [1]_ [2]_ [3]_ [4]_

    Components Descriptions:

    * % WEIGHTED ILI: Combined state-specific data of patient visits to healthcare providers for
      ILI reported each week, weighted by state population
    * % UNWEIGHTED ILI: Combined state-specific data of patient visits to healthcare providers for
      ILI reported each week, unweighted by state population
    * AGE 0-4: Number of patients between 0 and 4 years of age
    * AGE 25-49: Number of patients between 25 and 49 years of age
    * AGE 25-64: Number of patients between 25 and 64 years of age
    * AGE 5-24: Number of patients between 5 and 24 years of age
    * AGE 50-64: Number of patients between 50 and 64 years of age
    * AGE 65: Number of patients 65 years of age and older (>= 65)
    * ILITOTAL: Total number of ILI patients. For this system, ILI is defined as fever
      (temperature of 100°F [37.8°C] or greater) and a cough and/or a sore throat
    * NUM. OF PROVIDERS: Number of outpatient healthcare providers
    * TOTAL PATIENTS: Total number of patients

    References
    ----------
    .. [1] https://gis.cdc.gov/grasp/fluview/fluportaldashboard.html
    .. [2] https://www.cdc.gov/flu/weekly/overview.htm#Outpatient
    .. [3] https://arxiv.org/pdf/2205.13504.pdf
    .. [4] https://gis.cdc.gov/grasp/fluview/FluViewPhase2QuickReferenceGuide.pdf
    """

    def __init__(self, multivariate: bool = True):
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "ILINet.csv",
                uri=_DEFAULT_PATH + "/ILINet.csv",
                hash="c9cbd6cc0a92b21cd95bec2706212d8d",
                header_time="DATE",
                format_time="%Y-%m-%d",
                freq="W",
                multivariate=multivariate,
            )
        )

    def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
        """
        Load the ILINet dataset as a list of univariate timeseries, one per component.
        """
        return [TimeSeries.from_series(series[label]) for label in series]


class ExchangeRateDataset(DatasetLoaderCSV):
    """
    The collection of daily exchange rates of eight countries (Australia, the United Kingdom,
    Canada, Switzerland, China, Japan, New Zealand, and Singapore), ranging from 1990 to 2016.
    Unfortunately, there were some inconsistencies concerning the dates, so the resulting
    TimeSeries is integer-indexed.
    Source: [1]_

    References
    ----------
    .. [1] https://github.com/laiguokun/multivariate-time-series-data
    """

    def __init__(self, multivariate: bool = True):
        """
        Parameters
        ----------
        multivariate: bool
            Whether to return a single multivariate timeseries - if False returns a list of
            univariate TimeSeries. Default is True.
        """
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "exchange_rate.csv",
                uri=_DEFAULT_PATH + "/exchange_rate.csv",
                hash="6e35621a9eb6a9dd5465cf52a22b1339",
                header_time=None,
                multivariate=multivariate,
            )
        )

    def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
        """
        Load the ExchangeRate dataset as a list of univariate timeseries, one per country.
        """
        return [TimeSeries.from_series(series[label]) for label in series]


class TrafficDataset(DatasetLoaderCSV):
    """
    A collection of 48 months (2015-2016) of hourly data from the California Department of
    Transportation. The data describes the road occupancy rates (between 0 and 1) measured by
    862 different sensors on San Francisco Bay Area freeways. The raw data is available at
    http://pems.dot.ca.gov.
    Source: [1]_

    References
    ----------
    .. [1] https://github.com/laiguokun/multivariate-time-series-data
    """

    def __init__(self, multivariate: bool = True):
        """
        Parameters
        ----------
        multivariate: bool
            Whether to return a single multivariate timeseries - if False returns a list of
            univariate TimeSeries. Default is True.
        """
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "traffic.csv",
                uri=_DEFAULT_PATH + "/traffic.csv",
                hash="a2105f364ef70aec06c757304833f72a",
                header_time="Date",
                format_time="%Y-%m-%d %H:%M:%S",
                freq="1H",
                multivariate=multivariate,
            )
        )

    def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
        """
        Load the Traffic dataset as a list of univariate timeseries, one per sensor ID.
        """
        return [TimeSeries.from_series(series[label]) for label in series]


class WeatherDataset(DatasetLoaderCSV):
    """
    The dataset contains 21 weather indicators, such as air temperature and humidity, recorded
    every 10 minutes throughout 2020 in Germany.
    Source: [1]_ [2]_

    References
    ----------
    .. [1] https://www.bgc-jena.mpg.de/wetter/
    .. [2] https://arxiv.org/pdf/2205.13504.pdf
    """

    def __init__(self, multivariate: bool = True):
        """
        Parameters
        ----------
        multivariate: bool
            Whether to return a single multivariate timeseries - if False returns a list of
            univariate TimeSeries. Default is True.
        """
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "weather.csv",
                uri=_DEFAULT_PATH + "/weather.csv",
                hash="a2942a05638ba311bc7935bcc087a30f",
                header_time="Date Time",
                format_time="%d.%m.%Y %H:%M:%S",
                freq="10min",
                multivariate=multivariate,
            )
        )

    def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
        """
        Load the Weather dataset as a list of univariate timeseries, one per weather indicator.
        """
        return [TimeSeries.from_series(series[label]) for label in series]


class ElectricityConsumptionZurichDataset(DatasetLoaderCSV):
    """
    Electricity Consumption of households & SMEs (low voltage) and businesses & services
    (medium voltage) in the city of Zurich [1]_, with values recorded every 15 minutes.

    The electricity consumption is combined with weather measurements recorded by three different
    stations in the city of Zurich with an hourly frequency [2]_. The missing time stamps are
    filled with NaN. The original weather data is recorded every hour. Before adding the features
    to the electricity consumption, the data is resampled to a 15-minute frequency, and missing
    values are interpolated. To simplify the dataset, the measurements from the
    Zch_Schimmelstrasse and Zch_Rosengartenstrasse weather stations are discarded to keep only
    the data recorded at the Zch_Stampfenbachstrasse station.

    Both dataset sources are updated continuously, but this dataset only retains values between
    2015-01-01 and 2022-08-31. The time index was converted from CET time zone to UTC.

    Components Descriptions:

    * Value_NE5 : Households & SMEs electricity consumption (low voltage, grid level 7) in kWh
    * Value_NE7 : Business and services electricity consumption (medium voltage, grid level 5) in kWh
    * Hr [%Hr] : Relative humidity
    * RainDur [min] : Duration of precipitation (divided by 4 for conversion from hourly to quarter-hourly records)
    * T [°C] : Temperature
    * WD [°] : Wind direction
    * WVv [m/s] : Wind vector speed
    * p [hPa] : Air pressure
    * WVs [m/s] : Wind scalar speed
    * StrGlo [W/m2] : Global solar irradiation

    Note: before 2018, the scalar speeds were calculated from the 30-minute vector data.

    References
    ----------
    .. [1] https://data.stadt-zuerich.ch/dataset/ewz_stromabgabe_netzebenen_stadt_zuerich
    .. [2] https://data.stadt-zuerich.ch/dataset/ugz_meteodaten_stundenmittelwerte
    """

    def __init__(self):
        def pre_process_dataset(dataset_path):
            """Restrict the time axis and add the weather data"""
            df = pd.read_csv(dataset_path, index_col=0)
            # convert time index
            df.index = pd.DatetimeIndex(
                pd.to_datetime(df.index, utc=True)
            ).tz_localize(None)
            # extract pre-determined period
            df = df.loc[
                (pd.Timestamp("2015-01-01") <= df.index)
                & (df.index <= pd.Timestamp("2022-08-31"))
            ]
            # download and preprocess the weather information
            df_weather = self._download_weather_data()
            # add weather data as additional features
            df = pd.concat([df, df_weather], axis=1)
            # interpolate weather data
            df = df.interpolate()
            # raining duration is given in minutes -> divide by 4 to convert from
            # hourly to quarter-hourly records
            df["RainDur [min]"] = df["RainDur [min]"] / 4
            # round electricity columns to 4 decimals, other columns to 2 decimals
            cols_precise = ["Value_NE5", "Value_NE7"]
            df = df.round(
                decimals={col: (4 if col in cols_precise else 2) for col in df.columns}
            )
            # export the dataset
            df.index.name = "Timestamp"
            df.to_csv(self._get_path_dataset())

        # pandas v2.2.0 introduced a bug that was fixed in v2.2.1; the expected hash
        # for 2.2.0 is "485d81e9902cc0ccb1f86d7e01fb37cd"
        # hash value for dataset with weather data
        super().__init__(
            metadata=DatasetLoaderMetadata(
                "zurich_electricity_consumption.csv",
                uri=(
                    "https://data.stadt-zuerich.ch/dataset/"
                    "ewz_stromabgabe_netzebenen_stadt_zuerich/"
                    "download/ewz_stromabgabe_netzebenen_stadt_zuerich.csv"
                ),
                hash="a019125b7f9c1afeacb0ae60ce7455ef",
                header_time="Timestamp",
                freq="15min",
                pre_process_csv_fn=pre_process_dataset,
            )
        )

    @staticmethod
    def _download_weather_data():
        """Concatenate the yearly csv files into a single dataframe and reshape it"""
        # download the csv files from the url
        base_url = "https://data.stadt-zuerich.ch/dataset/ugz_meteodaten_stundenmittelwerte/download/"
        filenames = [f"ugz_ogd_meteo_h1_{year}.csv" for year in range(2015, 2023)]
        df = pd.concat([pd.read_csv(base_url + fname) for fname in filenames])
        # retain only one weather station
        df = df.loc[df["Standort"] == "Zch_Stampfenbachstrasse"]
        # pivot the df to get all measurements as columns
        df["param_name"] = df["Parameter"] + " [" + df["Einheit"] + "]"
        df = df.pivot(index="Datum", columns="param_name", values="Wert")
        # convert the time index from CET to UTC and extract the required time range
        df.index = pd.DatetimeIndex(pd.to_datetime(df.index, utc=True)).tz_localize(
            None
        )
        df = df.loc[
            (pd.Timestamp("2015-01-01") <= df.index)
            & (df.index <= pd.Timestamp("2022-08-31"))
        ]
        return df
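

# Hedged example (illustrative, not part of the library source): a common split
# of this dataset uses the two consumption components as targets and the
# weather measurements as covariates; the component names below match the
# docstring above.
def _example_zurich_target_and_weather():
    from darts.datasets import ElectricityConsumptionZurichDataset

    series = ElectricityConsumptionZurichDataset().load()
    consumption = series[["Value_NE5", "Value_NE7"]]  # target components
    weather = series.drop_columns(["Value_NE5", "Value_NE7"])  # covariates
    return consumption, weather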