Convert your Dataset
The “Long Format”
The basic format to convert any dataset to our representation is the long format. In the long format, every row is simply a tuple:
(time_series_id, channel_id, timestamp, value, static_var_1, static_var_2, ...)
If your dataset contains rows that are in this format, you are almost good to go. Otherwise, a little bit of preprocessing is needed.
Case 1. (easy) Your dataset is already in the long format
Let’s assume for now your dataset is already in this form. Here is a minimal working example.
[28]:
import pandas as pd
import numpy as np
[29]:
# Build a toy dataset that is already in the long format:
# one row per (time series, channel, timestamp) observation.
df = pd.DataFrame(
    {
        "time_series_id": np.random.choice(["A", "B", "C"], size=100),
        "channel_id": np.random.choice(["X", "Y", "Z"], size=100),
        # Lowercase "h" is the current hourly alias; uppercase "H" is
        # deprecated and emits a FutureWarning on recent pandas.
        "timestamp": pd.date_range("2023-01-01", periods=100, freq="h"),
        "value": np.random.randn(100),
    }
)
# Let's say we have labels: one static label per time series id.
df["labels"] = df["time_series_id"].map({"A": 0, "B": 1, "C": 1})
df.head()
/var/folders/kj/v66zvn217x31k6lx63lt02q40000gn/T/ipykernel_11325/3078918095.py:5: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.
"timestamp": pd.date_range("2023-01-01", periods=100, freq="H"),
[29]:
time_series_id | channel_id | timestamp | value | labels | |
---|---|---|---|---|---|
0 | B | Y | 2023-01-01 00:00:00 | 0.105162 | 1 |
1 | B | Z | 2023-01-01 01:00:00 | -0.573337 | 1 |
2 | B | X | 2023-01-01 02:00:00 | -1.973967 | 1 |
3 | C | Y | 2023-01-01 03:00:00 | 0.656065 | 1 |
4 | A | Y | 2023-01-01 04:00:00 | -0.500246 | 0 |
[30]:
# Let's save this dataframe to a CSV file
df.to_csv("your_original_dataset.csv", index=False)
[31]:
# the csv file can be converted to our format using our interface
from pyrregular.io_utils import read_csv
from pyrregular.reader_interface import ReaderInterface
from pyrregular.accessor import IrregularAccessor
class YourDataset(ReaderInterface):
    """Reader for the long-format file ``your_original_dataset.csv``."""

    @staticmethod
    def read_original_version(verbose=False):
        """Convert the CSV file with ``read_csv`` and return its result.

        ``verbose`` is forwarded unchanged to ``read_csv``.
        """
        # Static variables, grouped by the dimension they depend on.
        static_columns = {
            "ts_id": ["labels"],  # static variable that depends on the time series id
            "signal_id": [],
            "time_id": [],
        }
        return read_csv(
            filenames="your_original_dataset.csv",
            ts_id="time_series_id",
            time_id="timestamp",
            signal_id="channel_id",
            value_id="value",
            dims=static_columns,
            time_index_as_datetime=False,
            verbose=verbose,
        )
[32]:
# Run the conversion with progress output enabled and display the result.
da = YourDataset.read_original_version(verbose=True)
da
[32]:
<xarray.DataArray (ts_id: 3, signal_id: 3, time_id: 100)> Size: 3kB <COO: shape=(3, 3, 100), dtype=float64, nnz=100, fill_value=nan> Coordinates: * time_id (time_id) <U19 8kB '2023-01-01 00:00:00' ... '2023-01-05 03:00... labels (ts_id) int64 24B 0 1 1 * ts_id (ts_id) <U1 12B 'A' 'B' 'C' * signal_id (signal_id) <U1 12B 'X' 'Y' 'Z'
If you don’t know whether a variable is static, or which dimension it depends on, you can check it with infer_static_columns.
[33]:
from pyrregular.data_utils import infer_static_columns
# Ask which columns of df are static with respect to "time_series_id";
# for the toy dataframe above the answer is ['labels'].
infer_static_columns(df, "time_series_id")
[33]:
['labels']
The dataset can be saved with our custom accessor
[34]:
# `irr` is the custom DataArray accessor; presumably it is registered when
# pyrregular.accessor is imported (as done in the reader cell above).
da.irr.to_hdf5("your_dataset.h5")
And then loaded directly with xarray
[35]:
import xarray as xr
[36]:
# engine="pyrregular" selects the backend used to read the saved file.
# Note: load_dataset returns an xarray.Dataset (the array is stored in the
# "data" variable), not a DataArray.
da2 = xr.load_dataset("your_dataset.h5", engine="pyrregular")
da2
/Users/francesco/github/irregular_ts/irregular_ts/accessor.py:9: AccessorRegistrationWarning: registration of accessor <class 'irregular_ts.accessor.IrregularAccessor'> under name 'irr' for type <class 'xarray.core.dataarray.DataArray'> is overriding a preexisting attribute with the same name.
@xr.register_dataarray_accessor("irr")
[36]:
<xarray.Dataset> Size: 11kB Dimensions: (ts_id: 3, signal_id: 3, time_id: 100) Coordinates: labels (ts_id) int32 12B 0 1 1 * signal_id (signal_id) <U1 12B 'X' 'Y' 'Z' * time_id (time_id) <U19 8kB '2023-01-01 00:00:00' ... '2023-01-05 03:00... * ts_id (ts_id) <U1 12B 'A' 'B' 'C' Data variables: data (ts_id, signal_id, time_id) float64 3kB <COO: nnz=100, fill_value=nan>
Case 2. Your dataset is not in the long format
Let’s say you have a 3d numpy array, containing the time series, and a numpy array containing only the labels.
[37]:
import numpy as np
# Synthetic irregular dataset: start from an all-NaN array and fill in only
# the cells selected by a random mask (uniform draw below 0.35).
shape = (10, 2, 100)  # 10 time series, 2 channels, 100 timestamps
data = np.empty(shape)
data.fill(np.nan)
mask = np.random.rand(*shape) < 0.35
data[mask] = np.random.randn(np.count_nonzero(mask))
# One binary label per time series.
labels = np.random.randint(0, 2, size=shape[0])
# Persist both arrays, data first and labels second.
for path, array in {
    "your_more_complex_dataset.npy": data,
    "your_more_complex_dataset_labels.npy": labels,
}.items():
    np.save(path, array)
data.shape, labels.shape
[37]:
((10, 2, 100), (10,))
You only need a function that takes the file names of the data and the labels, and yields the rows of the long format one by one (each row as a dictionary).
[38]:
def read_your_dataset(filenames):
    """Yield the non-missing observations of the NumPy dataset as long-format rows.

    Parameters
    ----------
    filenames : dict
        Mapping with two keys: ``"data"``, the path of the ``.npy`` array
        indexed as ``[time series, channel, timestamp]``, and ``"labels"``,
        the path of the ``.npy`` array indexed by time series.

    Yields
    ------
    dict
        One row per non-NaN value, with the keys ``time_series_id``,
        ``channel_id``, ``timestamp``, ``value`` and ``labels``.
    """
    data = np.load(filenames["data"])
    labels = np.load(filenames["labels"])
    # Take the indices from the loaded array itself instead of a notebook-level
    # ``shape`` variable, so the reader works for any file and outside the
    # notebook. np.argwhere returns the positions of the observed (non-NaN)
    # values in row-major order, i.e. the same order as a nested loop over
    # (time series, channel, timestamp) that skips the NaN cells.
    for ts_id, signal_id, timestamp in np.argwhere(~np.isnan(data)):
        yield dict(
            time_series_id=ts_id,
            channel_id=signal_id,
            timestamp=timestamp,
            value=data[ts_id, signal_id, timestamp],
            labels=labels[ts_id],
        )
[39]:
from pyrregular.io_utils import read_csv
from pyrregular.reader_interface import ReaderInterface
from pyrregular.accessor import IrregularAccessor
class YourDataset(ReaderInterface):
    """Reader for the NumPy dataset, converted row by row via ``read_your_dataset``."""

    @staticmethod
    def read_original_version(verbose=False):
        """Convert the two ``.npy`` files with ``read_csv`` and return its result.

        ``verbose`` is forwarded unchanged to ``read_csv``.
        """
        source_files = {
            "data": "your_more_complex_dataset.npy",
            "labels": "your_more_complex_dataset_labels.npy",
        }
        # Static variables, grouped by the dimension they depend on.
        static_columns = {
            "ts_id": ["labels"],  # static variable that depends on the time series id
            "signal_id": [],
            "time_id": [],
        }
        metadata = {
            "authors": "Bond, James Bond",  # you can add any attribute you want
        }
        return read_csv(
            filenames=source_files,
            ts_id="time_series_id",
            time_id="timestamp",
            signal_id="channel_id",
            value_id="value",
            dims=static_columns,
            reader_fun=read_your_dataset,
            time_index_as_datetime=False,
            verbose=verbose,
            attrs=metadata,
        )
[40]:
# Run the conversion with progress output enabled and display the result.
da = YourDataset.read_original_version(verbose=True)
da
[40]:
<xarray.DataArray (ts_id: 10, signal_id: 2, time_id: 100)> Size: 23kB <COO: shape=(10, 2, 100), dtype=float64, nnz=720, fill_value=nan> Coordinates: * time_id (time_id) int64 800B 0 1 2 3 4 5 6 7 ... 92 93 94 95 96 97 98 99 labels (ts_id) int64 80B 0 0 0 1 1 1 0 1 1 0 * ts_id (ts_id) <U21 840B '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' * signal_id (signal_id) <U21 168B '0' '1' Attributes: authors: Bond, James Bond