Convert your Dataset

The “Long Format”

The basic format to convert any dataset to our representation is the long format. The long format is simply a tuple:

(time_series_id, channel_id, timestamp, value, static_var_1, static_var_2, ...).

If your dataset already contains rows in this format, you are almost good to go. Otherwise, a little preprocessing is needed.

Case 1. (easy) Your dataset is already in the long format

Let’s assume for now your dataset is already in this form. Here is a minimal working example.

[28]:
import pandas as pd
import numpy as np
[29]:
# Toy dataset that is already in the long format: one row per
# (time_series_id, channel_id, timestamp, value) observation.
df = pd.DataFrame(
    {
        "time_series_id": np.random.choice(["A", "B", "C"], size=100),
        "channel_id": np.random.choice(["X", "Y", "Z"], size=100),
        # Lowercase "h" is the hourly alias; uppercase "H" is deprecated in
        # recent pandas releases and triggers a FutureWarning.
        "timestamp": pd.date_range("2023-01-01", periods=100, freq="h"),
        "value": np.random.randn(100),
    }
)
df["labels"] = df["time_series_id"].map(
    {"A": 0, "B": 1, "C": 1}
)  # let's say we have labels
df.head()
/var/folders/kj/v66zvn217x31k6lx63lt02q40000gn/T/ipykernel_11325/3078918095.py:5: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.
  "timestamp": pd.date_range("2023-01-01", periods=100, freq="H"),
[29]:
time_series_id channel_id timestamp value labels
0 B Y 2023-01-01 00:00:00 0.105162 1
1 B Z 2023-01-01 01:00:00 -0.573337 1
2 B X 2023-01-01 02:00:00 -1.973967 1
3 C Y 2023-01-01 03:00:00 0.656065 1
4 A Y 2023-01-01 04:00:00 -0.500246 0
[30]:
# Let's save this dataframe to a CSV file
df.to_csv("your_original_dataset.csv", index=False)
[31]:
# the csv file can be converted to our format using our interface

from pyrregular.io_utils import read_csv
from pyrregular.reader_interface import ReaderInterface
from pyrregular.accessor import IrregularAccessor


class YourDataset(ReaderInterface):
    """Reader that loads ``your_original_dataset.csv`` through ``read_csv``."""

    @staticmethod
    def read_original_version(verbose=False):
        """Return whatever ``read_csv`` builds from the long-format CSV file.

        ``verbose`` is forwarded unchanged to ``read_csv``.
        """
        # Static variables, grouped by the dimension they depend on.
        static_columns = {
            "ts_id": ["labels"],  # static variable that depends on the time series id
            "signal_id": [],
            "time_id": [],
        }
        return read_csv(
            filenames="your_original_dataset.csv",
            # Names of the CSV columns that play each long-format role.
            ts_id="time_series_id",
            time_id="timestamp",
            signal_id="channel_id",
            value_id="value",
            dims=static_columns,
            time_index_as_datetime=False,
            verbose=verbose,
        )
[32]:
da = YourDataset.read_original_version(True)
da
[32]:
<xarray.DataArray (ts_id: 3, signal_id: 3, time_id: 100)> Size: 3kB
<COO: shape=(3, 3, 100), dtype=float64, nnz=100, fill_value=nan>
Coordinates:
  * time_id    (time_id) <U19 8kB '2023-01-01 00:00:00' ... '2023-01-05 03:00...
    labels     (ts_id) int64 24B 0 1 1
  * ts_id      (ts_id) <U1 12B 'A' 'B' 'C'
  * signal_id  (signal_id) <U1 12B 'X' 'Y' 'Z'

If you don’t know whether a variable is static, or which dimension it depends on, you can infer it from the dataframe.

[33]:
from pyrregular.data_utils import infer_static_columns

infer_static_columns(df, "time_series_id")
[33]:
['labels']

The dataset can be saved with our custom accessor

[34]:
da.irr.to_hdf5("your_dataset.h5")

And then loaded directly with xarray

[35]:
import xarray as xr
[36]:
da2 = xr.load_dataset("your_dataset.h5", engine="pyrregular")
da2
/Users/francesco/github/irregular_ts/irregular_ts/accessor.py:9: AccessorRegistrationWarning: registration of accessor <class 'irregular_ts.accessor.IrregularAccessor'> under name 'irr' for type <class 'xarray.core.dataarray.DataArray'> is overriding a preexisting attribute with the same name.
  @xr.register_dataarray_accessor("irr")
[36]:
<xarray.Dataset> Size: 11kB
Dimensions:    (ts_id: 3, signal_id: 3, time_id: 100)
Coordinates:
    labels     (ts_id) int32 12B 0 1 1
  * signal_id  (signal_id) <U1 12B 'X' 'Y' 'Z'
  * time_id    (time_id) <U19 8kB '2023-01-01 00:00:00' ... '2023-01-05 03:00...
  * ts_id      (ts_id) <U1 12B 'A' 'B' 'C'
Data variables:
    data       (ts_id, signal_id, time_id) float64 3kB <COO: nnz=100, fill_value=nan>

Case 2. Your dataset is not in the long format

Let’s say you have a 3d numpy array, containing the time series, and a numpy array containing only the labels.

[37]:
import numpy as np

# Synthetic irregular dataset: 10 time series, 2 channels, 100 timestamps.
shape = (10, 2, 100)

# Entries where the mask is True are observed; every other entry stays NaN.
mask = np.random.rand(*shape) < 0.35
data = np.full(shape, np.nan)
data[mask] = np.random.randn(np.count_nonzero(mask))

# One label (0 or 1) per time series.
labels = np.random.randint(0, 2, size=shape[0])

# Persist both arrays so they can be read back by a custom reader function.
np.save("your_more_complex_dataset.npy", data)
np.save("your_more_complex_dataset_labels.npy", labels)

data.shape, labels.shape
[37]:
((10, 2, 100), (10,))

You only need a function that loads the data and the labels and yields the rows of the long format one at a time, each as a dictionary.

[38]:
def read_your_dataset(filenames):
    """Yield the observed entries of a saved 3d array as long-format rows.

    Parameters
    ----------
    filenames : dict
        Mapping with two entries: ``"data"``, the path of a ``.npy`` file
        holding a 3d array indexed as (time series, channel, timestamp), and
        ``"labels"``, the path of a ``.npy`` file holding one label per time
        series.

    Yields
    ------
    dict
        One row per non-NaN entry, with the keys ``time_series_id``,
        ``channel_id``, ``timestamp``, ``value`` and ``labels``.
    """
    data = np.load(filenames["data"])
    labels = np.load(filenames["labels"])

    # Take the indices from the array that was actually loaded rather than
    # from a notebook-level ``shape`` variable, so the reader works for any
    # file. ``argwhere`` returns the non-NaN positions in row-major order,
    # i.e. the same order as a full scan that skips the NaN entries.
    observed = np.argwhere(~np.isnan(data))

    for ts_id, signal_id, timestamp in observed:
        yield dict(
            time_series_id=ts_id,
            channel_id=signal_id,
            timestamp=timestamp,
            value=data[ts_id, signal_id, timestamp],
            labels=labels[ts_id],
        )
[39]:
from pyrregular.io_utils import read_csv
from pyrregular.reader_interface import ReaderInterface
from pyrregular.accessor import IrregularAccessor

class YourDataset(ReaderInterface):
    """Reader that feeds the saved ``.npy`` files to ``read_csv`` via ``read_your_dataset``."""

    @staticmethod
    def read_original_version(verbose=False):
        """Return whatever ``read_csv`` builds from the rows of ``read_your_dataset``.

        ``verbose`` is forwarded unchanged to ``read_csv``.
        """
        # File names handed to ``read_csv`` together with the reader function.
        source_files = {
            "data": "your_more_complex_dataset.npy",
            "labels": "your_more_complex_dataset_labels.npy",
        }
        # Static variables, grouped by the dimension they depend on.
        static_columns = {
            "ts_id": ["labels"],  # static variable that depends on the time series id
            "signal_id": [],
            "time_id": [],
        }
        metadata = {
            "authors": "Bond, James Bond",  # you can add any attribute you want
        }
        return read_csv(
            filenames=source_files,
            # Names of the row fields that play each long-format role.
            ts_id="time_series_id",
            time_id="timestamp",
            signal_id="channel_id",
            value_id="value",
            dims=static_columns,
            reader_fun=read_your_dataset,
            time_index_as_datetime=False,
            verbose=verbose,
            attrs=metadata,
        )
[40]:
da = YourDataset.read_original_version(True)
da
[40]:
<xarray.DataArray (ts_id: 10, signal_id: 2, time_id: 100)> Size: 23kB
<COO: shape=(10, 2, 100), dtype=float64, nnz=720, fill_value=nan>
Coordinates:
  * time_id    (time_id) int64 800B 0 1 2 3 4 5 6 7 ... 92 93 94 95 96 97 98 99
    labels     (ts_id) int64 80B 0 0 0 1 1 1 0 1 1 0
  * ts_id      (ts_id) <U21 840B '0' '1' '2' '3' '4' '5' '6' '7' '8' '9'
  * signal_id  (signal_id) <U21 168B '0' '1'
Attributes:
    authors:  Bond, James Bond