Source code for vak.prep.dataset_df_helper
"""Helper functions for working with datasets represented as a pandas.DataFrame"""
from __future__ import annotations
import pathlib
import numpy as np
import pandas as pd
[docs]
def get_dataset_csv_filename(data_dir_name: str, timenow: str) -> str:
    """Build the filename of the csv file that represents a dataset.

    This function is called by :func:`get_dataset_csv_path`,
    which joins the filename to the dataset directory.

    Parameters
    ----------
    data_dir_name : str
        Name of the data directory the dataset was prepared from.
        Used as the "prefix" of the csv filename.
    timenow : str
        Timestamp.
        Used as the "suffix" of the csv filename.

    Returns
    -------
    dataset_csv_filename : str
        String, in the form f"{data_dir_name}_prep_{timenow}.csv"
    """
    # Assemble the extension-less stem first, then tack on the extension.
    stem = f"{data_dir_name}_prep_{timenow}"
    return f"{stem}.csv"
[docs]
def get_dataset_csv_path(
    dataset_path: str | pathlib.Path, data_dir_name: str, timenow: str
) -> pathlib.Path:
    """Return the path that should be used to save
    a pandas DataFrame representing a dataset
    to a csv file.

    Parameters
    ----------
    dataset_path : str, pathlib.Path
        Path to directory that represents dataset.
        A string is converted to a :class:`pathlib.Path`.
    data_dir_name : str
        Name of the data directory the dataset was prepared from.
        This becomes the "prefix" of the csv filename.
    timenow : str
        Timestamp.
        This becomes the "suffix" of the csv filename.

    Returns
    -------
    dataset_csv_path : pathlib.Path
        Path to the csv file, located directly inside ``dataset_path``.
        The filename is the one returned by
        :func:`get_dataset_csv_filename`.
    """
    # The parameter is documented as accepting ``str``, but ``str / str``
    # raises TypeError, so coerce anything that is not already a path object.
    # Existing path objects are passed through untouched.
    if not isinstance(dataset_path, pathlib.PurePath):
        dataset_path = pathlib.Path(dataset_path)
    dataset_csv_filename = get_dataset_csv_filename(data_dir_name, timenow)
    return dataset_path / dataset_csv_filename
[docs]
def add_split_col(df: pd.DataFrame, split: str) -> pd.DataFrame:
    """Add a 'split' column to a pandas DataFrame.

    Assigns an entire dataset to the same split,
    e.g. 'train' or 'predict'.
    Every row in the 'split' column gets the value specified.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe that represents a dataset.
        The column is added to this dataframe in place.
    split : str
        A string that will be assigned to every row
        in the added "split" column.
        One of {'train', 'val', 'test', 'predict'}.

    Returns
    -------
    df : pandas.DataFrame
        The same dataframe object that was passed in,
        now with the "split" column.

    Raises
    ------
    ValueError
        If ``split`` is not one of the valid split names.
    """
    valid_splits = {"train", "val", "test", "predict"}
    if split in valid_splits:
        # One object-dtype entry per row, all holding the same split name.
        n_rows = len(df)
        df["split"] = np.full(n_rows, split, dtype="object")
        return df

    raise ValueError(
        f"value for split should be one of {{'train', 'val', 'test', 'predict'}}, but was '{split}'"
    )