Skip to content

data_module_utils

Utility functions for resolving and validating input files and paths.

InputType = ArrayInput | PathInput module-attribute #

Type of input data passed to the dataset.

convert_paths_to_pathlib(input_data, target_data=None) #

Create a list of file paths from the input and target data.

Parameters:

Name Type Description Default
input_data Sequence[str | Path]

Input data, can be a path to a folder, or a list of paths.

required
target_data Sequence[str | Path] | None

Target data, can be None, a path to a folder, or a list of paths.

None

Returns:

Type Description
list[Path]

A list of file paths for input data.

list[Path] | None

A list of file paths for target data, or None if target_data is None.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def convert_paths_to_pathlib(
    input_data: Sequence[str | Path],
    target_data: Sequence[str | Path] | None = None,
) -> tuple[list[Path], list[Path] | None]:
    """Create a list of file paths from the input and target data.

    Parameters
    ----------
    input_data : Sequence[str | Path]
        Input data, can be a path to a folder, or a list of paths.
    target_data : Sequence[str | Path] | None, default=None
        Target data, can be None, a path to a folder, or a list of paths.

    Returns
    -------
    list[Path]
        A list of file paths for input data.
    list[Path] | None
        A list of file paths for target data, or None if target_data is None.
    """
    input_files = [Path(item) if isinstance(item, str) else item for item in input_data]
    if target_data is None:
        return input_files, None
    else:
        target_files = [
            Path(item) if isinstance(item, str) else item for item in target_data
        ]
        validate_source_target_files(input_files, target_files)
        return input_files, target_files

initialize_data_pair(data_type, input_data, target_data=None, loading=None) #

Initialize a pair of input and target data.

Parameters:

Name Type Description Default
data_type Literal['array', 'tiff', 'zarr', 'czi', 'custom']

The type of data to initialize.

required
input_data InputType

Input data, can be a path to a folder, a list of paths, or a numpy array.

required
target_data InputType | None

Target data, can be None, a path to a folder, a list of paths, or a numpy array.

None
loading ReadFuncLoading | ImageStackLoading | None

The type of loading used for custom data. ReadFuncLoading is the use of a simple function that will load full images into memory. ImageStackLoading is for custom chunked or memory-mapped next-generation file formats enabling single patches to be read from disk at a time. If the data type is not custom loading should be None.

None

Returns:

Type Description
list[ndarray] | list[Path]

Initialized input data. For file paths, returns a list of Path objects. For numpy arrays, returns the arrays directly.

list[ndarray] | list[Path] | None

Initialized target data. For file paths, returns a list of Path objects. For numpy arrays, returns the arrays directly. Returns None if target_data is None.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def initialize_data_pair(
    data_type: Literal["array", "tiff", "zarr", "czi", "custom"] | SD,
    input_data: Any,
    target_data: Any | None = None,
    loading: ReadFuncLoading | ImageStackLoading | None = None,
) -> tuple[Any, Any | None]:
    """
    Initialize a pair of input and target data.

    Parameters
    ----------
    data_type : Literal["array", "tiff", "zarr", "czi", "custom"]
        The type of data to initialize.
    input_data : InputType
        Input data, can be None, a path to a folder, a list of paths, or a numpy
        array.
    target_data : InputType | None
        Target data, can be None, a path to a folder, a list of paths, or a numpy
        array.
    loading : ReadFuncLoading | ImageStackLoading | None, default=None
        The type of loading used for custom data. `ReadFuncLoading` is the use of
        a simple function that will load full images into memory.
        `ImageStackLoading` is for custom chunked or memory-mapped next-generation
        file formats enabling  single patches to be read from disk at a time.
        If the data type is not custom `loading` should be `None`.

    Returns
    -------
    list[numpy.ndarray] | list[pathlib.Path]
        Initialized input data. For file paths, returns a list of Path objects. For
        numpy arrays, returns the arrays directly.
    list[numpy.ndarray] | list[pathlib.Path] | None
        Initialized target data. For file paths, returns a list of Path objects. For
        numpy arrays, returns the arrays directly. Returns None if target_data is None.
    """
    data_type = SD(data_type)
    data = (input_data, target_data)
    match (data_type, loading):
        case (SD.ARRAY, None) if _is_array_data(data):
            input_data, target_data = validate_array_input(data[0], data[1])
        case (SD.TIFF | SD.CZI, None) if _is_path_data(data):
            input_data, target_data = validate_path_input(data_type, data[0], data[1])
        case (SD.ZARR, None) if _is_path_data(data):
            input_data, target_data = validate_zarr_input(data[0], data[1])
        case (SD.CUSTOM, ReadFuncLoading(extension_filter=ext)) if _is_path_data(data):
            input_data, target_data = validate_path_input(
                data_type, data[0], data[1], extension_filter=ext
            )
        case (SD.CUSTOM, ImageStackLoading()):
            input_data, target_data = input_data, target_data
        case _:
            raise ValueError(
                f"Invalid argument combination for data initialization. "
                f"data_type={data_type!s}, input type is {type(input_data)}. "
                "For custom data, you must provide either a `ReadFuncLoading` or "
                "`ImageStackLoading` dataclass as instruction on how to load the data. "
                "If a training target is provided, a validation target must also be "
                "provided, unless automatic validation splitting is being used."
            )

    validate_input_target_type_consistency(input_data, target_data)
    return input_data, target_data

list_files_in_directory(data_type, input_data, target_data=None, extension_filter='') #

List files from input and target directories.

Parameters:

Name Type Description Default
data_type Literal['tiff', 'zarr', 'czi', 'custom']

The type of data to validate.

required
input_data str | Path

Input data, a path to a folder or to a single file.

required
target_data str | Path | None

Target data, can be None, or a path to a folder or to a single file.

None
extension_filter str

File extension filter to apply when listing files.

""

Returns:

Type Description
list[Path]

A list of file paths for input data.

list[Path] | None

A list of file paths for target data, or None if target_data is None.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def list_files_in_directory(
    data_type: Literal["tiff", "zarr", "czi", "custom"] | SD,
    input_data: str | Path,
    target_data: str | Path | None = None,
    extension_filter: str = "",
) -> tuple[list[Path], list[Path] | None]:
    """List the files found at the input and target locations.

    Both locations are listed with `list_files`. When a target is provided, the
    two listings are passed to `validate_source_target_files` before being
    returned.

    Parameters
    ----------
    data_type : Literal["tiff", "zarr", "czi", "custom"]
        The type of data to list.
    input_data : str | Path
        Path to the input data, either a folder or a single file.
    target_data : str | Path | None, default=None
        Path to the target data, either a folder or a single file, or None.
    extension_filter : str, default=""
        File extension filter to apply when listing files.

    Returns
    -------
    list[Path]
        A list of file paths for input data.
    list[Path] | None
        A list of file paths for target data, or None if target_data is None.
    """
    # `list_files` returns a single-element list when given the path of a file
    # with the correct extension
    source_files = list_files(Path(input_data), data_type, extension_filter)
    if target_data is None:
        return source_files, None

    target_files = list_files(Path(target_data), data_type, extension_filter)
    validate_source_target_files(source_files, target_files)
    return source_files, target_files

validate_array_input(input_data, target_data) #

Validate if the input data is a numpy array.

Parameters:

Name Type Description Default
input_data ArrayInput

Input data, can be a list of or a single numpy array.

required
target_data ArrayInput | None

Target data, can be a list of numpy arrays, a single numpy array, or None.

required

Returns:

Type Description
list[ndarray]

Validated input data.

list[ndarray] | None

Validated target data, None if the target data is None.

Raises:

Type Description
ValueError

If the input data is not a numpy array or a list of numpy arrays.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def validate_array_input(
    input_data: ArrayInput,
    target_data: ArrayInput | None,
) -> tuple[list[NDArray[Any]], list[NDArray[Any]] | None]:
    """Validate array inputs and return them as lists.

    A single numpy array is wrapped in a one-element list, any other iterable is
    copied into a new list. The target must be given in the same form as the
    input: a single array for a single array, a sequence for a sequence. The
    elements of a sequence are not inspected.

    Parameters
    ----------
    input_data : ArrayInput
        Input data, can be a list of numpy arrays or a single numpy array.
    target_data : ArrayInput | None
        Target data, can be a list of numpy arrays, a single numpy array, or None.

    Returns
    -------
    list[numpy.ndarray]
        Validated input data.
    list[numpy.ndarray] | None
        Validated target data, None if the target data is None.

    Raises
    ------
    ValueError
        If the input data is neither a numpy array nor an iterable of arrays (e.g.
        a string or a number), or if the target data is not given in the same form
        as the input data.
    """
    if isinstance(input_data, ndarray):
        if target_data is not None and not isinstance(target_data, ndarray):
            raise ValueError(
                f"Wrong target type. Expected numpy.ndarray, got {type(target_data)}. "
                f"Check the data_type parameter or your inputs."
            )
        target_list = [target_data] if target_data is not None else None
        return [input_data], target_list

    wrong_input_msg = (
        "Wrong input type. Expected numpy.ndarray or a sequence of numpy.ndarray, "
        f"got {type(input_data)}. Check the data_type parameter or your inputs."
    )
    # strings are iterable, but splitting a path into characters is never intended
    if isinstance(input_data, (str, bytes)):
        raise ValueError(wrong_input_msg)
    try:
        input_list = list(input_data)
    except TypeError as err:
        # non-iterable input, reported with the documented exception type
        raise ValueError(wrong_input_msg) from err

    if target_data is not None and (
        isinstance(target_data, (str, bytes)) or not isinstance(target_data, Sequence)
    ):
        raise ValueError(
            "Wrong target type. Expected a sequence of numpy.ndarray, got "
            f"{type(target_data)}. Check the data_type "
            "parameter or your inputs."
        )
    target_list = list(target_data) if target_data is not None else None

    return input_list, target_list

validate_input_target_type_consistency(input_data, target_data) #

Validate if the input and target data types are consistent.

Parameters:

Name Type Description Default
input_data InputType

Input data, can be a path to a folder, a list of paths, or a numpy array.

required
target_data InputType | None

Target data, can be None, a path to a folder, a list of paths, or a numpy array.

required

Raises:

Type Description
ValueError

If the input and target data types are not consistent.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def validate_input_target_type_consistency(
    input_data: InputType,
    target_data: InputType | None,
) -> None:
    """Validate if the input and target data types are consistent.

    Nothing is checked when there is no target. Otherwise the input must be an
    instance of the target's type. For two sequences, the lengths must match and
    the first input element must be an instance of the type of the first target
    element. Strings are treated as single paths, not as sequences of characters,
    and empty sequences of equal length are considered consistent.

    Parameters
    ----------
    input_data : InputType
        Input data, can be a path to a folder, a list of paths, or a numpy array.
    target_data : InputType | None
        Target data, can be None, a path to a folder, a list of paths, or a numpy
        array.

    Raises
    ------
    ValueError
        If the input and target data types are not consistent.
    """
    if target_data is None:
        return

    if not isinstance(input_data, type(target_data)):
        raise ValueError(
            f"Inputs for input and target must be of the same type or None. "
            f"Got {type(input_data)} and {type(target_data)}."
        )

    # `str` is a `Sequence`, but two paths given as strings must not be compared
    # character by character
    if isinstance(input_data, (str, bytes)) or isinstance(target_data, (str, bytes)):
        return
    if not (isinstance(input_data, Sequence) and isinstance(target_data, Sequence)):
        return

    if len(input_data) != len(target_data):
        raise ValueError(
            f"Inputs and targets must have the same length. "
            f"Got {len(input_data)} and {len(target_data)}."
        )

    # empty sequences have no first element to compare
    if len(input_data) == 0:
        return

    if not isinstance(input_data[0], type(target_data[0])):
        raise ValueError(
            f"Inputs and targets must have the same type. "
            f"Got {type(input_data[0])} and {type(target_data[0])}."
        )

validate_path_input(data_type, input_data, target_data, extension_filter='') #

Validate if the input data is a path or a list of paths.

Parameters:

Name Type Description Default
data_type Literal['tiff', 'zarr', 'czi', 'custom']

The type of data to validate.

required
input_data PathInput

Input data, can be a path to a folder, a list of paths.

required
target_data PathInput | None

Target data, can be None, a path to a folder, a list of paths.

required
extension_filter str

File extension filter to apply when listing files.

""

Returns:

Type Description
list[Path]

A list of file paths for input data.

list[Path] | None

A list of file paths for target data, or None if target_data is None.

Raises:

Type Description
ValueError

If the input data is not a path or a list of paths.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def validate_path_input(
    data_type: Literal["tiff", "zarr", "czi", "custom"] | SD,
    input_data: PathInput,
    target_data: PathInput | None,
    extension_filter: str = "",
) -> tuple[list[Path], list[Path] | None]:
    """Validate if the input data is a path or a list of paths.

    A single path is listed with `list_files_in_directory`. A list of paths is
    reduced to the entries that exist on disk and then passed to
    `convert_paths_to_pathlib`. The target must be given in the same form as the
    input (single path or list).

    Parameters
    ----------
    data_type : Literal["tiff", "zarr", "czi", "custom"]
        The type of data to validate.
    input_data : PathInput
        Input data, can be a path to a folder, a list of paths.
    target_data : PathInput | None
        Target data, can be None, a path to a folder, a list of paths.
    extension_filter : str, default=""
        File extension filter to apply when listing files.

    Returns
    -------
    list[Path]
        A list of file paths for input data.
    list[Path] | None
        A list of file paths for target data, or None if target_data is None.

    Raises
    ------
    ValueError
        If the input data is not a path or a list of paths, or if the target data
        is not given in the same form as the input data.
    """
    if isinstance(input_data, (str, Path)):
        if target_data is not None and not isinstance(target_data, (str, Path)):
            raise ValueError(
                f"Wrong target type, expected str or Path to match the input, got "
                f"{type(target_data)}. Check the data_type parameter or your inputs."
            )
        return list_files_in_directory(
            data_type, input_data, target_data, extension_filter
        )

    if isinstance(input_data, list):
        # TODO warn if paths do not exist
        input_list = [path for path in map(Path, input_data) if path.exists()]

        target_list = None
        if target_data is not None:
            # explicit check rather than `assert`, which is removed under `-O` and
            # would let a string target be iterated character by character
            if not isinstance(target_data, list):
                raise ValueError(
                    f"Wrong target type, expected list[str | Path] to match the "
                    f"input, got {type(target_data)}. Check the data_type parameter "
                    f"or your inputs."
                )
            # consistency with input is enforced by convert_paths_to_pathlib
            target_list = [path for path in map(Path, target_data) if path.exists()]

        return convert_paths_to_pathlib(input_list, target_list)

    raise ValueError(
        f"Wrong input type, expected str or Path or list[str | Path], got "
        f"{type(input_data)}. Check the data_type parameter or your inputs."
    )

validate_zarr_input(input_data, target_data) #

Validate if the input data corresponds to a zarr input.

Parameters:

Name Type Description Default
input_data PathInput

Input data, can be a path to a folder, a path to a zarr file, a URI pointing to a zarr dataset, or a list of such paths or URIs.

required
target_data PathInput | None

Target data, can be None.

required

Returns:

Type Description
list[str] or list[Path]

A list of zarr URIs or paths for input data.

list[str] or list[Path] | None

A list of zarr URIs or paths for target data, or None if target_data is None.

Raises:

Type Description
ValueError

If the input and target data types are not consistent.

ValueError

If the input data is not a zarr URI or path, or a list of zarr URIs or paths.

Source code in src/careamics/lightning/dataset_ng/data_module_utils.py
def validate_zarr_input(
    input_data: PathInput,
    target_data: PathInput | None,
) -> tuple[list[str] | list[Path], list[str] | list[Path] | None]:
    """Validate if the input data corresponds to a zarr input.

    Inputs that exist on disk (a single path, or a sequence whose first entry
    exists) are delegated to `validate_path_input`. Otherwise the input is treated
    as zarr URI(s): a single valid URI is wrapped in a list, and sequences are
    reduced to the entries accepted by `is_valid_uri`.

    Parameters
    ----------
    input_data : PathInput
        Input data, can be a path to a folder, a path to a zarr file, a URI pointing
        to a zarr dataset, or a list of such paths or URIs.
    target_data : PathInput | None
        Target data, in the same form as the input data, or None.

    Returns
    -------
    list[str] or list[Path]
        A list of zarr URIs or paths for input data.
    list[str] or list[Path] | None
        A list of zarr URIs or paths for target data, or None if target_data is None.

    Raises
    ------
    ValueError
        If the input and target data types are not consistent.
    ValueError
        If the input data is not a zarr URI or path, or a non-empty list of zarr
        URIs or paths.
    """
    # input/target consistency is checked explicitly below (with `ValueError`
    # rather than `assert`, which is removed under `-O`)
    if isinstance(input_data, (str, Path)):
        if Path(input_data).exists():
            # either a path to a folder or a zarr file
            # path to a folder will trigger collection of all zarr files in that folder
            if target_data is not None and not isinstance(target_data, (str, Path)):
                raise ValueError(
                    f"Wrong target type for zarr data. Expected str or Path to match "
                    f"the input, got {type(target_data)}."
                )
            if target_data is not None and not Path(target_data).exists():
                raise ValueError(
                    f"Target provided as path, but does not exist: {target_data}."
                )

            return validate_path_input("zarr", input_data, target_data)
        elif isinstance(input_data, str) and is_valid_uri(input_data):
            if target_data is not None and not isinstance(target_data, str):
                raise ValueError(
                    f"Wrong target type for zarr data. Expected a zarr URI as str to "
                    f"match the input, got {type(target_data)}."
                )
            if target_data is not None and not is_valid_uri(target_data):
                raise ValueError(
                    f"Wrong target type for zarr data. Expected a zarr URI, got "
                    f"{type(target_data)}."
                )
            target_list = [target_data] if target_data is not None else None
            return [input_data], target_list
        else:
            raise ValueError(
                f"Wrong input type for zarr data. Expected a file URI or a path to a "
                f" file, got {input_data}. Path may not exist."
            )

    # input is a sequence of Path | str
    if len(input_data) == 0:
        # there is no first entry to decide between paths and URIs
        raise ValueError(
            "Wrong input for zarr data. Expected a non-empty list of zarr URIs or "
            "paths, got an empty sequence."
        )

    # the first entry decides whether the sequence holds paths or URIs
    if Path(input_data[0]).exists():
        return validate_path_input("zarr", input_data, target_data)

    final_input_list = [str(item) for item in input_data if is_valid_uri(item)]
    if target_data is None:
        return final_input_list, None

    if not isinstance(target_data, list):
        raise ValueError(
            f"Wrong target type for zarr data. Expected a list of zarr URIs to match "
            f"the input, got {type(target_data)}."
        )
    final_target_list = [str(item) for item in target_data if is_valid_uri(item)]
    return final_input_list, final_target_list