Module gatenlp.corpora.dirs

Module that defines Corpus and DocumentSource/DocumentDestination classes which access documents as files in a directory.

Expand source code
"""
Module that defines Corpus and DocumentSource/DocumentDestination classes which access documents
as files in a directory.
"""

import os
from typing import Union, Callable, Iterable, Optional
from pathlib import Path
from urllib.parse import ParseResult
from gatenlp.urlfileutils import yield_lines_from
from gatenlp.document import Document
from gatenlp.corpora.base import DocumentSource, DocumentDestination, Corpus
from gatenlp.corpora.base import MultiProcessingAble
from gatenlp.corpora.base import EveryNthBase


def minstem(path):
    """
    Return the base name of a path with all of its extensions removed.

    The last extension is dropped first; what remains is then cut at its first dot, unless that
    dot is the very first character (so a name such as ".hidden" is kept unchanged).

    Args:
        path: a file path (str or Path)

    Returns:
        the base name without any extension (str)
    """
    base = Path(path).stem
    head, dot, _ = base.partition(".")
    # Only shorten when a dot was found and there is something in front of it.
    return head if dot and head else base


def matching_paths(
        dirpath: str,
        exts: Optional[Union[Iterable, str]] = None,
        recursive: bool = True,
        relative: bool = True):
    """
    Yields all file paths from dirpath which match the list of extensions
    and whose file name does not start with a dot.

    Args:
        dirpath: the directory to traverse
        exts: a single extension or an iterable of allowed extensions (including the dot). If None
            or empty, all files in the directory not starting with a dot are included
        recursive: if True (default) include all matching paths from all subdirectories as well, otherwise
          only paths from the top directory.
        relative: if True (default), the paths are relative to the directory path, otherwise
          they are dirpath joined with the relative path

    Yields:
        one path (str) per matching file, in the order reported by os.walk / os.listdir
    """
    # Materialize the extensions once: a one-shot iterator would otherwise be exhausted after the
    # first file name has been checked and every later file would silently be skipped.
    if exts is None:
        suffixes = ()
    elif isinstance(exts, str):
        suffixes = (exts,)
    else:
        suffixes = tuple(exts)

    def wanted(fname):
        # hidden files are never included, no matter which extensions are allowed
        if fname.startswith("."):
            return False
        return not suffixes or fname.endswith(suffixes)

    if recursive:
        candidates = (
            os.path.join(root, fname)
            for root, _, filenames in os.walk(dirpath)
            for fname in filenames
            if wanted(fname)
        )
    else:
        # os.listdir also reports directories, so keep regular files only
        candidates = (
            os.path.join(dirpath, fname)
            for fname in os.listdir(dirpath)
            if wanted(fname) and os.path.isfile(os.path.join(dirpath, fname))
        )
    for full in candidates:
        yield os.path.relpath(full, dirpath) if relative else full


def maker_file_path_fromidx(digits=1, levels=1):
    """
    Build a function that turns a document index into a relative, slash-separated file path.

    The index is left-padded with zeros to at least `digits` characters. The padded string is then
    split into `levels` path components: every component except the first gets digits // levels
    characters (taken from the right end), the first component gets whatever remains.

    Args:
        digits: minimum number of digits to use for the path, any number with fewer digits gets
           leading zeros added.
        levels: number of path components to split the digits into. For example, with digits=10
           and levels=3 the index 23 is converted to 0000/000/023

    Returns:
        a function that takes the keyword arguments idx and doc and returns a relative path name (str);
        that function raises an Exception if idx is not an integer >= 0

    Raises:
        Exception: if digits or levels is not an integer larger than 0 or if digits < levels
    """
    params_ok = (
        isinstance(digits, int)
        and isinstance(levels, int)
        and digits >= 1
        and levels >= 1
        and digits >= levels
    )
    if not params_ok:
        raise Exception(
            "digits and levels must be integers larger than 0 and digits must not be smaller than "
            f"levels, got {digits}/{levels}"
        )
    # number of characters in every path component except the leading one
    width = digits // levels

    def file_path_fromidx(doc=None, idx=None):
        # NOTE: doc is not used here, it is accepted so all path makers can be called the same way
        if not isinstance(idx, int) or idx < 0:
            raise Exception("Index must be an integer >= 0")
        padded = str(idx).zfill(digits)
        parts = []
        end = len(padded)
        # cut fixed-width components off the right end ...
        for _ in range(levels - 1):
            parts.append(padded[end - width:end])
            end -= width
        # ... and put everything that is left into the leading component
        parts.append(padded[:end])
        return "/".join(reversed(parts))

    return file_path_fromidx


# TODO: set the special features for the relative path, index number, document id?
class DirFilesSource(DocumentSource, EveryNthBase, MultiProcessingAble):
    """
    A document source which iterates over documents represented as files in a directory.
    """
    def __init__(
        self,
        dirpath: str,
        paths: Optional[Iterable[str]] = None,
        paths_from: Union[str, Path, ParseResult] = None,
        exts: Optional[Iterable[str]] = None,
        fmt: Optional[str] = None,
        recursive: bool = True,
        sort: Union[bool, Callable] = False,
        sort_reverse: bool = False,
        docname_from: Optional[str] = None,
        nparts: int = 1,
        partnr: int = 0,
    ):
        """
        Create a DirFilesSource.

        Args:
            dirpath: the directory that contains the files to load as documents.
            paths:  if not None, must be an iterable of relative file paths to load from the directory.
                The iterable is copied, so the object passed in is never modified.
            paths_from: if not None, must be a file or URL to load a list of relative file paths from
            exts: an iterable of allowed file extensions (only used if neither paths nor paths_from
                is given)
            fmt: the format to use for loading files. This is only useful if all files have the same format
               but the file extensions does not indicate the format.
            recursive: recursively include paths from all subdirectories as well
            sort: a boolean to indicate that paths should get processed in sort order, or a callable that
                will be used to extract the sort key.
                The paths always get sorted if nparts is > 1.
            sort_reverse: if paths should get sorted in reverse order
            docname_from: If not None set the document name from "basename", "stem" (basename without last extension)
                "minstem" (basename with all extensions removed) "relpath",
                "index" (sequence number of document within this part).
            nparts: only yield every nparts-th document (default 1: every document)
            partnr: start with that index, before yielding every nparts-th document (default 0: start at beginning)

        Raises:
            Exception: if both paths and paths_from are specified
        """
        self.dirpath = dirpath
        if paths is not None and paths_from is not None:
            raise Exception("Parameters paths and paths_from cannot be both specified")
        super().__init__()
        EveryNthBase.__init__(self, nparts=nparts, partnr=partnr)
        if docname_from is not None:
            assert docname_from in ["basename", "relpath", "index", "stem", "minstem"]
        self.docname_from = docname_from
        if paths is not None:
            # Copy into a list of our own: sorting needs a list, the caller's object must not be
            # reordered behind their back, and a generator could only be iterated once.
            self.paths = list(paths)
        elif paths_from is not None:
            self.paths = [pth.rstrip("\n\r") for pth in yield_lines_from(paths_from)]
        else:
            self.paths = list(matching_paths(dirpath, exts=exts, recursive=recursive))
        if sort or nparts > 1:
            # splitting into parts needs a stable order, so sort whenever nparts > 1
            self.paths.sort(key=sort if callable(sort) else None, reverse=sort_reverse)
        if nparts > 1:
            self.paths = [
                p
                for idx, p in enumerate(self.paths)
                if ((idx - partnr) % nparts) == 0
            ]
        self.fmt = fmt

    def __iter__(self):
        """
        Yield the documents of the source, one per path, in the order of self.paths.

        Each document gets the relative path feature set and, if docname_from was specified,
        its name set accordingly.
        """
        self._n = 0
        for p in self.paths:
            fullpath = os.path.join(self.dirpath, p)
            doc = Document.load(fullpath, fmt=self.fmt)
            self.setrelpathfeature(doc, p)
            if self.docname_from:
                if self.docname_from == "basename":
                    docname = os.path.basename(fullpath)
                elif self.docname_from == "stem":
                    docname = Path(fullpath).stem
                elif self.docname_from == "index":
                    docname = str(self._n)
                elif self.docname_from == "relpath":
                    docname = p
                elif self.docname_from == "minstem":
                    docname = minstem(fullpath)
                doc.name = docname
            self._n += 1
            yield doc


class DirFilesDestination(DocumentDestination):
    """
    A destination where each document is stored in a file in a directory or directory tree in some
    known serialization format. The filename or path of the file can be derived from a document feature,
    the document name, the running number of file added, or any function that can derive a file path
    from the document and the running number.
    """

    def __init__(self, dirpath, path_from: Union[str, Callable] = "default", ext: str = "bdocjs", fmt=None):
        """
        Create a destination to store documents in files inside a directory or directory tree.

        Args:
            dirpath: the directory to contain the files, must exist
            path_from: one of options listed below. If a string is used as a path name, then the forward slash
                 is always used as the directory path separator, on all systems!

               * "default" (default) a heuristic which uses "relpath" if it is available, otherwise uses the
                    index with at least 5 digits.
               * "relpath": use the relative path used when creating the document, but
                   replace the extension
               * "idx": just use the index/running number of the added document as the base name
               * "idx:5": use the index/running number with at least 5 digits in the name.
               * "idx:10:2": use the index and organize a total of 10 digits into a hierarchical
                   pathname of 2 levels, so 10:2 would mean the first 5 digits are for the name of the subdirectory
                   and the second 5 digits are for the file base name. 10:3 would have three levels, the first
                   subdirectory level with 4 digits, the second subdirectory level with 3 digits and the
                   remaining 3 digits for the filename.
                   NOTE: "idx" by itself is equivalent to idx:1:1
                * "feature:fname": use the document feature with the feature name fname as a relative path as is
                   but add the extension
                * "docname": use the document name as the relative path, but add extension.
                * "minstem": use the base name of the relative path with all extensions replaced by the
                   new extension
                * somefunction: a function that should return the pathname (without extension) and should take two
                   keyword arguments: doc (the document) and idx (the running index of the document).

            ext: the file extension to add to all generated file names
            fmt: the format to use for serializing the document, if None, will try to determine from the extension.

        Raises:
            Exception: if dirpath is not a directory or path_from is not one of the allowed values
        """
        super().__init__()
        if not os.path.isdir(dirpath):
            raise Exception(f"Not a directory: {dirpath}")
        self.dirpath = dirpath
        self.idx = 0

        def pathmaker_default(doc=None, idx=None):
            relpath = doc.features.get(self.relpathfeatname())
            if relpath:
                return os.path.splitext(relpath)[0]
            return f"{idx:05d}"

        # A callable has no string methods, so it has to be recognized before any of the
        # string-based options is tried.
        if callable(path_from):
            self.file_path_maker = path_from
        elif not isinstance(path_from, str):
            raise Exception(f"Not allowed for path_from: {path_from}")
        elif path_from.startswith("idx"):
            rest = path_from[3:]  # if we have digits or levels, there is a leading colon!
            if len(rest) == 0:
                digits = 1
                levels = 1
            else:
                parms = rest.split(":")
                parms.append("1")  # default for levels if only the digits are given
                digits, levels = parms[1:3]
                digits = int(digits)
                levels = int(levels)
            self.file_path_maker = maker_file_path_fromidx(digits, levels)
        elif path_from.startswith("feature"):
            # split at the first colon only, so the feature name itself may contain colons
            _, fname = path_from.split(":", 1)
            self.file_path_maker = lambda doc=None, idx=None: doc.features[fname]
        elif path_from == "default":
            self.file_path_maker = pathmaker_default
        elif path_from == "relpath":
            self.file_path_maker = \
                lambda doc=None, idx=None: os.path.splitext(doc.features[self.relpathfeatname()])[0]
        elif path_from == "docname":
            self.file_path_maker = lambda doc=None, idx=None: doc.name
        elif path_from == "minstem":
            self.file_path_maker = lambda doc=None, idx=None: minstem(doc.features[self.relpathfeatname()])
        else:
            raise Exception(f"Not allowed for path_from: {path_from}")
        if not ext.startswith("."):
            ext = "." + ext
        self.ext = ext
        self.fmt = fmt

    def append(self, doc):
        """
        Add a document to the destination: the document is saved to the file path derived from
        path_from, below the destination directory. Missing directories are created.

        Args:
            doc: the document or None, if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        path = self.file_path_maker(doc=doc, idx=self.idx)
        # convert forward slashes to backslashes on windows
        path = os.path.normpath(path)
        path = os.path.join(self.dirpath, path) + self.ext
        # make sure the directory part of the path exists before saving
        dirs = os.path.dirname(path)
        if dirs:
            os.makedirs(dirs, exist_ok=True)
        Document.save(doc, path, fmt=self.fmt)
        self.idx += 1
        self._n += 1

    def close(self):
        """
        Nothing to do: every document is written to its file when it gets appended.
        """
        pass


class DirFilesCorpus(Corpus, MultiProcessingAble):
    """
    A corpus over all files below a directory which have a given file extension. Each element of
    the corpus is the document stored in the corresponding file.
    """

    def __init__(self,
                 dirpath: str,
                 ext: str = "bdocjs",
                 fmt: Optional[str] = None,
                 recursive: bool = True,
                 sort: Union[bool, Callable] = False,
                 sort_reverse: bool = False,
                 nparts: int = 1,
                 partnr: int = 0
                 ):
        """
        Creates the DirFilesCorpus.

        Args:
            dirpath: the directory path, must be an existing directory
            ext: the file extension all files of the corpus must have, "bdocjs" if empty or None.
                A leading dot is added if missing.
            fmt: the format to use, if None, will be determined from the extension
            recursive: if True (default) all matching files from all subdirectories are included
            sort: if True, sort by file paths, if a function sort by that function (default: False).
                The paths are always sorted if nparts is larger than 1.
            sort_reverse: if the paths get sorted and this is True, sort in reverse order
            nparts: only include every nparts-th path (default 1: every path)
            partnr: start with that index, before including every nparts-th path (default 0: start at beginning)

        Raises:
            Exception: if dirpath does not exist or is not a directory
        """
        suffix = ext if ext else "bdocjs"
        if not suffix.startswith("."):
            suffix = "." + suffix
        self.dirpath = dirpath
        self.ext = suffix
        self.fmt = fmt
        if not os.path.exists(dirpath):
            raise Exception(f"Directory {dirpath} does not exist")
        if not os.path.isdir(dirpath):
            raise Exception(f"Not a directory: {dirpath}")
        found = list(matching_paths(dirpath, exts=[suffix], recursive=recursive))
        if sort or nparts > 1:
            # key=None gives the plain ordering by path
            found.sort(key=sort if callable(sort) else None, reverse=sort_reverse)
        if nparts > 1:
            found = [
                relpath
                for pos, relpath in enumerate(found)
                if (pos - partnr) % nparts == 0
            ]
        self.paths = found
        self.size = len(found)

    def __len__(self):
        """
        Return the number of paths in the corpus, as determined when the corpus was created.
        """
        return self.size

    def __getitem__(self, idx):
        """
        Load and return the document for the path at the given index.

        Args:
            idx: the index of the document (int)

        Returns:
            the document, with the index stored in the index feature
        """
        assert isinstance(idx, int)
        abspath = os.path.join(self.dirpath, self.paths[idx])
        try:
            doc = Document.load(abspath, fmt=self.fmt)
        except Exception as ex:
            # report which file failed before passing the error on
            print(f"Error loading document from {abspath} using format {self.fmt}")
            raise ex
        doc.features[self.idxfeatname()] = idx
        return doc

    def __setitem__(self, idx, doc):
        """
        Save the document to the file for the path at the given index.

        Args:
            idx: the index of the document
            doc: the Document, if None, no action is performed and the existing document is left unchanged
        """
        if doc is None:
            return
        assert isinstance(idx, int)
        assert isinstance(doc, Document)
        target = os.path.join(self.dirpath, self.paths[idx])
        doc.save(target, fmt=self.fmt)


class NumberedDirFilesCorpus(Corpus, MultiProcessingAble):
    """
    A corpus that represents files from a (nested) directory, where the filename is derived from
    the index number of the document. This corpus can represent missing elements as None, both
    on reading (when the corresponding expected document does not exist) and on writing (the
    corresponding document gets deleted).
    """

    def __init__(
        self,
        dirpath,
        digits=1,
        levels=1,
        ext="bdocjs",
        fmt=None,
        size=None,
        store_none=True,
    ):
        """
        Creates the NumberedDirFilesCorpus. This corpus is able to return None for non-existing documents
        and remove document files by setting to None depending on the parameters.

        Args:
            dirpath: the directory path
            digits: the number of digits to use for the file path
            levels: the number of levels to split the digits up which are then used as subdirectory names.
            ext: the file extension used for all files in the corpus
            fmt: the format to use, if None, determined from the extension
            size: the size of the corpus. This can be used to create a corpus from an empty directory
                to contain only None elements initially.  It can also be used to limit access to only the
                first size elements if the directory contains more documents. This value is what len()
                returns, so len() only works if a size has been given.
            store_none: if True, will store None in the corpus, i.e. remove the corresponding file from
                the directory. If False, will ignore the action and leave whatever is at the index unchanged.
        """
        if not ext.startswith("."):
            ext = "." + ext
        self.dirpath = dirpath
        self.ext = ext
        self.fmt = fmt
        self.size = size
        self.store_none = store_none
        self.file_path_maker = maker_file_path_fromidx(digits, levels)

    def _abspath(self, idx):
        """
        Return the path of the file for the given index, located below the corpus directory.
        """
        # idx must be passed by keyword: the first positional parameter of the path maker is doc
        relpath = self.file_path_maker(idx=idx) + self.ext
        # the path maker always uses forward slashes, normalize for the current system
        return os.path.join(self.dirpath, os.path.normpath(relpath))

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        """
        Return the document stored for the index or None if there is no file for that index.

        Args:
            idx: the index of the document (int)
        """
        assert isinstance(idx, int)
        abspath = self._abspath(idx)
        if not os.path.exists(abspath):
            return None
        doc = Document.load(abspath, fmt=self.fmt)
        doc.features[self.idxfeatname()] = idx
        return doc

    def __setitem__(self, idx, doc):
        """
        Store the document in the file for the index. If doc is None, the file for the index
        is removed if store_none is True, otherwise nothing is done.

        Args:
            idx: the index of the document (int)
            doc: the document or None
        """
        assert isinstance(idx, int)
        assert doc is None or isinstance(doc, Document)
        abspath = self._abspath(idx)
        if doc is None:
            if self.store_none and os.path.exists(abspath):
                os.remove(abspath)
            return
        # with levels > 1 the file lives in nested subdirectories which may not exist yet
        parent = os.path.dirname(abspath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        doc.save(abspath, fmt=self.fmt)

Functions

def maker_file_path_fromidx(digits=1, levels=1)

Creates a method that returns a file path for the given number of leading digits and levels.

Args

digits
minimum number of digits to use for the path, any number with less digits will have leading zeros added.
levels
how to split the original sequence of digits into a hierarchical path name. For example if digits=10 and levels=3, the generated function will convert the index number 23 into 0000/000/023

Returns

a function that takes the keyword arguments idx and doc and returns a relative path name (str)

Expand source code
def maker_file_path_fromidx(digits=1, levels=1):
    """
    Build a function that turns a document index into a relative, slash-separated file path.

    The index is left-padded with zeros to at least `digits` characters. The padded string is then
    split into `levels` path components: every component except the first gets digits // levels
    characters (taken from the right end), the first component gets whatever remains.

    Args:
        digits: minimum number of digits to use for the path, any number with fewer digits gets
           leading zeros added.
        levels: number of path components to split the digits into. For example, with digits=10
           and levels=3 the index 23 is converted to 0000/000/023

    Returns:
        a function that takes the keyword arguments idx and doc and returns a relative path name (str);
        that function raises an Exception if idx is not an integer >= 0

    Raises:
        Exception: if digits or levels is not an integer larger than 0 or if digits < levels
    """
    params_ok = (
        isinstance(digits, int)
        and isinstance(levels, int)
        and digits >= 1
        and levels >= 1
        and digits >= levels
    )
    if not params_ok:
        raise Exception(
            "digits and levels must be integers larger than 0 and digits must not be smaller than "
            f"levels, got {digits}/{levels}"
        )
    # number of characters in every path component except the leading one
    width = digits // levels

    def file_path_fromidx(doc=None, idx=None):
        # NOTE: doc is not used here, it is accepted so all path makers can be called the same way
        if not isinstance(idx, int) or idx < 0:
            raise Exception("Index must be an integer >= 0")
        padded = str(idx).zfill(digits)
        parts = []
        end = len(padded)
        # cut fixed-width components off the right end ...
        for _ in range(levels - 1):
            parts.append(padded[end - width:end])
            end -= width
        # ... and put everything that is left into the leading component
        parts.append(padded[:end])
        return "/".join(reversed(parts))

    return file_path_fromidx
def matching_paths(dirpath: str, exts: Union[Iterable, str, None] = None, recursive: bool = True, relative: bool = True)

Yields all relative file paths from dirpath which match the list of extensions and which do not start with a dot.

Args

dirpath
the directory to traverse
exts
a single extension or a list of allowed extensions (including the dot). If None, all files in the directory not starting with a dot are included
recursive
if True (default) include all matching paths from all subdirectories as well, otherwise only paths from the top directory.
relative
if True (default), the paths are relative to the directory path
Expand source code
def matching_paths(
        dirpath: str,
        exts: Optional[Union[Iterable, str]] = None,
        recursive: bool = True,
        relative: bool = True):
    """
    Yields all file paths from dirpath which match the list of extensions
    and whose file name does not start with a dot.

    Args:
        dirpath: the directory to traverse
        exts: a single extension or an iterable of allowed extensions (including the dot). If None
            or empty, all files in the directory not starting with a dot are included
        recursive: if True (default) include all matching paths from all subdirectories as well, otherwise
          only paths from the top directory.
        relative: if True (default), the paths are relative to the directory path, otherwise
          they are dirpath joined with the relative path

    Yields:
        one path (str) per matching file, in the order reported by os.walk / os.listdir
    """
    # Materialize the extensions once: a one-shot iterator would otherwise be exhausted after the
    # first file name has been checked and every later file would silently be skipped.
    if exts is None:
        suffixes = ()
    elif isinstance(exts, str):
        suffixes = (exts,)
    else:
        suffixes = tuple(exts)

    def wanted(fname):
        # hidden files are never included, no matter which extensions are allowed
        if fname.startswith("."):
            return False
        return not suffixes or fname.endswith(suffixes)

    if recursive:
        candidates = (
            os.path.join(root, fname)
            for root, _, filenames in os.walk(dirpath)
            for fname in filenames
            if wanted(fname)
        )
    else:
        # os.listdir also reports directories, so keep regular files only
        candidates = (
            os.path.join(dirpath, fname)
            for fname in os.listdir(dirpath)
            if wanted(fname) and os.path.isfile(os.path.join(dirpath, fname))
        )
    for full in candidates:
        yield os.path.relpath(full, dirpath) if relative else full
def minstem(path)
Expand source code
def minstem(path):
    """
    Return the base name of a path with all of its extensions removed.

    The last extension is dropped first; what remains is then cut at its first dot, unless that
    dot is the very first character (so a name such as ".hidden" is kept unchanged).

    Args:
        path: a file path (str or Path)

    Returns:
        the base name without any extension (str)
    """
    base = Path(path).stem
    head, dot, _ = base.partition(".")
    # Only shorten when a dot was found and there is something in front of it.
    return head if dot and head else base

Classes

class DirFilesCorpus (dirpath: str, ext: str = 'bdocjs', fmt: Optional[str] = None, recursive: bool = True, sort: Union[bool, Callable] = False, sort_reverse: bool = False, nparts: int = 1, partnr: int = 0)

A corpus representing all files in a directory that match the given extension.

Creates the DirCorpus.

Args

dirpath
the directory path
ext
the file extension that must be matched by all files for the corpus, "bdocjs" if empty or None
fmt
the format to use, if None, will be determined from the extension
recursive
if True (default) all matching files from all subdirectories are included
sort
if True, sort by file paths, if a function sort by that function (default: False)
sort_reverse
if sort is not False and this is True, sort in reverse order
nparts
only yield every nparts-th document (default 1: every document)
partnr
start with that index, before yielding every nparts-th document (default 0: start at beginning)
Expand source code
class DirFilesCorpus(Corpus, MultiProcessingAble):
    """
    A corpus over all files below a directory which have a given file extension. Each element of
    the corpus is the document stored in the corresponding file.
    """

    def __init__(self,
                 dirpath: str,
                 ext: str = "bdocjs",
                 fmt: Optional[str] = None,
                 recursive: bool = True,
                 sort: Union[bool, Callable] = False,
                 sort_reverse: bool = False,
                 nparts: int = 1,
                 partnr: int = 0
                 ):
        """
        Creates the DirFilesCorpus.

        Args:
            dirpath: the directory path, must be an existing directory
            ext: the file extension all files of the corpus must have, "bdocjs" if empty or None.
                A leading dot is added if missing.
            fmt: the format to use, if None, will be determined from the extension
            recursive: if True (default) all matching files from all subdirectories are included
            sort: if True, sort by file paths, if a function sort by that function (default: False).
                The paths are always sorted if nparts is larger than 1.
            sort_reverse: if the paths get sorted and this is True, sort in reverse order
            nparts: only include every nparts-th path (default 1: every path)
            partnr: start with that index, before including every nparts-th path (default 0: start at beginning)

        Raises:
            Exception: if dirpath does not exist or is not a directory
        """
        suffix = ext if ext else "bdocjs"
        if not suffix.startswith("."):
            suffix = "." + suffix
        self.dirpath = dirpath
        self.ext = suffix
        self.fmt = fmt
        if not os.path.exists(dirpath):
            raise Exception(f"Directory {dirpath} does not exist")
        if not os.path.isdir(dirpath):
            raise Exception(f"Not a directory: {dirpath}")
        found = list(matching_paths(dirpath, exts=[suffix], recursive=recursive))
        if sort or nparts > 1:
            # key=None gives the plain ordering by path
            found.sort(key=sort if callable(sort) else None, reverse=sort_reverse)
        if nparts > 1:
            found = [
                relpath
                for pos, relpath in enumerate(found)
                if (pos - partnr) % nparts == 0
            ]
        self.paths = found
        self.size = len(found)

    def __len__(self):
        """
        Return the number of paths in the corpus, as determined when the corpus was created.
        """
        return self.size

    def __getitem__(self, idx):
        """
        Load and return the document for the path at the given index.

        Args:
            idx: the index of the document (int)

        Returns:
            the document, with the index stored in the index feature
        """
        assert isinstance(idx, int)
        abspath = os.path.join(self.dirpath, self.paths[idx])
        try:
            doc = Document.load(abspath, fmt=self.fmt)
        except Exception as ex:
            # report which file failed before passing the error on
            print(f"Error loading document from {abspath} using format {self.fmt}")
            raise ex
        doc.features[self.idxfeatname()] = idx
        return doc

    def __setitem__(self, idx, doc):
        """
        Save the document to the file for the path at the given index.

        Args:
            idx: the index of the document
            doc: the Document, if None, no action is performed and the existing document is left unchanged
        """
        if doc is None:
            return
        assert isinstance(idx, int)
        assert isinstance(doc, Document)
        target = os.path.join(self.dirpath, self.paths[idx])
        doc.save(target, fmt=self.fmt)

Ancestors

Inherited members

class DirFilesDestination (dirpath, path_from: Union[str, Callable] = 'default', ext: str = 'bdocjs', fmt=None)

A destination where each document is stored in a file in a directory or directory tree in some known serialization format. The filename or path of the file can be derived from a document feature, the document name, the running number of file added, or any function that can derive a file path from the document and the running number.

Create a destination to store documents in files inside a directory or directory tree.

Args

dirpath
the directory to contain the files
path_from

one of options listed below. If a string is used as a path name, then the forward slash is always used as the directory path separator, on all systems!

  • "default" (default) a heuristic which uses "relpath" if it is available, otherwise uses the index with at least 5 digits.
  • "relpath": use the relative path used when creating the document, but replace the extension
  • "idx": just use the index/running number of the added document as the base name
  • "idx:5": use the index/running number with at least 5 digits in the name.
  • "idx:10:2": use the index and organize a total of 10 digits into a hierarchical pathname of 2 levels, so 10:2 would mean the first 5 digits are for the name of the subdirectory and the second 5 digits are for the file base name. 10:3 would have three levels: the first subdirectory level with 4 digits, the second subdirectory level with 3 digits and the remaining 3 digits for the filename. NOTE: "idx" by itself is equivalent to idx:1:1
  • "feature:fname": use the document feature with the feature name fname as a relative path as is but add the extension
  • "docname": use the document name as the relative path, but add extension.
  • "minstem": use the relative path with all extensions replaced by the new extension
  • somefunction: a function that should return the pathname (without extension) and should take two keyword arguments: doc (the document) and idx (the running index of the document).
ext
the file extension to add to all generated file names
fmt
the format to use for serializing the document, if None, will try to determine from the extension.
Expand source code
class DirFilesDestination(DocumentDestination):
    """
    A destination where each document is stored in a file in a directory or directory tree in some
    known serialization format. The filename or path of the file can be derived from a document feature,
    the document name, the running number of file added, or any function that can derive a file path
    from the document and the running number.
    """

    def __init__(self, dirpath, path_from: Union[str, Callable] = "default", ext: str = "bdocjs", fmt=None):
        """
        Create a destination to store documents in files inside a directory or directory tree.

        Args:
            dirpath: the directory to contain the files, must already exist
            path_from: one of options listed below. If a string is used as a path name, then the forward slash
                 is always used as the directory path separator, on all systems!

               * "default" (default) a heuristic which uses "relpath" if it is available, otherwise uses the
                    index with at least 5 digits.
               * "relpath": use the relative path used when creating the document, but
                   replace the extension
               * "idx": just use the index/running number of the added document as the base name
               * "idx:5": use the index/running number with at least 5 digits in the name.
               * "idx:10:2": use the index and organize a total of 10 digits into a hierarchical
                   pathname of 2 levels, so 10:2 would mean the first 5 digits are for the name of the subdirectory
                   and the second 5 digits are for the file base name. 10:3 would have four levels, the first
                   subdirectory level with 1 digit, the next two with 3 digits and the remaining 3 digits for the
                   filename.
                   NOTE: "idx" by itself is equivalent to idx:1:1
                * "feature:fname": use the document feature with the feature name fname as a relative path as is
                   but add the extension
                * "docname": use the document name as the relative path, but add extension.
                * "minstem": use the relative path with all extensions replaced by the new extension
                * somefunction: a function that should return the pathname (without extension) and should take two
                   keyword arguments: doc (the document) and idx (the running index of the document).

            ext: the file extension to add to all generated file names
            fmt: the format to use for serializing the document, if None, will try to determine from the extension.

        Raises:
            Exception: if dirpath is not an existing directory or path_from is not one of the allowed values.
        """
        super().__init__()
        if not os.path.isdir(dirpath):
            raise Exception(f"Not a directory: {dirpath}")
        self.dirpath = dirpath
        self.idx = 0

        def pathmaker_default(doc=None, idx=None):
            # Prefer the relative path recorded on the document, fall back to the running index.
            relpath = doc.features.get(self.relpathfeatname())
            if relpath:
                return os.path.splitext(relpath)[0]
            return f"{idx:05d}"

        # The callable case must be handled first: all other branches use string methods
        # on path_from and would fail with an AttributeError for a function.
        if callable(path_from):
            self.file_path_maker = path_from
        elif not isinstance(path_from, str):
            raise Exception(f"Not allowed for path_from: {path_from}")
        elif path_from.startswith("idx"):
            rest = path_from[3:]  # if we have digits or levels, there is a leading colon!
            if len(rest) == 0:
                digits = 1
                levels = 1
            else:
                parms = rest.split(":")
                parms.append("1")  # default for levels if only digits are given
                digits, levels = parms[1:3]
                digits = int(digits)
                levels = int(levels)
            self.file_path_maker = maker_file_path_fromidx(digits, levels)
        elif path_from.startswith("feature"):
            # split only at the first colon so that feature names may contain colons themselves
            _, fname = path_from.split(":", 1)
            self.file_path_maker = lambda doc=None, idx=None: doc.features[fname]
        elif path_from == "default":
            self.file_path_maker = pathmaker_default
        elif path_from == "relpath":
            self.file_path_maker = \
                lambda doc=None, idx=None: os.path.splitext(doc.features[self.relpathfeatname()])[0]
        elif path_from == "docname":
            self.file_path_maker = lambda doc=None, idx=None: doc.name
        elif path_from == "minstem":
            self.file_path_maker = lambda doc=None, idx=None: minstem(doc.features[self.relpathfeatname()])
        else:
            raise Exception(f"Not allowed for path_from: {path_from}")
        if not ext.startswith("."):
            ext = "." + ext
        self.ext = ext
        self.fmt = fmt

    def append(self, doc):
        """
        Add a document to the destination.

        The target file path is computed by the configured path maker from the document and the
        running index, the extension is appended and any missing parent directories are created
        before the document is saved.

        Args:
            doc: the document or None, if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        path = self.file_path_maker(doc=doc, idx=self.idx)
        # convert forward slashes to backslashes on windows
        path = os.path.normpath(path)
        path = os.path.join(self.dirpath, path) + self.ext
        # Create the directory part of the path if necessary. exist_ok avoids failing when the
        # directory gets created between a check and the creation.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        Document.save(doc, path, fmt=self.fmt)
        self.idx += 1
        self._n += 1

    def close(self):
        """
        Close the destination. Nothing to do: each document is written to its file in append().
        """
        pass

Ancestors

Methods

def append(self, doc)

Add a document to the destination.

Args

doc
the document or None, if None, no action is performed.
Expand source code
def append(self, doc):
    """
    Store a document as a file below the destination directory.

    The relative path comes from the configured path maker, the extension is appended, and the
    directory part of the resulting path is created first if it does not exist yet.

    Args:
        doc: the document to store; if None, nothing is done.
    """
    if doc is None:
        return
    assert isinstance(doc, Document)
    # normpath converts forward slashes to backslashes on windows
    relative = os.path.normpath(self.file_path_maker(doc=doc, idx=self.idx))
    target = os.path.join(self.dirpath, relative) + self.ext
    # Everything left of the last separator is the directory part that may have to be created.
    parent, found_sep, _ = target.rpartition(os.path.sep)
    if found_sep:
        if not os.path.exists(os.path.normpath(parent)):
            os.makedirs(parent)
    Document.save(doc, target, fmt=self.fmt)
    self.idx += 1
    self._n += 1

Inherited members

class DirFilesSource (dirpath: str, paths: Optional[Iterable[str]] = None, paths_from: Union[str, pathlib.Path, urllib.parse.ParseResult] = None, exts: Optional[Iterable[str]] = None, fmt: Optional[str] = None, recursive: bool = True, sort: Union[bool, Callable] = False, sort_reverse: bool = False, docname_from: Optional[str] = None, nparts: int = 1, partnr: int = 0)

A document source which iterates over documents represented as files in a directory.

Create a DirFilesSource.

Args

dirpath
the directory that contains the file to load as documents.
paths
if not None, must be an iterable of relative file paths to load from the directory
paths_from
if not None, must be a file or URL to load a list of relative file paths from
exts
an iterable of allowed file extensions or file extension regexps
fmt
the format to use for loading files. This is only useful if all files have the same format but the file extensions do not indicate the format.
recursive
recursively include paths from all subdirectories as well
sort
a boolean to indicate that paths should get processed in sort order, or a callable that will be used to extract the sort key. The paths are always sorted if nparts is > 1.
sort_reverse
if paths should get sorted in reverse order
docname_from
If not None set the document name from "basename", "stem" (basename without last extension) "minstem" (basename with all extensions removed) "relpath", "index" (sequence number of document within this part).
nparts
only yield every nparts-th document (default 1: every document)
partnr
start with that index, before yielding every nparts-th document (default 0: start at beginning)
Expand source code
class DirFilesSource(DocumentSource, EveryNthBase, MultiProcessingAble):
    """
    A document source which iterates over documents represented as files in a directory.
    """
    def __init__(
        self,
        dirpath: str,
        paths: Optional[Iterable[str]] = None,
        paths_from: Union[str, Path, ParseResult] = None,
        exts: Optional[Iterable[str]] = None,
        fmt: Optional[str] = None,
        recursive: bool = True,
        sort: Union[bool, Callable] = False,
        sort_reverse: bool = False,
        docname_from: Optional[str] = None,
        nparts: int = 1,
        partnr: int = 0,
    ):
        """
        Create a DirFilesSource.

        Args:
            dirpath: the directory that contains the file to load as documents.
            paths:  if not None, must be an iterable of relative file paths to load from the directory.
                The iterable is copied, it is never modified by this class.
            paths_from: if not None, must be a file or URL to load a list of relative file paths from,
                one path per line. Empty lines are ignored.
            exts: an iterable of allowed file extensions or file extension regexps
            fmt: the format to use for loading files. This is only useful if all files have the same format
               but the file extensions do not indicate the format.
            recursive: recursively include paths from all subdirectories as well
            sort: a boolean to indicate that paths should get processed in sort order, or a callable that
                will be used to extract the sort key.
                The paths are always sorted if nparts is > 1.
            sort_reverse: if paths should get sorted in reverse order
            docname_from: If not None set the document name from "basename", "stem" (basename without last extension)
                "minstem" (basename with all extensions removed) "relpath",
                "index" (sequence number of document within this part).
            nparts: only yield every nparts-th document (default 1: every document)
            partnr: start with that index, before yielding every nparts-th document (default 0: start at beginning)

        Raises:
            Exception: if both paths and paths_from are specified.
        """
        self.dirpath = dirpath
        if paths is not None and paths_from is not None:
            raise Exception("Parameters paths and paths_from cannot be both specified")
        super().__init__()
        EveryNthBase.__init__(self, nparts=nparts, partnr=partnr)
        if docname_from is not None:
            assert docname_from in ["basename", "relpath", "index", "stem", "minstem"]
        self.docname_from = docname_from
        if paths is not None:
            # Copy into our own list: any iterable is accepted, the paths can be iterated
            # repeatedly, and sorting below does not reorder the caller's sequence in place.
            self.paths = list(paths)
        elif paths_from is not None:
            self.paths = []
            for pth in yield_lines_from(paths_from):
                pth = pth.rstrip("\n\r")
                # an empty line would resolve to the directory itself, not to a document file
                if pth:
                    self.paths.append(pth)
        else:
            self.paths = list(matching_paths(dirpath, exts=exts, recursive=recursive))
        # When the paths are split into parts, a sort order is always enforced before
        # every nparts-th path is selected.
        if sort or nparts > 1:
            if callable(sort):
                self.paths.sort(key=sort, reverse=sort_reverse)
            else:
                self.paths.sort(reverse=sort_reverse)
        if nparts > 1:
            # keep only the paths at positions partnr, partnr+nparts, partnr+2*nparts, ...
            self.paths = [
                p
                for idx, p in enumerate(self.paths)
                if ((idx - partnr) % nparts) == 0
            ]
        self.fmt = fmt

    def __iter__(self):
        """
        Yield the documents loaded from the paths of this source, one after the other.

        Each document gets the relative path feature set and, if docname_from was specified,
        its name set accordingly.
        """
        self._n = 0
        for p in self.paths:
            fullpath = os.path.join(self.dirpath, p)
            doc = Document.load(fullpath, fmt=self.fmt)
            self.setrelpathfeature(doc, p)
            if self.docname_from:
                if self.docname_from == "basename":
                    docname = os.path.basename(fullpath)
                elif self.docname_from == "stem":
                    docname = Path(fullpath).stem
                elif self.docname_from == "index":
                    # number of documents yielded so far in this iteration
                    docname = str(self._n)
                elif self.docname_from == "relpath":
                    docname = p
                elif self.docname_from == "minstem":
                    docname = minstem(fullpath)
                doc.name = docname
            self._n += 1
            yield doc

Ancestors

Inherited members

class NumberedDirFilesCorpus (dirpath, digits=1, levels=1, ext='bdocjs', fmt=None, size=None, store_none=True)

A corpus that represents files from a (nested) directory, where the filename is derived from the index number of the document. This corpus can represent missing elements as None, both on reading (when the corresponding expected document does not exist) and on writing (the corresponding document gets deleted).

Creates the NumberedDirFilesCorpus. This corpus is able to return None for non-existing documents and remove document files by setting to None depending on the parameters.

Args

dirpath
the directory path
digits
the number of digits to use for the file path
levels
the number of levels to split the digits up which are then used as subdirectory names.
ext
the file extension used for all files in the corpus
fmt
the format to use, if None, determined from the extension
size
the size of the corpus. This can be used to create a corpus from an empty directory to contain only None elements initially. It can also be used to limit access to only the first size elements if the directory contains more documents.
store_none
if True, will store None in the corpus, i.e. remove the corresponding file from the directory. If False, will ignore the action and leave whatever is at the index unchanged.
Expand source code
class NumberedDirFilesCorpus(Corpus, MultiProcessingAble):
    """
    A corpus that represents files from a (nested) directory, where the filename is derived from
    the index number of the document. This corpus can represent missing elements as None, both
    on reading (when the corresponding expected document does not exist) and on writing (the
    corresponding document gets deleted).
    """

    def __init__(
        self,
        dirpath,
        digits=1,
        levels=1,
        ext="bdocjs",
        fmt=None,
        size=None,
        store_none=True,
    ):
        """
        Creates the NumberedDirFilesCorpus. This corpus is able to return None for non-existing documents
        and remove document files by setting to None depending on the parameters.

        Args:
            dirpath: the directory path
            digits: the number of digits to use for the file path
            levels: the number of levels to split the digits up which are then used as subdirectory names.
            ext: the file extension used for all files in the corpus
            fmt: the format to use, if None, determined from the extension
            size: the size of the corpus. This can be used to create a corpus from an empty directory
                to contain only None elements initially.  It can also be used to limit access to only the
                first size elements if the directory contains more documents.
            store_none: if True, will store None in the corpus, i.e. remove the corresponding file from
                the directory. If False, will ignore the action and leave whatever is at the index unchanged.
        """
        if not ext.startswith("."):
            ext = "." + ext
        self.dirpath = dirpath
        self.ext = ext
        self.fmt = fmt
        self.size = size
        self.store_none = store_none
        self.file_path_maker = maker_file_path_fromidx(digits, levels)

    def _abspath_for(self, idx):
        """
        Return the path of the file for index idx, with extension and located inside dirpath.
        """
        # Use the same keyword calling convention as DirFilesDestination uses for path makers,
        # so the index cannot end up bound to the doc parameter.
        relpath = self.file_path_maker(doc=None, idx=idx) + self.ext
        return os.path.join(self.dirpath, relpath)

    def __len__(self):
        """
        Return the size given when the corpus was created.
        """
        # NOTE(review): size defaults to None, in which case len() raises a TypeError - confirm
        # whether the size should be derived from the directory in that case.
        return self.size

    def __getitem__(self, idx):
        """
        Load and return the document stored for the given index.

        Args:
            idx: the integer index of the document

        Returns:
            the document, with the index stored as a document feature, or None if there is no
            file for that index.
        """
        assert isinstance(idx, int)
        abspath = self._abspath_for(idx)
        # The check must use the path inside dirpath, not the bare relative path which would be
        # resolved against the current working directory.
        if not os.path.exists(abspath):
            return None
        doc = Document.load(abspath, fmt=self.fmt)
        doc.features[self.idxfeatname()] = idx
        return doc

    def __setitem__(self, idx, doc):
        """
        Store the document at the given index, or remove the file for the index if doc is None.

        Args:
            idx: the integer index of the document
            doc: the document to save, or None. For None, the file for the index is removed if
                store_none is True, otherwise nothing is done.
        """
        assert isinstance(idx, int)
        assert doc is None or isinstance(doc, Document)
        abspath = self._abspath_for(idx)
        if doc is None:
            if self.store_none and os.path.exists(abspath):
                os.remove(abspath)
            return
        # With several levels the file lives in nested subdirectories which may not exist yet.
        parent = os.path.dirname(abspath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        Document.save(doc, abspath, fmt=self.fmt)

Ancestors

Inherited members