Module gatenlp.corpora.dirs
Module that defines Corpus and DocumentSource/DocumentDestination classes which access documents as files in a directory.
Expand source code
"""
Module that defines Corpus and DocumentSource/DocumentDestination classes which access documents
as files in a directory.
"""
import os
from typing import Union, Callable, Iterable, Optional
from pathlib import Path
from urllib.parse import ParseResult
from gatenlp.urlfileutils import yield_lines_from
from gatenlp.document import Document
from gatenlp.corpora.base import DocumentSource, DocumentDestination, Corpus
from gatenlp.corpora.base import MultiProcessingAble
from gatenlp.corpora.base import EveryNthBase
def minstem(path):
    """
    Return the base name of a path with its extensions removed.

    Args:
        path: a file path (str or path-like); directory components are ignored.

    Returns:
        The final path component cut off at its first dot. If the name starts with a dot
        (hidden-file style), only the last extension is removed and the rest is kept as is.
    """
    base = Path(path).stem
    # Text before the first dot; this is empty only when the name begins with a dot,
    # in which case the stem is returned untouched.
    head, _, _ = base.partition(".")
    return head if head else base
def matching_paths(
        dirpath: str,
        exts: Optional[Union[Iterable, str]] = None,
        recursive: bool = True,
        relative: bool = True):
    """
    Yield the paths of all files below dirpath whose name ends with one of the given extensions
    and does not start with a dot.

    Args:
        dirpath: the directory to traverse
        exts: a single extension or an iterable of allowed extensions (including the dot).
            Matching is a plain suffix test on the file name. If None or empty, all files
            whose name does not start with a dot are included.
        recursive: if True (default) include all matching paths from all subdirectories as well, otherwise
            only paths from the top directory.
        relative: if True (default), the paths are relative to the directory path, otherwise
            they are dirpath joined with the relative path.

    Yields:
        One path (str) per matching file, in the order reported by os.walk / os.listdir.
    """
    if exts is None:
        suffixes = ()
    elif isinstance(exts, str):
        suffixes = (exts,)
    else:
        # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be
        # exhausted after the first file name has been checked.
        suffixes = tuple(exts)

    def wanted(fname):
        # Hidden files are never reported; without suffixes every other file matches.
        if fname.startswith("."):
            return False
        return not suffixes or fname.endswith(suffixes)

    if recursive:
        for root, _, filenames in os.walk(dirpath):
            for fname in filenames:
                if wanted(fname):
                    full = os.path.join(root, fname)
                    yield os.path.relpath(full, dirpath) if relative else full
    else:
        for fname in os.listdir(dirpath):
            full = os.path.join(dirpath, fname)
            # listdir also returns directories, only regular files are of interest
            if os.path.isfile(full) and wanted(fname):
                yield os.path.relpath(full, dirpath) if relative else full
def maker_file_path_fromidx(digits=1, levels=1):
    """
    Creates a function that returns a file path for the given number of leading digits and levels.

    The index is left-padded with zeros to at least `digits` characters. The padded string is
    then cut from the right into `levels - 1` groups of `digits // levels` characters each;
    whatever remains on the left forms the first path component.

    Args:
        digits: minimum number of digits to use for the path, any number with fewer digits will have
           leading zeros added.
        levels: how to split the sequence of digits into a hierarchical path name. For example if digits=10
           and levels=3, the generated function will convert the index number 23 into 0000/000/023

    Returns:
        a function that takes the keyword arguments idx and doc and returns a relative path name (str)
        which always uses the forward slash as separator.

    Raises:
        Exception: if digits or levels is not an int, is smaller than 1, or digits < levels.
    """
    if (
        not isinstance(digits, int)
        or not isinstance(levels, int)
        or digits < 1
        or levels < 1
        or digits < levels
    ):
        raise Exception(
            "digits and levels must be integers larger than 0 and digits must not be smaller than "
            f"levels, got {digits}/{levels}"
        )
    # number of characters in every path component except the first one
    per = digits // levels

    def file_path_fromidx(doc=None, idx=None):
        # NOTE: doc is unused here but used with other methods to create the file path!
        if idx is None or not isinstance(idx, int) or idx < 0:
            raise Exception("Index must be an integer >= 0")
        padded = str(idx).zfill(digits)
        # Collect the components right-to-left, then reverse them for the final path.
        parts = []
        end = len(padded)
        for _lvl in range(levels - 1):
            parts.append(padded[end - per:end])
            end -= per
        # the leftmost component takes all remaining characters
        parts.append(padded[:end])
        return "/".join(reversed(parts))

    return file_path_fromidx
# TODO: set the special features for the relative path, index number, document id?
class DirFilesSource(DocumentSource, EveryNthBase, MultiProcessingAble):
    """
    A document source which iterates over documents represented as files in a directory.
    """
    def __init__(
        self,
        dirpath: str,
        paths: Optional[Iterable[str]] = None,
        paths_from: Union[str, Path, ParseResult] = None,
        exts: Optional[Iterable[str]] = None,
        fmt: Optional[str] = None,
        recursive: bool = True,
        sort: Union[bool, Callable] = False,
        sort_reverse: bool = False,
        docname_from: Optional[str] = None,
        nparts: int = 1,
        partnr: int = 0,
    ):
        """
        Create a DirFilesSource.

        Args:
            dirpath: the directory that contains the files to load as documents.
            paths:  if not None, must be an iterable of relative file paths to load from the directory.
                The iterable is copied, so the caller's object is never reordered.
            paths_from: if not None, must be a file or URL to load a list of relative file paths from,
                one path per line.
            exts: an iterable of allowed file extensions, matched as plain suffixes of the file name.
                Only used when neither paths nor paths_from is given.
            fmt: the format to use for loading files. This is only useful if all files have the same format
               but the file extension does not indicate the format.
            recursive: recursively include paths from all subdirectories as well. Only used when
                neither paths nor paths_from is given.
            sort: a boolean to indicate that paths should get processed in sort order, or a callable that
                will be used to extract the sort key.
                The paths always get sorted if nparts is > 1.
            sort_reverse: if paths should get sorted in reverse order
            docname_from: If not None set the document name from "basename", "stem" (basename without last extension)
                "minstem" (basename with all extensions removed) "relpath",
                "index" (sequence number of document within this part).
            nparts: only yield every nparts-th document (default 1: every document)
            partnr: start with that index, before yielding every nparts-th document (default 0: start at beginning)

        Raises:
            Exception: if both paths and paths_from are specified.
        """
        self.dirpath = dirpath
        if paths is not None and paths_from is not None:
            raise Exception("Parameters paths and paths_from cannot be both specified")
        super().__init__()
        EveryNthBase.__init__(self, nparts=nparts, partnr=partnr)
        if docname_from is not None:
            assert docname_from in ["basename", "relpath", "index", "stem", "minstem"]
        self.docname_from = docname_from
        if paths is not None:
            # Copy into a fresh list: any iterable (tuple, generator, ...) is accepted and the
            # in-place sort below must not reorder the list owned by the caller.
            self.paths = list(paths)
        elif paths_from is not None:
            self.paths = [
                pth.rstrip("\n\r") for pth in yield_lines_from(paths_from)
            ]
        else:
            self.paths = list(matching_paths(dirpath, exts=exts, recursive=recursive))
        # Splitting into parts only makes sense on a well-defined order, so sort in that case too.
        if sort or nparts > 1:
            if callable(sort):
                self.paths.sort(key=sort, reverse=sort_reverse)
            else:
                self.paths.sort(reverse=sort_reverse)
        if nparts > 1:
            # keep only the paths belonging to this part
            self.paths = [
                p
                for idx, p in enumerate(self.paths)
                if ((idx - partnr) % nparts) == 0
            ]
        self.fmt = fmt
    def __iter__(self):
        """
        Load and yield the documents for all paths of this source, one after the other.

        Each document gets the relative-path feature set and, if docname_from was given,
        its name set accordingly.
        """
        self._n = 0
        for p in self.paths:
            fullpath = os.path.join(self.dirpath, p)
            doc = Document.load(fullpath, fmt=self.fmt)
            self.setrelpathfeature(doc, p)
            if self.docname_from:
                if self.docname_from == "basename":
                    docname = os.path.basename(fullpath)
                elif self.docname_from == "stem":
                    docname = Path(fullpath).stem
                elif self.docname_from == "index":
                    docname = str(self._n)
                elif self.docname_from == "relpath":
                    docname = p
                elif self.docname_from == "minstem":
                    docname = minstem(fullpath)
                doc.name = docname
            self._n += 1
            yield doc
class DirFilesDestination(DocumentDestination):
    """
    A destination where each document is stored in a file in a directory or directory tree in some
    known serialization format. The filename or path of the file can be derived from a document feature,
    the document name, the running number of file added, or any function that can derive a file path
    from the document and the running number.
    """
    def __init__(self, dirpath, path_from: Union[str, Callable] = "default", ext: str = "bdocjs", fmt=None):
        """
        Create a destination to store documents in files inside a directory or directory tree.

        Args:
            dirpath: the directory to contain the files, must exist
            path_from: one of options listed below. If a string is used as a path name, then the forward slash
                 is always used as the directory path separator, on all systems!
               * "default" (default) a heuristic which uses "relpath" if it is available, otherwise uses the
                    index with at least 5 digits.
               * "relpath": use the relative path used when creating the document, but
                   replace the extension
               * "idx": just use the index/running number of the added document as the base name
               * "idx:5": use the index/running number with at least 5 digits in the name.
               * "idx:10:2": use the index and organize a total of 10 digits into a hierarchical
                   pathname of 2 levels, so 10:2 would mean the first 5 digits are for the name of the subdirectory
                   and the second 5 digits are for the file base name. 10:3 gives three levels: the first
                   subdirectory level with 4 digits, the second subdirectory level with 3 digits and the
                   remaining 3 digits for the filename.
                   NOTE: "idx" by itself is equivalent to idx:1:1
                * "feature:fname": use the document feature with the feature name fname as a relative path as is
                   but add the extension
                * "docname": use the document name as the relative path, but add extension.
                * "minstem": use the base name of the relative path with all extensions removed
                   (directory components of the relative path are dropped), then add the new extension
                * somefunction: a function that should return the pathname (without extension) and should take two
                   keyword arguments: doc (the document) and idx (the running index of the document).
            ext: the file extension to add to all generated file names
            fmt: the format to use for serializing the document, if None, will try to determine from the extension.

        Raises:
            Exception: if dirpath is not a directory or path_from is not one of the allowed values.
        """
        super().__init__()
        if not os.path.isdir(dirpath):
            raise Exception(f"Not a directory: {dirpath}")
        self.dirpath = dirpath
        self.idx = 0
        def pathmaker_default(doc=None, idx=None):
            relpath = doc.features.get(self.relpathfeatname())
            if relpath:
                return os.path.splitext(relpath)[0]
            return f"{idx:05d}"
        # A callable has no string methods, so it must be recognised before any of the
        # string-based options are tested.
        if callable(path_from):
            self.file_path_maker = path_from
        elif not isinstance(path_from, str):
            raise Exception(f"Not allowed for path_from: {path_from}")
        elif path_from.startswith("idx"):
            rest = path_from[3:]  # if we have digits or levels, there is a leading colon!
            if len(rest) == 0:
                digits = 1
                levels = 1
            else:
                parms = rest.split(":")
                # default for levels if only the digits have been specified
                parms.append("1")
                digits, levels = parms[1:3]
                digits = int(digits)
                levels = int(levels)
            self.file_path_maker = maker_file_path_fromidx(digits, levels)
        elif path_from.startswith("feature"):
            # split only once so that the feature name itself may contain colons
            _, fname = path_from.split(":", 1)
            self.file_path_maker = lambda doc=None, idx=None: doc.features[fname]
        elif path_from == "default":
            self.file_path_maker = pathmaker_default
        elif path_from == "relpath":
            self.file_path_maker = \
                lambda doc=None, idx=None: os.path.splitext(doc.features[self.relpathfeatname()])[0]
        elif path_from == "docname":
            self.file_path_maker = lambda doc=None, idx=None: doc.name
        elif path_from == "minstem":
            self.file_path_maker = lambda doc=None, idx=None: minstem(doc.features[self.relpathfeatname()])
        else:
            raise Exception(f"Not allowed for path_from: {path_from}")
        if not ext.startswith("."):
            ext = "." + ext
        self.ext = ext
        self.fmt = fmt
    def append(self, doc):
        """
        Add a document to the destination.

        The file path is dirpath joined with the path returned by the path maker plus the
        extension; missing parent directories are created.

        Args:
            doc: the document or None, if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        path = self.file_path_maker(doc=doc, idx=self.idx)
        # convert forward slashes to backslashes on windows
        path = os.path.normpath(path)
        path = os.path.join(self.dirpath, path) + self.ext
        # The generated path may point into subdirectories which do not exist yet.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        Document.save(doc, path, fmt=self.fmt)
        self.idx += 1
        self._n += 1
    def close(self):
        """
        Nothing to release: every document is written completely by append.
        """
        pass
class DirFilesCorpus(Corpus, MultiProcessingAble):
    """
    A corpus made up of all files in a directory which have the given extension.
    """
    def __init__(self,
                 dirpath: str,
                 ext: str = "bdocjs",
                 fmt: Optional[str] = None,
                 recursive: bool = True,
                 sort: Union[bool, Callable] = False,
                 sort_reverse: bool = False,
                 nparts: int = 1,
                 partnr: int = 0
                 ):
        """
        Creates the DirFilesCorpus.

        Args:
            dirpath: path of the directory holding the files
            ext: extension every corpus file must have; "bdocjs" is used if empty or None.
                A missing leading dot is added.
            fmt: serialization format, if None it is determined from the extension
            recursive: if True (default) matching files from all subdirectories are included too
            sort: True to sort by file path, or a function used as the sort key (default: False).
                The paths are always sorted when nparts > 1.
            sort_reverse: sort in reverse order whenever sorting takes place
            nparts: only keep every nparts-th path (default 1: keep all)
            partnr: index of the first path to keep when nparts > 1 (default 0)

        Raises:
            Exception: if dirpath does not exist or is not a directory
        """
        ext = ext or "bdocjs"
        ext = ext if ext.startswith(".") else "." + ext
        self.dirpath = dirpath
        self.ext = ext
        self.fmt = fmt
        if not os.path.exists(dirpath):
            raise Exception(f"Directory {dirpath} does not exist")
        if not os.path.isdir(dirpath):
            raise Exception(f"Not a directory: {dirpath}")
        found = list(matching_paths(dirpath, exts=[ext], recursive=recursive))
        must_sort = sort or nparts > 1
        if must_sort:
            # a callable is the sort key, anything else truthy means natural order
            keyfunc = sort if callable(sort) else None
            found.sort(key=keyfunc, reverse=sort_reverse)
        if nparts > 1:
            # restrict to the paths of the requested part
            found = [
                relpath
                for pos, relpath in enumerate(found)
                if ((pos - partnr) % nparts) == 0
            ]
        self.paths = found
        self.size = len(found)
    def __len__(self):
        """
        Return the number of paths in this corpus, as determined at creation time.
        """
        return self.size
    def __getitem__(self, idx):
        """
        Load and return the document stored at the path with the given index.

        Args:
            idx: int index into the list of paths

        Returns:
            the loaded document, with the index feature set to idx
        """
        assert isinstance(idx, int)
        abspath = os.path.join(self.dirpath, self.paths[idx])
        try:
            loaded = Document.load(abspath, fmt=self.fmt)
        except Exception as ex:
            # report which file failed before passing the error on
            print(f"Error loading document from {abspath} using format {self.fmt}")
            raise ex
        loaded.features[self.idxfeatname()] = idx
        return loaded
    def __setitem__(self, idx, doc):
        """
        Store the document in the file belonging to the given index.

        Args:
            idx: int index into the list of paths
            doc: the Document to save; None means do nothing and keep the existing file
        """
        if doc is None:
            return
        assert isinstance(idx, int)
        assert isinstance(doc, Document)
        target = os.path.join(self.dirpath, self.paths[idx])
        doc.save(target, fmt=self.fmt)
class NumberedDirFilesCorpus(Corpus, MultiProcessingAble):
    """
    A corpus that represents files from a (nested) directory, where the filename is derived from
    the index number of the document. This corpus can represent missing elements as None, both
    on reading (when the corresponding expected document does not exist) and on writing (the
    corresponding document gets deleted).
    """
    def __init__(
        self,
        dirpath,
        digits=1,
        levels=1,
        ext="bdocjs",
        fmt=None,
        size=None,
        store_none=True,
    ):
        """
        Creates the NumberedDirFilesCorpus. This corpus is able to return None for non-existing documents
        and remove document files by setting to None depending on the parameters.

        Args:
            dirpath: the directory path
            digits: the number of digits to use for the file path
            levels: the number of levels to split the digits up which are then used as subdirectory names.
            ext: the file extension used for all files in the corpus
            fmt: the format to use, if None, determined from the extension
            size: the size of the corpus. This can be used to create a corpus from an empty directory
                to contain only None elements initially.  It can also be used to limit access to only the
                first size elements if the directory contains more documents.
                NOTE: the value is returned unchanged by __len__, so len() fails if it is left at None.
            store_none: if True, will store None in the corpus, i.e. remove the corresponding file from
                the directory. If False, will ignore the action and leave whatever is at the index unchanged.
        """
        if not ext.startswith("."):
            ext = "." + ext
        self.dirpath = dirpath
        self.ext = ext
        self.fmt = fmt
        self.size = size
        self.store_none = store_none
        self.file_path_maker = maker_file_path_fromidx(digits, levels)
    def _abspath(self, idx):
        """
        Return the path of the file for index idx, located below the corpus directory.
        """
        # idx must be passed by keyword: the first positional parameter of the path maker is doc.
        relpath = self.file_path_maker(idx=idx) + self.ext
        # the path maker always uses forward slashes, convert to the OS convention
        return os.path.join(self.dirpath, os.path.normpath(relpath))
    def __len__(self):
        """
        Return the size given at creation time.
        """
        return self.size
    def __getitem__(self, idx):
        """
        Load the document for the given index.

        Args:
            idx: int index of the document

        Returns:
            the document with the index feature set to idx, or None if there is no file for that index
        """
        assert isinstance(idx, int)
        abspath = self._abspath(idx)
        if not os.path.exists(abspath):
            return None
        doc = Document.load(abspath, fmt=self.fmt)
        doc.features[self.idxfeatname()] = idx
        return doc
    def __setitem__(self, idx, doc):
        """
        Store the document for the given index or remove the file of that index.

        Args:
            idx: int index of the document
            doc: the Document to save. If None and store_none is True, an existing file for the
                index is removed; if None and store_none is False nothing happens.
        """
        assert isinstance(idx, int)
        assert doc is None or isinstance(doc, Document)
        abspath = self._abspath(idx)
        if doc is None:
            if self.store_none and os.path.exists(abspath):
                os.remove(abspath)
            return
        # with levels > 1 the file lives in subdirectories which may not exist yet
        parent = os.path.dirname(abspath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        doc.save(abspath, fmt=self.fmt)
- def maker_file_path_fromidx(digits=1, levels=1)
- 
Creates a method that returns a file path for the given number of leading digits and levels. Args- digits
- minimum number of digits to use for the path, any number with less digits will have leading zeros added.
- levels
- how to split the original sequence of digits into a hierarchical path name. For example if digits=10 and levels=3, the generated function will convert the index number 23 into 0000/000/023
 Returnsa function that takes the keyword arguments idx and doc and returns a relative path name (str) Expand source codedef maker_file_path_fromidx(digits=1, levels=1): """ Creates a method that returns a file path for the given number of leading digits and levels. Args: digits: minimum number of digits to use for the path, any number with less digits will have leading zeros added. levels: how to split the original sequence of digits into a hierarchical path name. For example if digits=10 and levels=3, the generated function will convert the index number 23 into 0/000/000/023 Returns: a function that takes the keyword arguments idx and doc and returns a relative path name (str) """ if ( not isinstance(digits, int) or not isinstance(levels, int) or digits < 1 or levels < 1 or digits < levels ): raise Exception( "digits and levels must be integers larger than 0 and digits must not be smaller than " f"levels, got {digits}/{levels}" ) def file_path_fromidx(doc=None, idx=None): # NOTE: doc is unused here but used with other methods to create the file path! if idx is None or not isinstance(idx, int) or idx < 0: raise Exception("Index must be an integer >= 0") per = int(digits / levels) asstr = str(idx) digs = max(0, digits - len(asstr)) tmp = "0" * digs tmp += str(idx) path = "" fromdigit = len(tmp) - per todigit = len(tmp) for _lvl in range(levels - 1): path = tmp[fromdigit:todigit] + path # print("per=", per, "from=", fromdigit, "to=", todigit, "sec=", tmp[fromdigit:todigit]) path = "/" + path fromdigit = fromdigit - per todigit = todigit - per path = tmp[:todigit] + path return path return file_path_fromidx
- def matching_paths(dirpath: str, exts: Union[Iterable[+T_co], str, None] = None, recursive: bool = True, relative: bool = True)
- 
Yields all relative file paths from dirpath which match the list of extensions and which do not start with a dot. Args- dirpath
- the directory to traverse
- exts
- a single extension or a list of allowed extensions (including the dot). If None, all files in the directory not starting with a dot are included
- recursive
- if True (default) include all matching paths from all subdirectories as well, otherwise only paths from the top directory.
- relative
- if True (default), the paths are relative to the directory path
 Expand source codedef matching_paths( dirpath: str, exts: Optional[Union[Iterable, str]] = None, recursive: bool = True, relative: bool = True): """ Yields all relative file paths from dirpath which match the list of extensions and which do not start with a dot. Args: dirpath: the directory to traverse exts: a single extension of a list of allowed extensions (inluding the dot). If None, all files in the directory not starting with a dot are included recursive: if True (default) include all matching paths from all subdirectories as well, otherwise only paths from the top directory. relative: if True (default), the paths are relative to the directory path """ if isinstance(exts, str): exts = [exts] if recursive: for root, _, filenames in os.walk(dirpath): for fname in filenames: if exts: for ext in exts: if fname.endswith(ext) and not fname.startswith("."): if relative: yield os.path.relpath( os.path.join(root, fname), dirpath ) else: yield os.path.join(root, fname) break else: if not fname.startswith("."): if relative: yield os.path.relpath(os.path.join(root, fname), dirpath) else: yield os.path.join(root, fname) else: for fname in os.listdir(dirpath): full = os.path.join(dirpath, fname) if not os.path.isfile(full) or fname.startswith("."): pass elif exts: for ext in exts: if fname.endswith(ext): if relative: yield os.path.relpath(full, dirpath) else: yield full break else: if relative: yield os.path.relpath(full, dirpath) else: yield full
- def minstem(path)
- 
Expand source codedef minstem(path): stem = Path(path).stem dotidx = stem.find(".") if dotidx > 0: stem = stem[:dotidx] return stem
Classes
- class DirFilesCorpus (dirpath: str, ext: str = 'bdocjs', fmt: Optional[str] = None, recursive: bool = True, sort: Union[bool, Callable] = False, sort_reverse: bool = False, nparts: int = 1, partnr: int = 0)
- 
A corpus representing all files in a directory that match the given extension. Creates the DirCorpus. Args- dirpath
- the directory path
- ext
- the file extension that must be matched by all files for the corpus, "bdocjs" if empty or None
- fmt
- the format to use, if None, will be determined from the extension
- recursive
- if True (default) all matching files from all subdirectories are included
- sort
- if True, sort by file paths, if a function sort by that function (default: False)
- sort_reverse
- if sort is not False and this is True, sort in reverse order
- nparts
- only yield every nparts-th document (default 1: every document)
- partnr
- start with that index, before yielding every nparts-th document (default 0: start at beginning)
 Expand source codeclass DirFilesCorpus(Corpus, MultiProcessingAble): """ A corpus representing all files in a directory that match the given extension. """ def __init__(self, dirpath: str, ext: str = "bdocjs", fmt: Optional[str] = None, recursive: bool = True, sort: Union[bool, Callable] = False, sort_reverse: bool = False, nparts: int = 1, partnr: int = 0 ): """ Creates the DirCorpus. Args: dirpath: the directory path ext: the file extension that must be matched by all files for the corpus, "bdocjs" if empty or None fmt: the format to use, if None, will be determined from the extension recursive: if True (default) all matching files from all subdirectories are included sort: if True, sort by file paths, if a function sort by that function (default: False) sort_reverse: if sort is not False and this is True, sort in reverse order nparts: only yield every nparts-th document (default 1: every document) partnr: start with that index, before yieldieng every nparts-th document (default 0: start at beginning) """ if not ext: ext = "bdocjs" if not ext.startswith("."): ext = "." 
+ ext self.dirpath = dirpath self.ext = ext self.fmt = fmt if not os.path.exists(dirpath): raise Exception(f"Directory {dirpath} does not exist") if not os.path.isdir(dirpath): raise Exception(f"Not a directory: {dirpath}") self.paths = list(matching_paths(dirpath, exts=[ext], recursive=recursive)) if sort or nparts > 1: if callable(sort): self.paths.sort(key=sort, reverse=sort_reverse) else: self.paths.sort(reverse=sort_reverse) if nparts > 1: self.paths = [ p for idx, p in enumerate(self.paths) if ((idx - partnr) % nparts) == 0 ] self.size = len(self.paths) def __len__(self): return self.size def __getitem__(self, idx): assert isinstance(idx, int) path = self.paths[idx] abspath = os.path.join(self.dirpath, path) try: doc = Document.load(abspath, fmt=self.fmt) except Exception as ex: print(f"Error loading document from {abspath} using format {self.fmt}") raise ex doc.features[self.idxfeatname()] = idx # doc.features["__idx"] = idx # doc.features["__relpath"] = path # doc.features["__abspath"] = abspath return doc def __setitem__(self, idx, doc): """ Set the document for a specific index. Args: idx: the index of the document doc: the Document, if None, no action is performed and the existing document is left unchanged """ if doc is None: return assert isinstance(idx, int) assert isinstance(doc, Document) path = self.paths[idx] doc.save(os.path.join(self.dirpath, path), fmt=self.fmt)Ancestors- Corpus
- abc.ABC
- CorpusSourceBase
- collections.abc.Sized
- typing.Generic
- MultiProcessingAble
 Inherited members
- class DirFilesDestination (dirpath, path_from: Union[str, Callable] = 'default', ext: str = 'bdocjs', fmt=None)
- 
A destination where each document is stored in a file in a directory or directory tree in some known serialization format. The filename or path of the file can be derived from a document feature, the document name, the running number of file added, or any function that can derive a file path from the document and the running number. Create a destination to store documents in files inside a directory or directory tree. Args- dirpath
- the directory to contain the files
- path_from
- 
one of options listed below. If a string is used as a path name, then the forward slash is always used as the directory path separator, on all systems! - "default" (default) a heuristic which uses "relpath" if it is available, otherwise uses the index with at least 5 digits.
- "relpath": use the relative path used when creating the document, but replace the extension
- "idx": just use the index/running number of the added document as the base name
- "idx:5": use the index/running number with at least 5 digits in the name.
- "idx:10:2": use the index and organize a total of 10 digits into a hierarchical pathname of 2 levels, so 10:2 would mean the first 5 digits are for the name of the subdirectory and the second 5 digits are for the file base name. 10:3 would have for levels, the first subdirectory level with 1 digit, the next two with 3 digits and the remaining 3 digits for the filename. NOTE: "idx" by itself is equivalent to idx:1:1
- "feature:fname": use the document feature with the feature name fname as a relative path as is but add the extension
- "docname": use the document name as the relative path, but add extension.
- "minstem": use the relative path with all extensions replaced by the new extension
- somefunction: a function that should return the pathname (without extension) and should take two keyword arguments: doc (the document) and idx (the running index of the document).
 
- ext
- the file extension to add to all generated file names
- fmt
- the format to use for serializing the document, if None, will try to determine from the extension.
 class DirFilesDestination(DocumentDestination):
     """
     A destination where each document is stored in a file in a directory or directory tree in some
     known serialization format. The filename or path of the file can be derived from a document
     feature, the document name, the running number of file added, or any function that can derive
     a file path from the document and the running number.
     """

     def __init__(self, dirpath, path_from: Union[str, Callable] = "default", ext: str = "bdocjs", fmt=None):
         """
         Create a destination to store documents in files inside a directory or directory tree.

         Args:
             dirpath: the directory to contain the files
             path_from: one of options listed below. If a string is used as a path name, then the
                 forward slash is always used as the directory path separator, on all systems!

                 * "default" (default) a heuristic which uses "relpath" if it is available,
                   otherwise uses the index with at least 5 digits.
                 * "relpath": use the relative path used when creating the document, but replace
                   the extension
                 * "idx": just use the index/running number of the added document as the base name
                 * "idx:5": use the index/running number with at least 5 digits in the name.
                 * "idx:10:2": use the index and organize a total of 10 digits into a hierarchical
                   pathname of 2 levels, so 10:2 would mean the first 5 digits are for the name of
                   the subdirectory and the second 5 digits are for the file base name. 10:3 would
                   have four levels, the first subdirectory level with 1 digit, the next two with
                   3 digits and the remaining 3 digits for the filename.
                   NOTE: "idx" by itself is equivalent to idx:1:1
                 * "feature:fname": use the document feature with the feature name fname as a
                   relative path as is but add the extension
                 * "docname": use the document name as the relative path, but add extension.
                 * "minstem": use the relative path with all extensions replaced by the new
                   extension
                 * somefunction: a function that should return the pathname (without extension)
                   and should take two keyword arguments: doc (the document) and idx (the running
                   index of the document).
             ext: the file extension to add to all generated file names
             fmt: the format to use for serializing the document, if None, will try to determine
                 from the extension.

         Raises:
             Exception: if dirpath is not an existing directory or if path_from is not one of the
                 allowed values.
         """
         super().__init__()
         if not os.path.isdir(dirpath):
             raise Exception("Not a directory: ", dirpath)
         self.dirpath = dirpath
         self.idx = 0

         def pathmaker_default(doc=None, idx=None):
             # Prefer the relative path stored on the document, fall back to a 5-digit index.
             relpath = doc.features.get(self.relpathfeatname())
             if relpath:
                 return os.path.splitext(relpath)[0]
             return f"{idx:05d}"

         # The callable case has to be tested before any string method is used on path_from:
         # a function object has no startswith() and would otherwise raise an AttributeError.
         if callable(path_from):
             self.file_path_maker = path_from
         elif not isinstance(path_from, str):
             raise Exception(f"Not allowed for path_from: {path_from}")
         elif path_from.startswith("idx"):
             rest = path_from[3:]  # if we have digits or levels, there is a leading colon!
             if len(rest) == 0:
                 digits = 1
                 levels = 1
             else:
                 parms = rest.split(":")
                 # make sure a levels value is present when only the digits were given
                 parms.append("1")
                 digits, levels = parms[1:3]
                 digits = int(digits)
                 levels = int(levels)
             self.file_path_maker = maker_file_path_fromidx(digits, levels)
         elif path_from.startswith("feature"):
             # split only at the first colon so that the feature name itself may contain colons
             _, fname = path_from.split(":", 1)
             self.file_path_maker = lambda doc=None, idx=None: doc.features[fname]
         elif path_from == "default":
             self.file_path_maker = pathmaker_default
         elif path_from == "relpath":
             self.file_path_maker = \
                 lambda doc=None, idx=None: os.path.splitext(doc.features[self.relpathfeatname()])[0]
         elif path_from == "docname":
             self.file_path_maker = lambda doc=None, idx=None: doc.name
         elif path_from == "minstem":
             self.file_path_maker = lambda doc=None, idx=None: minstem(doc.features[self.relpathfeatname()])
         else:
             raise Exception(f"Not allowed for path_from: {path_from}")
         if not ext.startswith("."):
             ext = "." + ext
         self.ext = ext
         self.fmt = fmt

     def append(self, doc):
         """
         Add a document to the destination.

         Args:
             doc: the document or None, if None, no action is performed.
         """
         if doc is None:
             return
         assert isinstance(doc, Document)
         path = self.file_path_maker(doc=doc, idx=self.idx)
         path = os.path.normpath(path)  # convert forward slashes to backslashes on windows
         path = os.path.join(self.dirpath, path) + self.ext
         # Create the directories part of the path if necessary. exist_ok=True avoids the race
         # between checking for the directory and creating it when several writers share a tree.
         parent = os.path.dirname(path)
         if parent:
             os.makedirs(parent, exist_ok=True)
         Document.save(doc, path, fmt=self.fmt)
         self.idx += 1
         self._n += 1

     def close(self):
         pass

 # Ancestors: DocumentDestination
- contextlib.AbstractContextManager
- abc.ABC
 Methods- def append(self, doc)
- 
Add a document to the destination. Args- doc
- the document or None, if None, no action is performed.
 def append(self, doc):
     """
     Add a document to the destination.

     Args:
         doc: the document or None, if None, no action is performed.
     """
     if doc is None:
         return
     assert isinstance(doc, Document)
     path = self.file_path_maker(doc=doc, idx=self.idx)
     path = os.path.normpath(path)  # convert forward slashes to backslashes on windows
     path = os.path.join(self.dirpath, path) + self.ext
     # Create the directories part of the path if necessary. exist_ok=True avoids the race
     # between checking for the directory and creating it when several writers share a tree.
     parent = os.path.dirname(path)
     if parent:
         os.makedirs(parent, exist_ok=True)
     Document.save(doc, path, fmt=self.fmt)
     self.idx += 1
     self._n += 1
 Inherited members
- class DirFilesSource (dirpath: str, paths: Optional[Iterable[str]] = None, paths_from: Union[str, pathlib.Path, urllib.parse.ParseResult] = None, exts: Optional[Iterable[str]] = None, fmt: Optional[str] = None, recursive: bool = True, sort: Union[bool, Callable] = False, sort_reverse: bool = False, docname_from: Optional[str] = None, nparts: int = 1, partnr: int = 0)
- 
A document source which iterates over documents represented as files in a directory. Create a DirFilesSource. Args- dirpath
- the directory that contains the file to load as documents.
- paths
- if not None, must be an iterable of relative file paths to load from the directory
- paths_from
- if not None, must be a file or URL to load a list of relative file paths from
- exts
- an iterable of allowed file extensions or file extension regexps
- fmt
- the format to use for loading files. This is only useful if all files have the same format but the file extensions do not indicate the format.
- recursive
- recursively include paths from all subdirectories as well
- sort
- a boolean to indicate that paths should get processed in sort order, or a callable that will be used to extract the sort key. The paths always get sorted if nparts is > 1.
- sort_reverse
- if paths should get sorted in reverse order
- docname_from
- If not None set the document name from "basename", "stem" (basename without last extension) "minstem" (basename with all extensions removed) "relpath", "index" (sequence number of document within this part).
- nparts
- only yield every nparts-th document (default 1: every document)
- partnr
- start with that index, before yielding every nparts-th document (default 0: start at beginning)
 class DirFilesSource(DocumentSource, EveryNthBase, MultiProcessingAble):
     """
     A document source which iterates over documents represented as files in a directory.
     """

     def __init__(
             self,
             dirpath: str,
             paths: Optional[Iterable[str]] = None,
             paths_from: Union[str, Path, ParseResult] = None,
             exts: Optional[Iterable[str]] = None,
             fmt: Optional[str] = None,
             recursive: bool = True,
             sort: Union[bool, Callable] = False,
             sort_reverse: bool = False,
             docname_from: Optional[str] = None,
             nparts: int = 1,
             partnr: int = 0,
     ):
         """
         Create a DirFilesSource.

         Args:
             dirpath: the directory that contains the file to load as documents.
             paths: if not None, must be an iterable of relative file paths to load from the
                 directory. The iterable is copied, so the caller's object is never modified.
             paths_from: if not None, must be a file or URL to load a list of relative file paths
                 from. Empty lines are ignored.
             exts: an iterable of allowed file extensions, passed on to matching_paths when
                 neither paths nor paths_from is given
             fmt: the format to use for loading files. This is only useful if all files have the
                 same format but the file extensions do not indicate the format.
             recursive: recursively include paths from all subdirectories as well
             sort: a boolean to indicate that paths should get processed in sort order, or a
                 callable that will be used to extract the sort key. The paths always get sorted
                 if nparts is > 1.
             sort_reverse: if paths should get sorted in reverse order
             docname_from: If not None set the document name from "basename", "stem" (basename
                 without last extension) "minstem" (basename with all extensions removed)
                 "relpath", "index" (sequence number of document within this part).
             nparts: only yield every nparts-th document (default 1: every document)
             partnr: start with that index, before yielding every nparts-th document
                 (default 0: start at beginning)

         Raises:
             Exception: if both paths and paths_from are specified.
         """
         self.dirpath = dirpath
         if paths is not None and paths_from is not None:
             raise Exception("Parameters paths and paths_from cannot be both specified")
         super().__init__()
         EveryNthBase.__init__(self, nparts=nparts, partnr=partnr)
         if docname_from is not None:
             assert docname_from in ["basename", "relpath", "index", "stem", "minstem"]
         self.docname_from = docname_from
         if paths is not None:
             # Materialize into our own list: sorting below needs a list, the source must be
             # iterable more than once, and the caller's sequence must not be reordered.
             self.paths = list(paths)
         elif paths_from is not None:
             self.paths = []
             for pth in yield_lines_from(paths_from):
                 pth = pth.rstrip("\n\r")
                 # an empty line would resolve to the directory itself, so skip it
                 if pth:
                     self.paths.append(pth)
         else:
             self.paths = list(matching_paths(dirpath, exts=exts, recursive=recursive))
         # Sorting is forced when splitting into parts, so the selection below works on one
         # sorted order regardless of the order in which the paths were collected.
         if sort or nparts > 1:
             if callable(sort):
                 self.paths.sort(key=sort, reverse=sort_reverse)
             else:
                 self.paths.sort(reverse=sort_reverse)
         if nparts > 1:
             self.paths = [
                 p
                 for idx, p in enumerate(self.paths)
                 if ((idx - partnr) % nparts) == 0
             ]
         self.fmt = fmt

     def __iter__(self):
         """
         Yield the next document from the source.
         """
         self._n = 0
         for p in self.paths:
             fullpath = os.path.join(self.dirpath, p)
             doc = Document.load(fullpath, fmt=self.fmt)
             self.setrelpathfeature(doc, p)
             if self.docname_from:
                 if self.docname_from == "basename":
                     docname = os.path.basename(fullpath)
                 elif self.docname_from == "stem":
                     docname = Path(fullpath).stem
                 elif self.docname_from == "index":
                     docname = str(self._n)
                 elif self.docname_from == "relpath":
                     docname = p
                 elif self.docname_from == "minstem":
                     docname = minstem(fullpath)
                 doc.name = docname
             self._n += 1
             yield doc

 # Ancestors: DocumentSource
- abc.ABC
- collections.abc.Iterable
- typing.Generic
- CorpusSourceBase
- EveryNthBase
- MultiProcessingAble
 Inherited members
- class NumberedDirFilesCorpus (dirpath, digits=1, levels=1, ext='bdocjs', fmt=None, size=None, store_none=True)
- 
A corpus that represents files from a (nested) directory, where the filename is derived from the index number of the document. This corpus can represent missing elements as None, both on reading (when the corresponding expected document does not exist) and on writing (the corresponding document gets deleted). Creates the NumberedDirFilesCorpus. This corpus is able to return None for non-existing documents and remove document files by setting to None depending on the parameters. Args- dirpath
- the directory path
- digits
- the number of digits to use for the file path
- levels
- the number of levels to split the digits up which are then used as subdirectory names.
- ext
- the file extension used for all files in the corpus
- fmt
- the format to use, if None, determined from the extension
- size
- the size of the corpus. This can be used to create a corpus from an empty directory to contain only None elements initially. It can also be used to limit access to only the first size elements if the directory contains more documents.
- store_none
- if True, will store None in the corpus, i.e. remove the corresponding file from the directory. If False, will ignore the action and leave whatever is at the index unchanged.
 class NumberedDirFilesCorpus(Corpus, MultiProcessingAble):
     """
     A corpus that represents files from a (nested) directory, where the filename is derived from
     the index number of the document. This corpus can represent missing elements as None, both
     on reading (when the corresponding expected document does not exist) and on writing (the
     corresponding document gets deleted).
     """

     def __init__(
             self,
             dirpath,
             digits=1,
             levels=1,
             ext="bdocjs",
             fmt=None,
             size=None,
             store_none=True,
     ):
         """
         Creates the NumberedDirFilesCorpus. This corpus is able to return None for non-existing
         documents and remove document files by setting to None depending on the parameters.

         Args:
             dirpath: the directory path
             digits: the number of digits to use for the file path
             levels: the number of levels to split the digits up which are then used as
                 subdirectory names.
             ext: the file extension used for all files in the corpus
             fmt: the format to use, if None, determined from the extension
             size: the size of the corpus. This can be used to create a corpus from an empty
                 directory to contain only None elements initially. It can also be used to limit
                 access to only the first size elements if the directory contains more documents.
             store_none: if True, will store None in the corpus, i.e. remove the corresponding
                 file from the directory. If False, will ignore the action and leave whatever is
                 at the index unchanged.
         """
         if not ext.startswith("."):
             ext = "." + ext
         self.dirpath = dirpath
         self.ext = ext
         self.fmt = fmt
         self.size = size
         self.store_none = store_none
         self.file_path_maker = maker_file_path_fromidx(digits, levels)

     def __len__(self):
         # NOTE(review): size defaults to None, in which case len() raises a TypeError - confirm
         # whether callers are expected to always pass size.
         return self.size

     def _abspath_for(self, idx):
         """Return the path of the file for index idx, located inside the corpus directory."""
         # The index is passed by keyword, the same way DirFilesDestination calls its path maker.
         relpath = os.path.normpath(self.file_path_maker(idx=idx) + self.ext)
         return os.path.join(self.dirpath, relpath)

     def __getitem__(self, idx):
         """
         Load and return the document stored for index idx, or None if there is no file for it.
         """
         assert isinstance(idx, int)
         abspath = self._abspath_for(idx)
         # The existence check has to use the path inside dirpath, not the bare relative path,
         # which would be resolved against the current working directory.
         if not os.path.exists(abspath):
             return None
         doc = Document.load(abspath, fmt=self.fmt)
         doc.features[self.idxfeatname()] = idx
         return doc

     def __setitem__(self, idx, doc):
         """
         Store doc at index idx. If doc is None and store_none is True, the file for that index
         is removed if it exists; if store_none is False, nothing is changed.
         """
         assert isinstance(idx, int)
         assert doc is None or isinstance(doc, Document)
         abspath = self._abspath_for(idx)
         if doc is None:
             if self.store_none and os.path.exists(abspath):
                 os.remove(abspath)
             return
         # Nested levels put the file into subdirectories which may not exist yet.
         parent = os.path.dirname(abspath)
         if parent:
             os.makedirs(parent, exist_ok=True)
         Document.save(doc, abspath, fmt=self.fmt)

 # Ancestors: Corpus
- abc.ABC
- CorpusSourceBase
- collections.abc.Sized
- typing.Generic
- MultiProcessingAble
 Inherited members