Module gatenlp.corpora.files
Module that defines Corpus and DocumentSource/DocumentDestination classes which access documents as lines or parts in a file.
Source code
"""
Module that defines Corpus and DocumentSource/DocumentDestination classes which access documents
as lines or parts in a file.
"""
from typing import Optional, Union, List, Dict, IO
import json
from gatenlp.urlfileutils import yield_lines_from
from gatenlp.document import Document
from gatenlp.corpora.base import DocumentSource, DocumentDestination
from gatenlp.corpora.base import MultiProcessingAble
class BdocjsLinesFileSource(DocumentSource, MultiProcessingAble):
    """
    A document source which reads one bdoc json serialization of a document from each line of the given file.
    """
    def __init__(self, file):
        """
        Create a BdocjsLinesFileSource.

        Args:
            file: the file path (a string).
        """
        super().__init__()
        self.file = file
        self.fh = open(self.file, "rt", encoding="utf-8")

    def __enter__(self):
        return self

    def __exit__(self, extype, value, traceback):
        self.fh.close()

    def close(self):
        self.fh.close()

    def __iter__(self):
        for line in self.fh:
            self._n += 1
            yield Document.load_mem(line, fmt="json")
class BdocjsLinesFileDestination(DocumentDestination):
    """
    Writes one line of JSON per document to a single output file.
    """
    def __init__(self, file):
        """
        Args:
            file: the file to write to. If it exists, it gets overwritten without warning.
                Expected to be a string or an open file handle.
        """
        super().__init__()
        if isinstance(file, str):
            self.fh = open(file, "wt", encoding="utf-8")
        else:
            self.fh = file

    def __enter__(self):
        return self

    def __exit__(self, extype, value, traceback):
        self.fh.close()

    def append(self, doc):
        """
        Append a document to the destination.

        Args:
            doc: the document; if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        self.fh.write(doc.save_mem(fmt="json"))
        self.fh.write("\n")
        self._n += 1

    def close(self):
        self.fh.close()
def _update_dict_from_dict_4spec(todict, fromdict, spec, exclude_key=None, exclude4underscore=False):
    """
    Helper function for updating the todict dict-like object from the fromdict dict-like object,
    according to spec. The spec is either None/False (do not update anything), True (include all
    keys except the exclude_key and, if exclude4underscore is True, keys starting with an
    underscore), a list of keys to copy (if they are present in fromdict), or a dictionary
    mapping names in fromdict to the key names to use in todict.

    Returns:
        nothing, the todict dictionary is updated in place
    """
    if not spec:
        return
    if spec is True:
        spec = {}
        for k in fromdict.keys():
            if k != exclude_key:
                if exclude4underscore and k.startswith("_"):
                    continue
                spec[k] = k
    elif isinstance(spec, list):
        specnew = {}
        for k in spec:
            if k != exclude_key:
                if exclude4underscore and k.startswith("_"):
                    continue
                specnew[k] = k
        spec = specnew
    elif not isinstance(spec, dict):
        raise Exception(f"Must specify None, boolean, a list of names or a map of names to names, not {spec}")
    for kfrom, kto in spec.items():
        if kfrom in fromdict:
            todict[kto] = fromdict[kfrom]
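# Illustrative example (not part of the module): copy selected keys, renaming "a" to "x":
#   d = {}
#   _update_dict_from_dict_4spec(d, {"a": 1, "b": 2, "_c": 3}, {"a": "x"})
#   assert d == {"x": 1}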
class JsonLinesFileSource(DocumentSource, MultiProcessingAble):
    """
    A document source which reads one json serialization per line, creates a document from one field
    in the json and optionally stores all or a selection of remaining fields as document features or
    into a single document feature "__data".
    """
    def __init__(self,
                 file: str,
                 text_field: str = "text",
                 feature_fields: Optional[Union[bool, List[str], Dict[str, str]]] = None,
                 data_fields: Optional[Union[bool, List[str], Dict[str, str]]] = None,
                 data_feature: Optional[str] = "__data"):
        """
        Create a JsonLinesFileSource.

        Args:
            file: the file path (a string).
            text_field: the field name where to get the document text from. If a json object does not
                contain this field, the empty string is used instead.
            feature_fields: if not None and not False: either a list of field names which will get
                stored as features with the same name, or a dictionary mapping json fields to feature
                names, or True to indicate that all fields (except the one containing the document text
                and fields where the field name starts with an underscore) get stored as features.
            data_fields: if not None and not False: either a list of field names which will get stored
                in the data feature as fields with the same name, or a dictionary mapping json fields to
                new names, or True to indicate that all fields (except the one containing the document
                text) get stored in the data feature. The data feature should be a transient feature
                (its name starts with two underscores); the name is specified through the data_feature
                parameter.
            data_feature: the name of the data feature if used (if None, "__data" is used)
        """
        super().__init__()
        self.file = file
        self.text_field = text_field
        self.feature_fields = feature_fields
        self.data_fields = data_fields
        if data_feature is None:
            data_feature = "__data"
        self.data_feature = data_feature
        self.fh: IO = open(self.file, "rt", encoding="utf-8")

    def __iter__(self):
        for line in self.fh:
            data = json.loads(line)
            text = data.get(self.text_field, "")
            doc = Document(text)
            _update_dict_from_dict_4spec(
                doc.features, data, self.feature_fields,
                exclude_key=self.text_field, exclude4underscore=self.feature_fields is True)
            if self.data_fields:
                doc.features[self.data_feature] = {}
                _update_dict_from_dict_4spec(
                    doc.features[self.data_feature], data, self.data_fields,
                    exclude_key=self.text_field, exclude4underscore=False)
            self._n += 1
            yield doc

    def __enter__(self):
        return self

    def __exit__(self, extype, value, traceback):
        self.fh.close()

    def close(self):
        self.fh.close()
class JsonLinesFileDestination(DocumentDestination):
    """
    Writes one line of JSON per document to a single output file. The field designated for the
    document receives either just the document text or the full bdocjs serialization; other json
    fields can be filled from document features and from the "__data" document feature.
    """
    def __init__(self,
                 file: Union[str, IO],
                 text_field: str = "text",
                 document_bdocjs: bool = False,
                 feature_fields: Optional[Union[bool, List[str], Dict[str, str]]] = None,
                 data_fields: Optional[Union[bool, List[str], Dict[str, str]]] = None,
                 data_feature="__data"):
        """
        Args:
            file: the file to write to. If it exists, it gets overwritten without warning.
                Expected to be a string or an open file handle.
            text_field: the name of the json field that will contain the document: either just the
                text or the bdocjs representation if document_bdocjs is True.
            document_bdocjs: if True, store the bdocjs serialization in the text_field instead of
                just the text
            feature_fields: if not None and not False: either a list of feature names which will get
                stored as fields with the same name, or a dictionary mapping feature names to field
                names, or True to indicate that all features (except the one containing the document
                text and features where the name starts with an underscore) get stored as fields.
            data_fields: if not None and not False: either a list of feature names from the data
                feature which will get stored as fields with the same name, or a dictionary mapping
                feature names to field names, or True to indicate that all features (except the one
                containing the document text) get stored as fields.
            data_feature: the name of the data feature if used (if None, "__data" is used)
        """
        super().__init__()
        if isinstance(file, str):
            self.fh = open(file, "wt", encoding="utf-8")
        else:
            self.fh = file
        self.text_field = text_field
        self.document_bdocjs = document_bdocjs
        self.feature_fields = feature_fields
        self.data_fields = data_fields
        if data_feature is None:
            data_feature = "__data"
        self.data_feature = data_feature

    def __enter__(self):
        return self

    def __exit__(self, _extype, _value, _traceback):
        self.fh.close()

    def append(self, doc):
        """
        Append a document to the destination.

        Args:
            doc: the document; if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        data = {}
        _update_dict_from_dict_4spec(
            data,
            doc.features,
            self.feature_fields,
            exclude_key=self.text_field, exclude4underscore=self.feature_fields is True)
        _update_dict_from_dict_4spec(
            data,
            doc.features.get(self.data_feature, {}), self.data_fields,
            exclude_key=self.text_field, exclude4underscore=False)
        # assign the document field last so it overwrites anything that comes from the data feature!
        if self.document_bdocjs:
            data[self.text_field] = doc.save_mem(fmt="json")
        else:
            data[self.text_field] = doc.text
        self.fh.write(json.dumps(data))
        self.fh.write("\n")
        self._n += 1

    def close(self):
        self.fh.close()
class TsvFileSource(DocumentSource, MultiProcessingAble):
    """
    A TsvFileSource is a DocumentSource which is a single TSV file with a fixed number of tab-separated
    values per row. Each document in sequence is created from the text in one of the columns and
    document features can be set from arbitrary columns as well.
    """
    # TODO: better implementation where we make explicit use of the context manager and iterator
    def __init__(self, source, hdr=True, text_col=None, feature_cols=None, data_cols=None, data_feature="__data"):
        """
        Creates the TsvFileSource.

        Args:
            source: a file path or URL
            hdr: if True (default), expects a header line with the column names; if a list, it is used
                as the list of column names; if False/None, no header line is expected.
            text_col: the column which contains the text for creating the document. Either the column
                number, or the name of the column (only possible if there is a header line), or a
                function that should take the list of fields and arbitrary kwargs and return the text.
                Also passes "cols" and "n" as keyword arguments.
            feature_cols: if not None, must be either a dictionary mapping document feature names to the
                column numbers or column names of where to get the feature value from;
                or a function that should take the list of fields and arbitrary kwargs and return a
                dictionary with the features. Also passes "cols" (dict mapping column names to column
                indices, or None) and "n" (current line number) as keyword arguments.
            data_cols: if not None, either an iterable of the names of columns to store in the special
                document feature "__data", or True to store all columns. At the moment this only works
                if the tsv file has a header line. If a list of names is given, the values are stored as
                a dictionary mapping those names to values; if True, the whole list of field values is
                stored in the original column order.
            data_feature: the name of the document feature where to store the data, default is "__data"
        """
        super().__init__()
        assert text_col is not None
        self.hdr = hdr
        self.text_col = text_col
        self.feature_cols = feature_cols
        self.data_cols = data_cols
        self.source = source
        self.hdr2col = {}
        self.nlines = 0
        if data_cols and not hdr:
            raise Exception("Header must be present if data_cols should be used")
        self.data_feature = data_feature

    def __iter__(self):
        reader = yield_lines_from(self.source)
        if self.hdr and self.nlines == 0:
            self.nlines += 1
            self.hdr = next(reader).rstrip("\n\r").split("\t")
        if self.hdr:
            self.hdr2col = {name: idx for idx, name in enumerate(self.hdr)}
        for line in reader:
            line = line.rstrip("\n\r")
            fields = line.split("\t")
            if isinstance(self.text_col, int):
                text = fields[self.text_col]
            elif callable(self.text_col):
                text = self.text_col(fields, cols=self.hdr2col, n=self.n)
            else:
                text = fields[self.hdr2col[self.text_col]]
            doc = Document(text)
            if self.feature_cols:
                if callable(self.feature_cols):
                    doc.features.update(
                        self.feature_cols(fields, cols=self.hdr2col, n=self.n)
                    )
                else:
                    for fname, colid in self.feature_cols.items():
                        if isinstance(colid, int):
                            value = fields[colid]
                        else:
                            value = fields[self.hdr2col[colid]]
                        doc.features[fname] = value
            if self.data_cols:
                if isinstance(self.data_cols, list):
                    data = {}
                    for cname in self.data_cols:
                        if isinstance(cname, str):
                            data[cname] = fields[self.hdr2col[cname]]
                        else:
                            # assume it is the column index!
                            data[cname] = fields[cname]
                else:
                    data = fields
                doc.features[self.data_feature] = data
            self.nlines += 1
            self._n += 1
            yield doc

    def __enter__(self):
        return self

    def __exit__(self, extype, value, traceback):
        pass

    def close(self):
        pass
Classes
class BdocjsLinesFileDestination (file)
-
Writes one line of JSON per document to a single output file.
Args
file
- the file to write to. If it exists, it gets overwritten without warning. Expected to be a string or an open file handle.
Ancestors
- DocumentDestination
- contextlib.AbstractContextManager
- abc.ABC
Methods
def append(self, doc)
-
Append a document to the destination.
Args
doc
- the document; if None, no action is performed.
Inherited members
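A minimal usage sketch (the output path "docs.bdocjs.jsonl" is a hypothetical example):

from gatenlp.document import Document
from gatenlp.corpora.files import BdocjsLinesFileDestination

with BdocjsLinesFileDestination("docs.bdocjs.jsonl") as dest:
    for text in ["First document.", "Second document."]:
        dest.append(Document(text))  # one bdoc json line per document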
class BdocjsLinesFileSource (file)
-
A document source which reads one bdoc json serialization of a document from each line of the given file.
Create a BdocjsLinesFileSource.
Args
file
- the file path (a string).
Ancestors
- DocumentSource
- abc.ABC
- collections.abc.Iterable
- typing.Generic
- CorpusSourceBase
- MultiProcessingAble
Methods
def close(self)
-
Inherited members
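A minimal usage sketch, assuming a file like the one written in the example above:

from gatenlp.corpora.files import BdocjsLinesFileSource

with BdocjsLinesFileSource("docs.bdocjs.jsonl") as src:
    for doc in src:
        print(doc.text)  # each line is loaded back into a Document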
class JsonLinesFileDestination (file: Union[str, IO], text_field: str = 'text', document_bdocjs: bool = False, feature_fields: Union[bool, List[str], Dict[str, str], None] = None, data_fields: Union[bool, List[str], Dict[str, str], None] = None, data_feature='__data')
-
Writes one line of JSON per document to a single output file. The field designated for the document receives either just the document text or the full bdocjs serialization; other json fields can be filled from document features and from the "__data" document feature.
Args
file
- the file to write to. If it exists, it gets overwritten without warning. Expected to be a string or an open file handle.
text_field
- the name of the json field that will contain the document: either just the text or the bdocjs representation if document_bdocjs is True.
document_bdocjs
- if True, store the bdocjs serialization in the text_field instead of just the text
feature_fields
- if not None and not False: either a list of feature names which will get stored as fields with the same name, or a dictionary mapping feature names to field names, or True to indicate that all features (except the one containing the document text and features where the name starts with an underscore) get stored as fields.
data_fields
- if not None and not False: either a list of feature names from the data feature which will get stored as fields with the same name, or a dictionary mapping feature names to field names, or True to indicate that all features (except the one containing the document text) get stored as fields.
data_feature
- the name of the data feature if used (if None, "__data" is used)
Ancestors
- DocumentDestination
- contextlib.AbstractContextManager
- abc.ABC
Methods
def append(self, doc)
-
Append a document to the destination.
Args
doc
- the document; if None, no action is performed.
Inherited members
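A minimal usage sketch; the output path and the "lang" feature are illustrative assumptions:

from gatenlp.document import Document
from gatenlp.corpora.files import JsonLinesFileDestination

doc = Document("Some text.")
doc.features["lang"] = "en"  # hypothetical feature
with JsonLinesFileDestination("out.jsonl", text_field="text",
                              feature_fields=["lang"]) as dest:
    dest.append(doc)  # writes {"lang": "en", "text": "Some text."}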
class JsonLinesFileSource (file: str, text_field: str = 'text', feature_fields: Union[bool, List[str], Dict[str, str], None] = None, data_fields: Union[bool, List[str], Dict[str, str], None] = None, data_feature: Optional[str] = '__data')
-
A document source which reads one json serialization per line, creates a document from one field in the json and optionally stores all or a selection of remaining fields as document features or into a single document feature "__data".
Create a JsonLinesFileSource.
Args
file
- the file path (a string).
text_field
- the field name where to get the document text from. If a json object does not contain this field, the empty string is used instead.
feature_fields
- if not None and not False: either a list of field names which will get stored as features with the same name, or a dictionary mapping json fields to feature names, or True to indicate that all fields (except the one containing the document text and fields where the field name starts with an underscore) get stored as features.
data_fields
- if not None and not False: either a list of field names which will get stored in the data feature as fields with the same name, or a dictionary mapping json fields to new names, or True to indicate that all fields (except the one containing the document text) get stored in the data feature. The data feature should be a transient feature (its name starts with two underscores); the name is specified through the data_feature parameter.
data_feature
- the name of the data feature if used (if None, "__data" is used)
Ancestors
- DocumentSource
- abc.ABC
- collections.abc.Iterable
- typing.Generic
- CorpusSourceBase
- MultiProcessingAble
Methods
def close(self)
-
Inherited members
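A minimal usage sketch, assuming a hypothetical file "in.jsonl" where each line looks like {"text": "...", "lang": "en", "id": 1}; "lang" becomes a document feature and "id" is kept in the "__data" feature:

from gatenlp.corpora.files import JsonLinesFileSource

with JsonLinesFileSource("in.jsonl", text_field="text",
                         feature_fields=["lang"], data_fields=["id"]) as src:
    for doc in src:
        print(doc.text, doc.features.get("lang"), doc.features["__data"])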
class TsvFileSource (source, hdr=True, text_col=None, feature_cols=None, data_cols=None, data_feature='__data')
-
A TsvFileSource is a DocumentSource which is a single TSV file with a fixed number of tab-separated values per row. Each document in sequence is created from the text in one of the columns and document features can be set from arbitrary columns as well.
Creates the TsvFileSource.
Args
source
- a file path or URL
hdr
- if True (default), expects a header line with the column names; if a list, it is used as the list of column names; if False/None, no header line is expected.
text_col
- the column which contains the text for creating the document. Either the column number, or the name of the column (only possible if there is a header line), or a function that should take the list of fields and arbitrary kwargs and return the text. Also passes "cols" and "n" as keyword arguments.
feature_cols
- if not None, must be either a dictionary mapping document feature names to the column numbers or column names of where to get the feature value from; or a function that should take the list of fields and arbitrary kwargs and return a dictionary with the features. Also passes "cols" (dict mapping column names to column indices, or None) and "n" (current line number) as keyword arguments.
data_cols
- if not None, either an iterable of the names of columns to store in the special document feature "__data", or True to store all columns. At the moment this only works if the tsv file has a header line. If a list of names is given, the values are stored as a dictionary mapping those names to values; if True, the whole list of field values is stored in the original column order.
data_feature
- the name of the document feature where to store the data, default is "__data"
Ancestors
- DocumentSource
- abc.ABC
- collections.abc.Iterable
- typing.Generic
- CorpusSourceBase
- MultiProcessingAble
Methods
def close(self)
-
Inherited members
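A minimal usage sketch over a hypothetical TSV file "data.tsv" whose header line contains the columns "id", "text" and "label"; the document text is taken from the "text" column and the label is stored as a document feature:

from gatenlp.corpora.files import TsvFileSource

src = TsvFileSource("data.tsv", hdr=True, text_col="text",
                    feature_cols={"label": "label"})
for doc in src:
    print(doc.text, doc.features["label"])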