# All new files used in this notebook will get created in directory tmp
import os
# Create the scratch directory in a single call: exist_ok=True makes this a no-op
# when the directory is already there and avoids the check-then-create race.
os.makedirs("tmp", exist_ok=True)


from gatenlp import Document
# Raw texts, some of them spanning several lines; each becomes one gatenlp Document
texts = [
    "This is the first text.",
    "Another text.\nThis one has two lines",
    "This is the third document.\nIt has three lines.\nThis line is the last one.\n",
    "And another document.",
]
docs = list(map(Document, texts))

# Create the List corpus
from gatenlp.corpora import ListCorpus

# the ListCorpus is built directly from the in-memory list of documents
lcorpus = ListCorpus(docs)
# final expression of the cell: the corpus length
len(lcorpus)

4


# All corpus instances support reading and assigning elements by index and
# report their length via len()

# replace the document at index 2 with a new one; the length stays the same
lcorpus[2] = Document("some other document")
print(len(lcorpus))
# final expression of the cell: the document at index 3
lcorpus[3]

4


# Corpus instances also have a store() method, which stores back a document that was
# fetched from the corpus without the caller having to know its index. This is accomplished
# by keeping the index it was retrieved from in a hidden document feature.
doc = lcorpus[2]
# idxfeatname() gives the name of that hidden feature; its value is the document's index
print(doc.features[lcorpus.idxfeatname()])

lcorpus.store(doc)  # store back the document
# NOTE: since this ListCorpus is wrapping an in-memory list, any change to the document is reflected
# in the corpus anyway, so storing back a changed document is not necessary here. The code above
# serves as an illustration of how this works with corpus classes where the documents are read/saved
# somewhere else.

2


# Some corpus classes also implement the append() method, which adds further documents
# to the corpus (just as elements can be added to a list).
# However, this is limited to specific corpus implementations and only allows appending
# to the end of the corpus.

lcorpus.append(Document("Another new document"))
# the corpus has grown by one document
print(len(lcorpus))

5


# Let's first save the in-memory ListCorpus as a single JSONL file using the BdocjsLinesFileDestination
from gatenlp.corpora import BdocjsLinesFileSource, BdocjsLinesFileDestination
bdocjsfile = os.path.join("tmp", "bdocjsfile.json")
# the destination is used as a context manager; documents are appended in corpus order
with BdocjsLinesFileDestination(bdocjsfile) as bdocjsdest:
    for doc in lcorpus:
        bdocjsdest.append(doc)


# Now let's test reading that file back: the source is iterated to get the documents,
# and the text of each one is printed
with BdocjsLinesFileSource(bdocjsfile) as bdocjssrc:
    for doc in bdocjssrc:
        print(doc.text)

This is the first text.
Another text.
This one has two lines
some other document
And another document.
Another new document


import os
from gatenlp.corpora import DirFilesSource, DirFilesDestination, DirFilesCorpus
dir1 = os.path.join("tmp", "dir1")
# The directory for a DirFilesDestination must exist. makedirs(..., exist_ok=True) creates it
# (including the parent "tmp" directory if that is missing) and does nothing if it is already there.
os.makedirs(dir1, exist_ok=True)

# The path_from="idx" setting makes the DirFilesDestination use the running number of the
# document as the file base name.
with BdocjsLinesFileSource(bdocjsfile) as src, \
        DirFilesDestination(dir1, ext="bdocjs", path_from="idx") as dest:
    # copy every document from the JSONL file into its own file in the directory
    for doc in src:
        dest.append(doc)

# Let's see what the content of the directory is now
# (os.listdir returns the entries in arbitrary order):
print(os.listdir(dir1))

['1.bdocjs', '3.bdocjs', '2.bdocjs', '4.bdocjs', '0.bdocjs']


# Read the documents back from the directory and print each one
with DirFilesSource(dir1) as src2:
    for doc in src2:
        print(doc)

Document(Another text.
This one has two lines,features=Features({'_relpath': '1.bdocjs'}),anns=[])
Document(And another document.,features=Features({'_relpath': '3.bdocjs'}),anns=[])
Document(some other document,features=Features({'_relpath': '2.bdocjs'}),anns=[])
Document(Another new document,features=Features({'_relpath': '4.bdocjs'}),anns=[])
Document(This is the first text.,features=Features({'_relpath': '0.bdocjs'}),anns=[])


# Open the directory as a corpus: as shown below, it supports len(), iteration and
# assignment by index.
corp1 = DirFilesCorpus(dir1)


# we can get the length
print("length is:", len(corp1))

# we can iterate over the documents in it:
print("Original documents:")
for doc in corp1:
    print(doc)

# but we can also update each element, which saves the corresponding document to the original
# file in the directory it was loaded from. Here we add an annotation and a document feature
# to each document in the corpus.
for idx, doc in enumerate(corp1):
    doc.features["docidx"] = idx
    doc.annset().add(0, 3, "Type1")
    corp1[idx] = doc  # !! this is what updates the document file in the directory

# The files in the directory now contain the modified documents. Let's open them again and show
# them using a fresh DirFilesSource. The loop must iterate over this new source (src3), not over
# src2 from the earlier cell, whose "with" block has already been exited.
with DirFilesSource(dir1) as src3:
    print("Updated documents:")
    for doc in src3:
        print(doc)

length is: 5
Original documents:
Document(Another text.
This one has two lines,features=Features({'__idx_139813269566096': 0}),anns=[])
Document(And another document.,features=Features({'__idx_139813269566096': 1}),anns=[])
Document(some other document,features=Features({'__idx_139813269566096': 2}),anns=[])
Document(Another new document,features=Features({'__idx_139813269566096': 3}),anns=[])
Document(This is the first text.,features=Features({'__idx_139813269566096': 4}),anns=[])
Updated documents:
Document(Another text.
This one has two lines,features=Features({'docidx': 0, '_relpath': '1.bdocjs'}),anns=['':1])
Document(And another document.,features=Features({'docidx': 1, '_relpath': '3.bdocjs'}),anns=['':1])
Document(some other document,features=Features({'docidx': 2, '_relpath': '2.bdocjs'}),anns=['':1])
Document(Another new document,features=Features({'docidx': 3, '_relpath': '4.bdocjs'}),anns=['':1])
Document(This is the first text.,features=Features({'docidx': 4, '_relpath': '0.bdocjs'}),anns=['':1])


from gatenlp.visualization import CorpusViewer

# Create a viewer for browsing the documents of the corpus and show it
viewer = CorpusViewer(corp1)
viewer.show()


import json
from gatenlp.corpora import JsonLinesFileDestination, JsonLinesFileSource

# Let's first create a JSONL file: one JSON object per line
jsonlfile = os.path.join("tmp", "jsonlfile.json")
data = [
    {"text": "this is some text", "f1": 12, "f2": "some string"},
    {"text": "another text", "f1": 3, "f2": "also a string"},
]
with open(jsonlfile, "wt") as outfp:
    # serialize each record and terminate it with a newline
    outfp.writelines(json.dumps(record) + "\n" for record in data)


# Read the JSONL file and convert each line to a document: the "text" field becomes the document
# text, the fields listed in feature_fields become document features, and the fields listed in
# data_fields are collected in the "__data" feature (see the printed documents).
docs = []   # save documents in there for later
with JsonLinesFileSource(jsonlfile, text_field="text", feature_fields=["f1"], data_fields=["f2"]) as src:
    for doc in src:
        print(doc)
        docs.append(doc)

Document(this is some text,features=Features({'f1': 12, '__data': {'f2': 'some string'}}),anns=[])
Document(another text,features=Features({'f1': 3, '__data': {'f2': 'also a string'}}),anns=[])


# Conversely, a sequence of documents can be written to a JSON lines file by saving the text
# into some field and (selected) document features into other fields
with JsonLinesFileDestination(
    jsonlfile,
    text_field="text",
    feature_fields=["f1"],
    data_fields=["f2"],
    data_feature=None,
) as dest:
    for doc in docs:
        dest.append(doc)

# dump the file verbatim to check what was written
with open(jsonlfile, "rt") as infp:
    print(infp.read(), end="")

{"f1": 12, "f2": "some string", "text": "this is some text"}
{"f1": 3, "f2": "also a string", "text": "another text"}


# However, by default, if data_feature is not set to None, the data fields are kept in a hidden
# feature "__data": with data_fields=True, all fields other than the text field are stored there,
# and the JsonLinesFileDestination writes those fields back to JSON from there:

docs = []   # save documents in there for later
with JsonLinesFileSource(jsonlfile, text_field="text", data_fields=True) as src:
    print("Documents:")
    for doc in src:
        print(doc)
        docs.append(doc)

# write the documents back to the same file
with JsonLinesFileDestination(jsonlfile, text_field="text", data_fields=True) as dest:
    for doc in docs:
        dest.append(doc)

# show the resulting file line by line
with open(jsonlfile, "rt") as infp:
    print("JSON lines:")
    for l in infp:
        print(l, end="")

Documents:
Document(this is some text,features=Features({'__data': {'f1': 12, 'f2': 'some string'}}),anns=[])
Document(another text,features=Features({'__data': {'f1': 3, 'f2': 'also a string'}}),anns=[])
JSON lines:
{"f1": 12, "f2": "some string", "text": "this is some text"}
{"f1": 3, "f2": "also a string", "text": "another text"}


# Let's load documents from a TSV file on a web page. This TSV file has three columns and a header
# line which gives the names "text", "feat1" and "feat2" to the columns.
# We create the documents by fetching the text from column "text" and creating two document features
# with names "f1" and "f2" from the columns "feat1" and "feat2":
from gatenlp.corpora import TsvFileSource
tsvsrc = TsvFileSource("https://gatenlp.github.io/python-gatenlp/tsvcorpus_example1.tsv",
                      text_col="text", feature_cols=dict(f1="feat1", f2="feat2"))



# iterate over the source to get one document per data row
for doc in tsvsrc:
    print(doc)

Document(Here is some text. Like with JSON, newlines are escaped:\nHere is another line.,features=Features({'f1': 'fval1', 'f2': 'fval2'}),anns=[])
Document(Another text\nThis one\nhas more\n\nlines.,features=Features({'f1': '11', 'f2': '22'}),anns=[])
Document(And another.,features=Features({'f1': 'a', 'f2': 'b'}),anns=[])


# clean up after ourselves
#import shutil
#shutil.rmtree("tmp")


import gatenlp
# record the gatenlp version this notebook was last run with
print("NB last updated with gatenlp version", gatenlp.__version__)

NB last updated with gatenlp version 1.0.8a1

Python GateNLP

Document Corpora, Document Source and Destination

List Corpus

BdocjsLinesFileSource and BdocjsLinesFileDestination

Document sources

Document destinations

DirFilesSource, DirFilesDestination, DirFilesCorpus

Viewing and browsing a Corpus

CorpusViewer Screenshot

JsonLinesFileSource and JsonLinesFileDestination

TsvFileSource

Notebook last updated