from gatenlp import Document


doc1 = Document("This is a small test document")


print(doc1)

Document(This is a small test document,features=Features({}),anns=[])


# from IPython.display import display
doc1


doc1.show()


doc2 = Document.load('./data/document-testing.txt')
doc2


doc1.save("myfirstdocument.bdocjs")

with open("myfirstdocument.bdocjs", "rt", encoding="utf-8") as infp:
    print(infp.read())

{"annotation_sets": {}, "text": "This is a small test document", "features": {}, "offset_type": "p", "name": ""}


doc1.save("myfirstdocument.bdocym")  # use YAML serialization

with open("myfirstdocument.bdocym", "rt", encoding="utf-8") as infp:
    print(infp.read())

annotation_sets: {}
features: {}
name: ''
offset_type: p
text: This is a small test document


# Can also "save" to memory/string, here the format is needed!
doc1.save_mem(fmt="bdocjs")

'{"annotation_sets": {}, "text": "This is a small test document", "features": {}, "offset_type": "p", "name": ""}'


import datetime
doc1.features["loading_date"] = str(datetime.datetime.now())
doc1.features["purpose"] = "Testing gatenlp."
doc1.features["numeric_value"] = 22
doc1.features["dict_of_objects"] = {"dict_key": "dict_value", "a_list": [1,2,3,4,5]}
doc1.features["_tmp1"] = "some value"
doc1.features["__tmp2"] = 12345
doc1


print("1:", doc1.features["purpose"])
print("2:", doc1.features.get("doesntexist"))
print("3:", doc1.features.get("doesntexist", "NA!"))

1: Testing gatenlp.
2: None
3: NA!


for name, value in doc1.features.items():
    print(f"{name}: {value}")

loading_date: 2021-02-27 10:36:27.181482
purpose: Testing gatenlp.
numeric_value: 22
dict_of_objects: {'dict_key': 'dict_value', 'a_list': [1, 2, 3, 4, 5]}
_tmp1: some value
__tmp2: 12345


import pprint, json

js_str = doc1.save_mem(fmt="bdocjs")
js = json.loads(js_str)
pprint.pprint(js)

{'annotation_sets': {},
 'features': {'_tmp1': 'some value',
              'dict_of_objects': {'a_list': [1, 2, 3, 4, 5],
                                  'dict_key': 'dict_value'},
              'loading_date': '2021-02-27 10:36:27.181482',
              'numeric_value': 22,
              'purpose': 'Testing gatenlp.'},
 'name': '',
 'offset_type': 'p',
 'text': 'This is a small test document'}


# create and get an annotation set with the name "Set1"
annset = doc1.annset("Set1")
# Now add an annotation; this method returns the newly created annotation
annset.add(0,4,"AnnType1")

Annotation(0,4,AnnType1,features=Features({}),id=0)


annset.add(0, 4, "Token", {"id": "token1'"})
annset.add(5, 7, "Token", {"id": "token2'"})
annset.add(8, 9, "Token", {"id": "token3'"})
annset.add(10, 15, "Token", {"id": "token4'"})
annset.add(16, 20, "Token", {"id": "token5"})
annset.add(21, 29, "Token", {"id": "token6"})
annset.add(0, 29, "Sentence", {"what": "The first 'sentence' annotation"});
for ann in annset:
    print(ann)

Annotation(0,4,AnnType1,features=Features({}),id=0)
Annotation(0,4,Token,features=Features({'id': "token1'"}),id=1)
Annotation(0,29,Sentence,features=Features({'what': "The first 'sentence' annotation"}),id=7)
Annotation(5,7,Token,features=Features({'id': "token2'"}),id=2)
Annotation(8,9,Token,features=Features({'id': "token3'"}),id=3)
Annotation(10,15,Token,features=Features({'id': "token4'"}),id=4)
Annotation(16,20,Token,features=Features({'id': 'token5'}),id=5)
Annotation(21,29,Token,features=Features({'id': 'token6'}),id=6)


doc1


# Demonstrates the different ways of removing annotations from "Set1":
# by annotation object, by single id, by list of ids, and all at once.
ann0 = annset.get(0)    # get by annotation id
print("Annotation id=0:", ann0)
annset.remove(ann0)     # remove by passing the annotation object itself (here ann0)
ann1 = annset.get(1)
print("Annotation id=1:", ann1)
annset.remove(1)   # remove the annotation with the given id
annset.remove([2,3,4])  # remove several annotations at once, given a list of ids
print("After some anns removed ", annset)
annset.clear()  # empty the set; the next print shows what is left in it
print("After set cleared: ", annset)
doc1.remove_annset("Set1")  # finally remove the set named "Set1" from the document

Annotation id=0: Annotation(0,4,AnnType1,features=Features({}),id=0)
Annotation id=1: Annotation(0,4,Token,features=Features({'id': "token1'"}),id=1)
After some anns removed  AnnotationSet([Annotation(0,29,Sentence,features=Features({'what': "The first 'sentence' annotation"}),id=7), Annotation(16,20,Token,features=Features({'id': 'token5'}),id=5), Annotation(21,29,Token,features=Features({'id': 'token6'}),id=6)])
After set cleared:  AnnotationSet([])


doc3 = Document.load("data/ann-relations.bdocjs")
doc3.show(htmlid="view1")


# make a variable for each annotation type
# The lower-cased type name becomes the variable name (type names such as
# "Ann3" end up as ann3) and is bound to whatever for_idx(0) returns for the
# annotations of that type in "set1". Later cells use these names directly.
for anntype in list(doc3.annset("set1").type_names):
    # vars() without arguments is the current namespace; at the top level of a
    # script or notebook cell that is the global namespace, so assigning into
    # it creates a global variable. This would not work inside a function.
    vars()[anntype.lower()] = doc3.annset("set1").with_type(anntype).for_idx(0)


print("Ann2 isoverlapping Ann1:", ann2.isoverlapping(ann1))
print("Ann2 isbefore Ann3:", ann2.isbefore(ann3))
print("Ann3 isafter Ann2:", ann3.isafter(ann2))
print("Ann1 iscovering Ann5:", ann1.iscovering(ann5))
print("Ann3 iscoextensive Ann9:", ann3.iscoextensive(ann9))
print("Ann6 iswithin Ann1:", ann6.iswithin(ann1))
print("Ann4 isrightoverlapping Ann1:", ann4.isrightoverlapping(ann1))

Ann2 isoverlapping Ann1: True
Ann2 isbefore Ann3: True
Ann3 isafter Ann2: True
Ann1 iscovering Ann5: True
Ann3 iscoextensive Ann9: True
Ann6 iswithin Ann1: True
Ann4 isrightoverlapping Ann1: True


from gatenlp import Span
span1 = Span(3,4)
span2 = ann2.span
span3 = doc3.annset("set1").span
span4 = Span(ann5)
print([f"span{i}: {s}" for i, s in enumerate([span1, span2, span3, span4])])

['span0: Span(3,4)', 'span1: Span(0,6)', 'span2: Span(0,45)', 'span3: Span(12,18)']


set1 = doc3.annset("set1") # "attached" set
print("Within Ann1: ", [a.type for a in set1.within(ann1)])
print("Coextensive with Ann3:", [a.type for a in set1.coextensive(ann3)])
print("Coextensive with span of Ann3:", [a.type for a in set1.coextensive(ann3.span)])

Within Ann1:  ['Ann10', 'Ann5', 'Ann3', 'Ann7', 'Ann9', 'Ann11', 'Ann6', 'Ann8', 'Ann12']
Coextensive with Ann3: ['Ann9']
Coextensive with span of Ann3: ['Ann3', 'Ann9']


print("Size of set1:", len(set1))
subset1 = set1.within(ann1)
print("Size of subset1:", len(subset1))

Size of set1: 12
Size of subset1: 9


# try to add an annotation to subset1:
try:
    subset1.add(2,3,"ANewOne")
except Exception as ex:
    print("Got exception:", ex)

Got exception: Cannot add an annotation to an immutable annotation set


# make the set mutable and try again
# subset1 was obtained from set1.within(ann1); clearing its immutable flag
# allows adding to subset1.
subset1.immutable = False
subset1.add(2,3,"ANewOne")
# Compare both sizes after the add: in the recorded run only subset1 grew
# (10) while set1 stayed at 12.
print("Size of set1:", len(set1))
print("Size of subset1:", len(subset1))
# Report which of the two sets is detached.
print("Is set1 detached:", set1.isdetached())
print("Is subset1 detached:", subset1.isdetached())

Size of set1: 12
Size of subset1: 10
Is set1 detached: False
Is subset1 detached: True


# let's load and view the main GateNLP documentation page:
doc4 = Document.load("https://gatenlp.github.io/python-gatenlp/", fmt="html")
doc4


doc4.show(annsets=[("Original markups", ["h1","h2","a","li"])])


doc4.save("gatenlp-doc.html", fmt="html-ann-viewer", notebook=False, stretch_height=True)
from IPython.display import IFrame
IFrame("gatenlp-doc.html", 900,400)


from gatenlp.corpora import ListCorpus
texts = ["this is text one", "here is text two", "and this is text three"]
docs = [Document(t) for t in texts]
lcorp = ListCorpus(docs)
doc1 = lcorp[1]
print(doc1.features)
lcorp.store(doc1)

Features({'__idx': 1, '__idx_140270484012728': 1})


from gatenlp.corpora import DirFilesCorpus
corp1 = DirFilesCorpus("data/dir1")  # get all the matching filenames from the directory
print("Number of documents:", len(corp1))
doc1 = corp1[2]  # actually read the document from the directory
print("Text for idx=2:", doc1.text)
print("Features for idx=2:", doc1.features)
doc1.annset().add(0,len(doc1.text), "Document", dict(what="test document"))
# this writes the document back to the file:
corp1.store(doc1)
# could also have used: corp1[2] = doc1

Number of documents: 4
Text for idx=2: This is another document for testing which mentions John Smith.
Features for idx=2: Features({'__idx_140270484365552': 2, '__idx': 2, '__relpath': 'doc2.bdocjs', '__abspath': 'data/dir1/doc2.bdocjs'})


from gatenlp.corpora import TsvFileSource
tsvsrc1 = TsvFileSource("data/mytsvfile.tsv", text_col="text", feature_cols=dict(src="source",year="year"))
for doc in tsvsrc1:
    print(doc)

Document(This is the text of the first row. It has several sentences.,features=Features({'src': 'source1', 'year': '2005'}),anns=[])
Document(Text of the second row.,features=Features({'src': 'source1', 'year': '2006'}),anns=[])
Document(Another text, this time of the third row. ,features=Features({'src': 'source2', 'year': '2001'}),anns=[])
Document(And here another, from the fourth row.,features=Features({'src': 'source3', 'year': '2013'}),anns=[])


from gatenlp.corpora import PandasDfSource
# Build documents from a pandas DataFrame. pandas is an optional dependency,
# so only the import itself is guarded: a failed import is the one situation
# that really means "Pandas not installed". Any other problem (e.g. the TSV
# file cannot be read) is left to propagate with its own error message
# instead of being misreported.
import csv

try:  # this requires Pandas!
    import pandas as pd
except ImportError:
    print("Pandas not installed")
else:
    # QUOTE_NONE / quotechar=None: treat quote characters in the TSV as
    # ordinary text rather than as field delimiters.
    df = pd.read_csv(
        "data/mytsvfile.tsv",
        sep="\t",
        quotechar=None,
        index_col=None,
        quoting=csv.QUOTE_NONE,
    )
    pdsrc1 = PandasDfSource(df, text_col="text", data_cols=["source", "year"])
    for doc in pdsrc1:
        print(doc)

Document(This is the text of the first row. It has several sentences.,features=Features({'__data': {'source': 'source1', 'year': 2005}}),anns=[])
Document(Text of the second row.,features=Features({'__data': {'source': 'source1', 'year': 2006}}),anns=[])
Document(Another text, this time of the third row. ,features=Features({'__data': {'source': 'source2', 'year': 2001}}),anns=[])
Document(And here another, from the fourth row.,features=Features({'__data': {'source': 'source3', 'year': 2013}}),anns=[])


from gatenlp.corpora.conll import ConllUFileSource
src = ConllUFileSource("data/ar-tiny.conllu", group_by="doc", n=1)
corp = list(src)
print(len(corp))

3


corp[0].show(doc_style="direction: rtl; font-size: 1.5em; line-height: 1.5;")


from gatenlp.corpora import ListCorpus
from gatenlp.processing.pipeline import Pipeline 
from gatenlp.processing.annotator import AnnotatorFunction
from gatenlp.processing.executor import SerialCorpusExecutor

texts = ["Some text.", "Another text.", "Also some text here.", "And this is also some text."]
docs = [Document(t) for t in texts]
corp = ListCorpus(docs)

def annfunc1(doc):
    """Add an "Ann1" annotation from offset 0 to 3 and hand the document back.

    The annotation goes into the set returned by ``doc.annset()`` (no set
    name given). The same ``doc`` object that was passed in is returned.
    """
    target_set = doc.annset()
    target_set.add(0, 3, "Ann1")
    return doc
def annfunc2(doc):
    """Add a "Type1" annotation from offset 1 to 4 to the set named "set1".

    The same ``doc`` object that was passed in is returned.
    """
    target_set = doc.annset("set1")
    target_set.add(1, 4, "Type1")
    return doc
# Wrap the two plain functions so they can be used as pipeline steps.
# NOTE: this rebinds ann1 and ann2, which earlier cells used for annotations
# of doc3 - re-running those earlier relation cells after this one will fail.
ann1 = AnnotatorFunction(annfunc1)
ann2 = AnnotatorFunction(annfunc2)
# Assemble the pipeline; each step is registered under a readable name.
# Presumably the steps run in the order they were added - confirm in the
# gatenlp Pipeline documentation.
pipeline = Pipeline()
pipeline.add(ann1, name="FirstAnnotator")
pipeline.add(ann2, name="SecondAnnotator")


exe = SerialCorpusExecutor(pipeline, corpus=corp)
exe()
corp[2]


# use corp as source and create another ListCorpus as destination
corpnew = ListCorpus([])
exe2 = SerialCorpusExecutor(pipeline, source=corp, destination=corpnew)
exe2()
print("Length of corpnew:", len(corpnew))
print(f"in={exe2.n_in}, out={exe2.n_out}, none={exe2.n_none}, ok={exe2.n_ok}, err={exe2.n_err}")
corpnew[2]

Length of corpnew: 4
in=4, out=4, none=0, ok=4, err=0


import spacy
print("Spacy version:", spacy.__version__)
from gatenlp.lib_spacy import AnnSpacy

nlp = spacy.load("en_core_web_sm")
annotator = AnnSpacy(pipeline=nlp, outsetname="Spacy")
doc2.annset("Spacy").clear()   # avoid annotation duplication when running several times
doc2 = annotator(doc2)

Spacy version: 2.3.2


# Adapt size of viewer
# Inject a small stylesheet that targets the element ids belonging to the
# viewer shown with htmlid="view2". display/HTML are imported from the public
# IPython.display module (as IFrame is elsewhere in this notebook); importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>#view2-wrapper { font-size: 80% !important; } #view2-row1 {height: 15em; min-height: 5em;}</style>"))


doc2.show(htmlid="view2")


import stanza
print("Stanza version:",stanza.__version__)
from gatenlp.lib_stanza import AnnStanza

nlpstanza = stanza.Pipeline(logging_level="ERROR")
annotatorstanza = AnnStanza(pipeline=nlpstanza, outsetname="Stanza")
doc2.annset("Stanza").clear()   # avoid annotation duplication when running several times
doc2 = annotatorstanza(doc2)

Stanza version: 1.2


# change size of document viewer
# Inject a small stylesheet that targets the element ids belonging to the
# viewer shown with htmlid="view3". display/HTML are imported from the public
# IPython.display module (as IFrame is elsewhere in this notebook); importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>#view3-wrapper { font-size: 100% !important; } #view3-row1 {height: 10em; min-height: 5em;}</style>"))

doc2.show(htmlid="view3")


from gatenlp.processing.tokenizer import NLTKTokenizer
from nltk.tokenize.destructive import NLTKWordTokenizer  # get some tokenizer to use
nltk_tokenizer = NLTKTokenizer(nltk_tokenizer=NLTKWordTokenizer(), token_type="Token")
doc2.annset().clear()  
doc2 = nltk_tokenizer(doc2)


doc2


from gatenlp.processing.gazetteer import TokenGazetteer
gazlist1 = [
    (["Donald", "Trump"], dict(what="person", country="US")),
    (["Boris", "Johnson"], dict(what="person", country="UK")),
    (["Google"], dict(what="company", country="Everywhere, really!"))    
]
tgaz1 = TokenGazetteer(gazlist1, fmt="gazlist", annset="", outset="TGaz1", outtype="Lookup")
doc2.annset("TGaz1").clear()
doc2 = tgaz1(doc2)


doc2.show()


tgaz2 = TokenGazetteer("data/gaz1.def", fmt="gate-def", annset="", outset="TGaz2", outtype="Lookup", tokenizer=nltk_tokenizer)
doc2.annset("TGaz2").clear()
doc2 = tgaz2(doc2)

2021-02-27 10:36:32,514|INFO|gatenlp.processing.gazetteer|Reading list file data/persons.lst
2021-02-27 10:36:32,515|INFO|gatenlp.processing.gazetteer|Reading list file data/companies.lst


doc2


from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq, Or
from gatenlp.pam.matcher import FeatureMatcher, ifnot

r1 = Rule(
    # first the pattern
    Or ( Ann("Token", name="tok").within("ORG"),
         Ann("Token", name="tok").within("PERSON")
       ),
    # then the action for the pattern
    AddAnn(name="tok", anntype="PersOrOrg")
)
# get the annotations we want to use for matching
anns2match = doc2.annset("Stanza").with_type(["Token", "PERSON", "ORG"])

outset = doc2.annset("Pampac1")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc2, anns2match, outset=outset)
len(outset)

15


doc2


from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq
from gatenlp.pam.matcher import FeatureMatcher, ifnot

feat = FeatureMatcher(upos="PROPN")
r1 = Rule(
    # first the pattern
    Seq( Ann("Token", features=feat),
         N( Seq( N(Ann("Token", features=ifnot(feat)), min=0, max=1),
                 Ann("Token", features=feat)),
           min=1, max=99),
         name="seq1"
       ),
    # then the action for the pattern
    AddAnn(name="seq1", anntype="PROPNSEQ")
)
# get the annotations we want to use for matching
anns2match = doc2.annset("Stanza").with_type("Token")

outset = doc2.annset("Pampac2")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc2, anns2match, outset=outset)
len(outset)

8


doc2


from gatenlp.gateworker import GateWorker

gs = GateWorker()
# if GATE_HOME not set use gs = GateWorker(gatehome="/where/Gate/is/Installed")
# if java is not on the PATH use gs = GateWorker(java="/path/to/the/java/binary")

Trying to start GATE Worker on port=25333 host=127.0.0.1 log=false keep=false
PythonWorkerRunner.java: starting server with 25333/127.0.0.1/_7eAC59eNDOtlt02VIrUb8SfxYI/false


# Create a GATE document on the JAVA GATE side and return a handle
gdoc1 = gs.createDocument("An example document mentioning Barack Obama and New York")
# Can call Java API methods on that handle and get/convert the result
print(gdoc1.getClass())
print(gdoc1.getName())
print(gdoc1.getAnnotationSetNames())

class gate.corpora.DocumentImpl
GATE Document_00015
set()


# let's load the prepared ANNIE pipeline on the Java side and process the GATE document with it
gs.loadMavenPlugin("uk.ac.gate.plugins", "annie", "9.0")
gpipe = gs.loadPipelineFromPlugin("uk.ac.gate.plugins", "annie", "/resources/ANNIE_with_defaults.gapp")
gcorp = gs.newCorpus()
gcorp.add(gdoc1)
gpipe.setCorpus(gcorp)
gpipe.execute()


pdoc1 = gs.gdoc2pdoc(gdoc1)
pdoc1


gs.close()


from gatenlp.gateworker import GateWorkerAnnotator
# Specify a prepared GATE pipeline file to get loaded into Java GATE
# optionally add the gatehome=... kw argument
# optionally specify port using port=23445 or similar
gs_app = GateWorkerAnnotator(pipeline="data/annie.xgapp", port=25444)

Trying to start GATE Worker on port=25444 host=127.0.0.1 log=false keep=false
PythonWorkerRunner.java: starting server with 25444/127.0.0.1/2UCtOtK0lWjim8a3Qg-PtdV7PI8/false


# Run the GATE pipeline wrapped by gs_app over every document in data/dir1.
# The executor is given the directory corpus itself, so processed documents
# are written back through that corpus.
dircorpus = DirFilesCorpus("data/dir1", sort=True)
exe = SerialCorpusExecutor(annotator=gs_app, corpus=dircorpus)
try:
    exe()
finally:
    # Always shut the GATE worker down, even when processing fails part-way;
    # otherwise the worker started for gs_app would be left running.
    gs_app.close()

# Re-read one document from the corpus to inspect the result.
tmpdoc = dircorpus[2]
print(tmpdoc.features)
tmpdoc

Features({'__idx_140270168910480': 2, '__idx': 2, '__relpath': 'doc3.bdocjs', '__abspath': 'data/dir1/doc3.bdocjs'})


from gatenlp import Document, AnnotationSet, GateNlpPr, interact

@GateNlpPr
class MyAnnotator:
    """Minimal skeleton of an annotator class registered via ``GateNlpPr``."""

    def __call__(self, doc, **kwargs):
        """Handle a single document.

        This method is mandatory: it is invoked once for every document.
        The skeleton leaves the document untouched.
        """
        return None

    # Optional hooks. When defined, start() is called before the first
    # document of a corpus and finish() after the last one.
    # def start(self, **kwargs):
    #     pass
    # def finish(self, **kwargs):
    #     pass

# THE FOLLOWING MUST BE PRESENT SO THAT GATE CAN COMMUNICATE WITH THE PYTHON PROCESS!
if __name__ == "___main__": # NOTE: changed from __main__ to ___main__ to prevent running in Notebook!
    interact()


from gatenlp import Document, AnnotationSet, GateNlpPr, interact

@GateNlpPr
class MyAnnotator:
    """Example annotator: tags every document and numbers it.

    Each processed document receives a "SomeType" annotation over offsets
    0 to 3 and a "docnr" feature holding its running number.
    """

    def __init__(self):
        # Number of documents handled since construction / the last start().
        self.n_docs = 0

    def start(self, **kwargs):
        """Announce the start of processing and reset the document counter."""
        print("Processing starting, we got kwargs:", kwargs)
        self.n_docs = 0

    def __call__(self, doc, **kwargs):
        """Count the document, annotate it and record its number on it."""
        current_nr = self.n_docs + 1
        self.n_docs = current_nr
        default_set = doc.annset()
        default_set.add(0, 3, "SomeType")
        doc.features["docnr"] = current_nr

    def finish(self, **kwargs):
        """Report how many documents were processed."""
        print("Processing finished, documents processed: ", self.n_docs)

if __name__ == "___main__":   # NOTE: changed from __main__ to ___main__ to prevent running in Notebook!
    interact()

GATE COURSE MODULE 11
GATE & PYTHON

GATE & PYTHON¶

Python GateNLP¶

Python GateNLP: status¶

Python GateNLP: Info and Feedback¶

Preparation: Install Python¶

Preparation: install Miniconda (Linux)¶

Preparation: install Miniconda (Windows)¶

Install gatenlp¶

Requirement: Java GATE¶

Follow along¶

Python GateNLP: Main Concepts¶

Documents¶

Documents¶

Documents: load¶

Documents: save (JSON)¶

Document: save (YAML)¶

Document features¶

Features: API¶

Annotations & Annotation Sets & Spans¶

Adding annotations¶

Add a few more annotations:¶

Let's show the document again¶

Remove annotations, remove sets¶

Annotation Relations¶

Annotation Relations¶

Annotation Relations API¶

Spans¶

AnnotationSet: retrieve by relation¶

AnnotationSet: detached / immutable¶

AnnotationSet: detached / immutable¶

Document loading/saving¶

Document: load HTML¶

Document: view sets/types¶

Document: save html-ann-viewer¶

Exchange Documents with Java GATE¶

Corpus¶

ListCorpus¶

DirFilesCorpus¶

Other Corpus Classes¶

Source, Destination¶

Source, Destination examples¶

TsvFileSource¶

PandasDfSource¶

Conll-U Source¶

Conll-U Source¶

Annotators, Executors¶

Example 1/3¶

Example 2/3¶

Example 3/3¶

Spacy Annotator¶

Spacy Annotator¶

Spacy Annotator¶

Stanza Annotator¶

Stanza Annotator¶

Stanza Annotator¶

Gazetteers¶

Document Tokenization¶

Document Tokenization¶

Gazetteer List¶

Gazetteer List¶

Gazetteer List¶

Gazetteer List (GATE def)¶

Gazetteer List¶

Token Gazetteer¶

PAMPAC¶

PAMPAC - How to use¶

PAMPAC - Example 1¶

PAMPAC - Example 1¶

PAMPAC - Example 2¶

PAMPAC - Example 2¶

GATE Worker¶

GATE Worker¶

GATE Worker¶

GATE Worker¶

GateWorker¶

GateWorker Annotator¶

GateWorkerAnnotator¶

Java GATE Python Plugin¶

Requirements¶

GATE COURSE MODULE 11
GATE & PYTHON