GATE COURSE MODULE 11
GATE & PYTHON

Johann Petrak and Mehmet Bakir

Last updated: 2022-06-24 for GateNLP v1.0.8a1 or later
  • Online slides: https://gatenlp.github.io/python-gatenlp/training/module11-python.slides.html
  • Slides License: CC BY-NC-SA 3.0

GATE & PYTHON¶

This tutorial covers Python tools related to GATE:

  1. Python GateNLP: Python package for NLP similar to Java GATE
  2. Python GateNLP GateWorker: run Java/GATE from Python
  3. GATE Python Plugin: Java GATE plugin to process GATE documents with Python and Python GateNLP
  4. Format BDOC Plugin: Java GATE plugin for support of loading/saving documents in JSON/YAML/MsgPack format

Python GateNLP¶

Aims:

  • NLP framework written in pure Python.
  • Similar concepts as Java GATE: documents, document features, annotation sets, annotations, ...
  • But "pythonic" API, try to make basic things very simple (e.g. loading/saving of documents)
  • Does NOT try to be a full multilingual NLP processing package, rather COMBINE:
    • Use existing tools and solutions: Spacy, Stanford Stanza
    • Add own tools and improvements where needed

Python GateNLP: status¶

  • Current release: 1.0.x
  • All 1.0.x: get community feedback:
    • how to improve API, abstractions, conventions, find bugs
    • what is most important to still get added?
    • API may slightly change, parameter names may get consolidated
  • Planned 1.1.x releases and onwards: stable API

Python GateNLP: Info and Feedback¶

  • Documentation: https://gatenlp.github.io/python-gatenlp/
  • Sources: https://github.com/GateNLP/python-gatenlp
  • Report a bug, request a feature with issue tracker: https://github.com/GateNLP/python-gatenlp/issues
  • Discuss, ask:
    • discussions forum at https://github.com/GateNLP/python-gatenlp/discussions
    • GATE mailing list https://groups.io/g/gate-users
  • Developers Chat: https://gitter.im/GateNLP/python-gatenlp

Preparation: Install Python¶

  • see also https://gatenlp.github.io/python-gatenlp/installation.html
  • Recommended:
    • Anaconda / Miniconda
    • Miniforge

Preparation: install Miniconda (Linux)¶

  • Download the Python 3.8 (or later) installer (64-bit) for your OS
  • Run the installer
  • respond "yes" to "running conda init?"
  • start a new command line
  • Create environment: conda create -y -n gatenlp python=3.9
  • activate environment: conda activate gatenlp

Preparation: install Miniconda (Windows)¶

  • Download the Python 3.8 (or later) installer (64-bit) for your OS
  • Run the installer, install for "just me", register as default Python,
  • start the "Anaconda Prompt" or "Anaconda Powershell Prompt"
  • Create environment: conda create -y -n gatenlp python=3.9
  • activate environment: conda activate gatenlp

Install gatenlp¶

To install the most recent release with all dependencies (without [all], only the minimum dependencies are installed):

pip install -U gatenlp[all]

Also install support for Jupyter notebooks and for showing the slides:

pip install jupyter notebook ipython ipykernel RISE

Create kernel for the conda environment:

python -m ipykernel install --user --name gatenlp --display-name "Python-gatenlp"

See also GateNLP documentation: installation

Java GATE¶

  • Java is installed
    • e.g. Adoptium
    • java on the path and registered!
  • GATE 9.0 or later is installed
    • needed later: where (which directory) is it installed in? (GATE_HOME)
    • Windows: right-click icon, "Properties", "Target": directory that contains "gate.exe"

Follow along¶

  • Online slides
  • Download the handouts zip file and extract the directory
  • Within the directory you can either:
    • to follow in the original notebook: run jupyter notebook module11-python.ipynb
    • to explore in a new Notebook: run jupyter notebook, choose New -> Python-gatenlp
    • to explore interactively: run ipython and enter python code

If you get a kernel error in Jupyter, try something like this (apparently an Anaconda bug):

python C:\Users\USERNAME\miniconda3\envs\gatenlp\Scripts\pywin32_postinstall.py -install

Python GateNLP: Main Concepts¶

  • A document represents some text and
    • any number of named annotation sets
    • any number of features
  • An annotation set can have
    • any number of annotations
  • Annotations describe a span of a document and have
    • any number of features
    • an annotation type
    • from and to offsets that describe the span of the annotation
  • An Annotator is something that processes a document (and usually adds or changes annotations)

Documents¶

In [2]:
# Import gatenlp to check gatenlp version:
import gatenlp
print("GateNLP version:", gatenlp.__version__)
from gatenlp import Document
GateNLP version: 1.0.8.dev3

Create a document from some text/string and print it:

In [3]:
doc1 = Document("This is a small test document")
print(doc1)
Document(This is a small test document,features=Features({}),anns=[])

Documents¶

In a notebook, documents are visualized with the HTML viewer when a document is the last value of a cell, or when display(doc1) or document.show() is used:

In [4]:
# from IPython.display import display
doc1
Out[4]:

Documents¶

The show() method can be used to influence and parametrize the viewer

In [5]:
doc1.show(doc_style="color: blue; font-weight: bold;")

Documents: load¶

  • to load documents use Document.load(some_location, ...)
  • document format is auto-detected from the extension or specified using the fmt parameter
  • one standard format for saving/loading GateNLP is "bdocjs" (a JSON serialization)
  • some_location can be a file path or a URL
In [6]:
doc2 = Document.load('./data/document-testing.txt')
doc2
Out[6]:

Documents: save (JSON)¶

  • use document.save(location, ...)
  • format is inferred from the extension or specified using fmt parameter
  • Formats: bdocjs (JSON, default), bdocym (YAML, slow), bdocmp (MessagePack, compact)
In [7]:
doc1.save("myfirstdocument.bdocjs")

with open("myfirstdocument.bdocjs", "rt", encoding="utf-8") as infp:
    print(infp.read())
{"annotation_sets": {}, "text": "This is a small test document", "features": {}, "offset_type": "p", "name": ""}

Document: save (YAML)¶

In [8]:
doc1.save("myfirstdocument.bdocym")  # use YAML serialization

with open("myfirstdocument.bdocym", "rt", encoding="utf-8") as infp:
    print(infp.read())
annotation_sets: {}
features: {}
name: ''
offset_type: p
text: This is a small test document

In [9]:
# Can also "save" to memory/string, here the format is needed!
doc1.save_mem(fmt="bdocjs")
Out[9]:
'{"annotation_sets": {}, "text": "This is a small test document", "features": {}, "offset_type": "p", "name": ""}'

Document features¶

  • Documents can have arbitrary features (similar to Python dictionaries)
  • key/name (string) maps to some value
  • value should be JSON serializable
  • name starting with single underscore: "private value"
  • name starting with double underscore: "private/transient value" (not saved by default, not shown in viewer)
In [10]:
import datetime
doc1.features["loading_date"] = str(datetime.datetime.now())
doc1.features["purpose"] = "Testing gatenlp."
doc1.features["numeric_value"] = 22
doc1.features["dict_of_objects"] = {"dict_key": "dict_value", "a_list": [1,2,3,4,5]}
doc1.features["_tmp1"] = "some value"
doc1.features["__tmp2"] = 12345
doc1
Out[10]:

Features: API¶

  • inherits from UserDict
  • same API as dict
  • but Document is aware of what changes are made (needed for updating a ChangeLog as we will see later!)
In [11]:
print("1:", doc1.features["purpose"])
print("2:", doc1.features.get("doesntexist"))
print("3:", doc1.features.get("doesntexist", "NA!"))
1: Testing gatenlp.
2: None
3: NA!

Features: API¶

In [12]:
for name, value in doc1.features.items():
    print(f"{name}: {value}")
loading_date: 2022-07-02 21:48:56.801623
purpose: Testing gatenlp.
numeric_value: 22
dict_of_objects: {'dict_key': 'dict_value', 'a_list': [1, 2, 3, 4, 5]}
_tmp1: some value
__tmp2: 12345

Features: serialization¶

Let's check how the document with features is serialized in the "bdocjs" (JSON) format:

In [13]:
import pprint, json

js_str = doc1.save_mem(fmt="bdocjs")
js = json.loads(js_str)
pprint.pprint(js)
{'annotation_sets': {},
 'features': {'_tmp1': 'some value',
              'dict_of_objects': {'a_list': [1, 2, 3, 4, 5],
                                  'dict_key': 'dict_value'},
              'loading_date': '2022-07-02 21:48:56.801623',
              'numeric_value': 22,
              'purpose': 'Testing gatenlp.'},
 'name': '',
 'offset_type': 'p',
 'text': 'This is a small test document'}

Annotations & Annotation Sets & Spans¶

  • Span: a range of offsets
  • Annotation: information about a range of offsets, has
    • annotation type
    • features
    • unique integer annotation id
  • Annotation set: named collection of annotations
    • "set": only one annotation per set with the same annotation id
    • but ordered by insertion order or offset
    • "default" annotation set has name "" (empty string)

Adding annotations¶

  • first get the annotation set we want to add the annotation to
  • then create the annotation using the add method of the set
In [15]:
# create and get an annotation set with the name "Set1"
annset = doc1.annset("Set1")
#Now, add an annotation, this method returns the newly created annotation
annset.add(0,4,"AnnType1")
Out[15]:
Annotation(0,4,AnnType1,features=Features({}),id=0)

Annotations¶

  • The annotation covers the characters 0, 1, 2, and 3, a text of length 4 (to - from = len)
  • the "to" offset is the offset after the last covered character
  • in Python, ALL Unicode code points are represented by 1 character
    • in Java: offsets count UTF-16 code units
    • -> offsets can differ between Java and Python (see the sketch below)!
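
A quick way to see the difference, in plain Python (just an illustration, not part of the gatenlp API): count code points versus UTF-16 code units for a string containing a character outside the Basic Multilingual Plane.

text = "a😀b"                                       # U+1F600 lies outside the BMP
n_codepoints = len(text)                            # Python GateNLP offsets count code points
n_utf16_units = len(text.encode("utf-16-le")) // 2  # Java GATE offsets count UTF-16 code units
print(n_codepoints, n_utf16_units)                  # 3 4 -> "b" is at offset 2 in Python, 3 in Java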

Annotations¶

In [16]:
# add a few more
annset.add(0, 4, "Token", {"id": "token1'"})
annset.add(5, 7, "Token", {"id": "token2'"})
annset.add(8, 9, "Token", {"id": "token3'"})
annset.add(10, 15, "Token", {"id": "token4'"})
annset.add(16, 20, "Token", {"id": "token5"})
annset.add(21, 29, "Token", {"id": "token6"})
annset.add(0, 29, "Sentence", {"what": "The first 'sentence' annotation"});
for ann in annset:
    print(ann)
Annotation(0,4,AnnType1,features=Features({}),id=0)
Annotation(0,4,Token,features=Features({'id': "token1'"}),id=1)
Annotation(0,29,Sentence,features=Features({'what': "The first 'sentence' annotation"}),id=7)
Annotation(5,7,Token,features=Features({'id': "token2'"}),id=2)
Annotation(8,9,Token,features=Features({'id': "token3'"}),id=3)
Annotation(10,15,Token,features=Features({'id': "token4'"}),id=4)
Annotation(16,20,Token,features=Features({'id': 'token5'}),id=5)
Annotation(21,29,Token,features=Features({'id': 'token6'}),id=6)

Annotations: document viewer¶

In [17]:
doc1.show(preselect=[("Set1", ["AnnType1", "Sentence"])])
  • show all annotations for a type by clicking the type name
  • clicking annotation shows annotation features instead of document features
  • clicking "Document" shows the document features again
  • when multiple annotations overlap, you first need to select which one to view

Annotations/sets: remove¶

In [18]:
ann0 = annset.get(0)    # get by annotation id
print("Annotation id=0:", ann0)
annset.remove(ann0)     # remove the annotation ann0
ann1 = annset.get(1)
print("Annotation id=1:", ann1)
annset.remove(1)   # remove the annotation with the given id
annset.remove([2,3,4])  # remove a whole list of annotations
print("After some anns removed ", annset)
annset.clear()
print("After set cleared: ", annset)
doc1.remove_annset("Set1")
Annotation id=0: Annotation(0,4,AnnType1,features=Features({}),id=0)
Annotation id=1: Annotation(0,4,Token,features=Features({'id': "token1'"}),id=1)
After some anns removed  AnnotationSet([Annotation(0,29,Sentence,features=Features({'what': "The first 'sentence' annotation"}),id=7), Annotation(16,20,Token,features=Features({'id': 'token5'}),id=5), Annotation(21,29,Token,features=Features({'id': 'token6'}),id=6)])
After set cleared:  AnnotationSet([])

Annotation Relations¶

  • Annotations can overlap arbitrarily
  • Annotation API has methods to check how they relate to each other
    • overlap, within, covering, before, after, rightoverlapping, startingat, endingwith, coextensive ...
  • Annotation API implements ordering by start offset and annotation id

[Figure: example annotations Ann1–Ann12 and their relations]

  • Ann1 overlaps with all others, covers all but Ann2 and Ann4
  • Ann5 is directly before Ann3, is before Ann6
  • Ann10 starts at Ann1, Ann12 ends with Ann1, Ann3 and Ann9 are coextensive

Annotation Relations¶

[Figure: example annotations Ann1–Ann12 and their relations]

Let's load and view an example document to demonstrate this:

In [20]:
doc3 = Document.load("data/ann-relations.bdocjs")
doc3.show(htmlid="view1")

Annotation Relations API¶

[Figure: example annotations Ann1–Ann12 and their relations]

In [21]:
# make a variable for each annotation type
for anntype in list(doc3.annset("set1").type_names):
    vars()[anntype.lower()] = doc3.annset("set1").with_type(anntype).for_idx(0)
In [22]:
print("Ann2 isoverlapping Ann1:", ann2.isoverlapping(ann1))
print("Ann2 isbefore Ann3:", ann2.isbefore(ann3))
print("Ann3 isafter Ann2:", ann3.isafter(ann2))
print("Ann1 iscovering Ann5:", ann1.iscovering(ann5))
print("Ann3 iscoextensive Ann9:", ann3.iscoextensive(ann9))
print("Ann6 iswithin Ann1:", ann6.iswithin(ann1))
print("Ann4 isrightoverlapping Ann1:", ann4.isrightoverlapping(ann1))
Ann2 isoverlapping Ann1: True
Ann2 isbefore Ann3: True
Ann3 isafter Ann2: True
Ann1 iscovering Ann5: True
Ann3 iscoextensive Ann9: True
Ann6 iswithin Ann1: True
Ann4 isrightoverlapping Ann1: True

Spans¶

  • Objects that describe offset ranges
  • similar API for relations
  • can get from annotations, use when only the span of an annotation is needed
In [23]:
from gatenlp import Span
span1 = Span(3,4)
span2 = ann2.span
span3 = doc3.annset("set1").span
span4 = Span(ann5)
print([f"span{i}: {s}" for i, s in enumerate([span1, span2, span3, span4], start=1)])
['span1: Span(3,4)', 'span2: Span(0,6)', 'span3: Span(0,45)', 'span4: Span(12,18)']

AnnotationSet: retrieve by relation¶

  • get all annotations that overlap/are before/start at/... an annotation/span/annotation set
  • returns a new annotation set
  • returned set is detached: not part of the document, changes to the set do not affect the document
  • returned set is initially immutable: the set cannot be changed
  • but the annotations in it are mutable and still the same objects as in the document!
  • possible to "detach" the annotations by (deep)copying them
In [24]:
set1 = doc3.annset("set1") # "attached" set
print("Within Ann1: ", [a.type for a in set1.within(ann1)])
print("Coextensive with Ann3:", [a.type for a in set1.coextensive(ann3)])
print("Coextensive with span of Ann3:", [a.type for a in set1.coextensive(ann3.span)])
Within Ann1:  ['Ann10', 'Ann5', 'Ann3', 'Ann7', 'Ann9', 'Ann11', 'Ann6', 'Ann8', 'Ann12']
Coextensive with Ann3: ['Ann9']
Coextensive with span of Ann3: ['Ann3', 'Ann9']

AnnotationSet: detached / immutable¶

In [25]:
print("Size of set1:", len(set1))
subset1 = set1.within(ann1)
print("Size of subset1:", len(subset1))
Size of set1: 12
Size of subset1: 9
In [26]:
# try to add an annotation to subset1:
try:
    subset1.add(2,3,"ANewOne")
except Exception as ex:
    print("Got exception:", ex)
Got exception: Cannot add an annotation to an immutable annotation set

AnnotationSet: detached / immutable¶

In [27]:
# make the set mutable and try again
subset1.immutable = False
subset1.add(2,3,"ANewOne")
print("Size of set1:", len(set1))
print("Size of subset1:", len(subset1))
print("Is set1 detached:", set1.isdetached())
print("Is subset1 detached:", subset1.isdetached())
Size of set1: 12
Size of subset1: 10
Is set1 detached: False
Is subset1 detached: True
  • annotation only got added to subset1, NOT the original set
  • detached sets cannot get attached again
  • annotations in the detached set are the same as in the document, so changing their features will affect the document!
  • detached set can also detach its annotations using subset1.clone_anns()

Document loading/saving¶

Supported formats:

  • bdocjs, bdocym, bdocmp: load/save (aliasing: only bdocym)
  • GATE xml: load (but only basic data types, no aliasing)
  • HTML: load, creating annotations for the HTML elements
  • plain text: load / save
  • tweet: load v1 format, WIP!
  • pickle: load/save
  • html-ann-viewer: save (also used for displaying in jupyter)
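
A small sketch comparing the three bdoc serializations in memory, reusing save_mem from earlier (this assumes the optional YAML/MsgPack dependencies are installed, which is the case with gatenlp[all]):

for fmt in ("bdocjs", "bdocym", "bdocmp"):
    data = doc1.save_mem(fmt=fmt)               # str for JSON/YAML, bytes for MessagePack
    print(fmt, type(data).__name__, len(data))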

Document: load HTML¶

In [28]:
# lets load and view the main GateNLP documentation page:
doc4 = Document.load("https://gatenlp.github.io/python-gatenlp/", fmt="html")
doc4
Out[28]:

Document: view sets/types¶

Use: doc.show(annspec=["set1", ("set2", "type1"), ("set3", ["type1", "type2"])])

In [29]:
doc4.show(annspec=[("Original markups", ["h1","h2","a","li"])])

Document: save html-ann-viewer¶

In [30]:
doc4.save("gatenlp-doc.html", fmt="html-ann-viewer", notebook=False, stretch_height=True)
from IPython.display import IFrame
IFrame("gatenlp-doc.html", 900,400)
Out[30]:

Exchange Documents with Java GATE¶

  • Python GateNLP can read Java GATE XML format
  • GATE plugin Format_Bdoc provides support for loading/saving formats bdocjs, bdocym and bdocmp in Java GATE
  • Offsets differ between GATE and GateNLP:
    • Java: offsets count UTF-16 code units; a code point outside the BMP needs a surrogate pair of two units
    • Python: offsets refer to Unicode code points
    • bdocjs/bdocym/bdocmp automatically convert the offsets on either side
    • field offset_type is either p or j
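
The offset_type field can be inspected directly in the serialized document (a small sketch reusing save_mem from earlier): documents written from Python carry "p", documents written from Java GATE carry "j", and the bdoc formats convert the offsets when a document crosses over.

import json
pydoc = Document("One non-BMP character: 😀 and some more text")
d = json.loads(pydoc.save_mem(fmt="bdocjs"))
print(d["offset_type"])   # "p": the offsets in this serialization are Python code point offsets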

Corpus¶

  • a list-like collection of a fixed number of documents which can be retrieved and stored by index:
    • get: doc = corpus[2]
    • set: corpus[3] = doc
  • on retrieval, the index gets stored in a document feature
  • implements store(doc) to save a document to the index stored in the document feature
  • some implementations: append(doc) to add a new document to the corpus
  • some implementations: store/retrieve None
    • on retrieval: None indicates the absence of a document
    • on storing: None indicates that the document should get removed or not get updated

ListCorpus¶

  • wrap a Python list-like data structure
  • but provide the store method
In [31]:
from gatenlp.corpora import ListCorpus
texts = ["this is text one", "here is text two", "and this is text three"]
docs = [Document(t) for t in texts]
lcorp = ListCorpus(docs)
doc1 = lcorp[1]
print(doc1.features)
lcorp.store(doc1)
Features({'__idx_139956274540624': 1})

DirFilesCorpus¶

  • all (recursive) files in a directory with some specific extension
  • specify some specific format or infer from file extension
  • stores the relative file path as a document feature
In [32]:
from gatenlp.corpora import DirFilesCorpus
corp1 = DirFilesCorpus("data/dir1")  # get all the matching filenames from the directory
print("Number of documents:", len(corp1))
doc1 = corp1[2]  # actually read the document from the directory
print("Text for idx=2:", doc1.text)
print("Features for idx=2:", doc1.features)
doc1.annset().add(0,len(doc1.text), "Document", dict(what="test document"))
# this writes the document back to the file:
corp1.store(doc1)
# could also have used: corp1[2] = doc1
Number of documents: 4
Text for idx=2: This is another document for testing which mentions John Smith.
Features for idx=2: Features({'__idx_139955830010000': 2})

Corpus Viewer¶

In [33]:
from gatenlp.visualization import CorpusViewer
cviewer = CorpusViewer(corp1)
cviewer.show()

Other Corpus Classes¶

  • NumberedDirFilesCorpus: create a directory tree where the path represents digits of a large number
    • e.g. 000/002/341.bdoc for element number 2341 of 600000000 total
  • EveryNthCorpus: wrap a corpus and access only elements $k*i + o$ for $i = 0..\lfloor(n/k)\rfloor$
    • $k$: every that many elements
    • $o$: start with this element ($o < k$)
    • e.g.: get elements 3, 7, 11, 15 from a corpus with 17 elements
    • useful for processing files in a DirFilesCorpus with multiple processes
  • ShuffledCorpus: random re-ordering of the elements in the wrapped corpus
  • CachedCorpus: store retrieved elements from a (slow) base corpus in a (fast) cache corpus
  • Still work in progress
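
The indexing scheme of EveryNthCorpus can be illustrated with plain Python arithmetic (a sketch of the index calculation only, not of the class API):

# which elements does a worker see for a corpus of n elements,
# taking every k-th element starting at offset o (o < k)?
n, k, o = 17, 4, 3
indices = [k * i + o for i in range(n // k + 1) if k * i + o < n]
print(indices)   # [3, 7, 11, 15] -- the example given above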

Source, Destination¶

  • Document Source: something that can be iterated over to get one Document after the other
    • unknown size
    • a Corpus may also function as a Source
  • Document Destination: something that has append(doc) to add Document instances
    • unknown final size
    • also has close() to end writing
    • may implement the "with documentdestination as dest:" context manager pattern (see the sketch below)
    • an appendable Corpus may also function as a Destination
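
To illustrate the destination protocol, here is a minimal sketch of a hypothetical destination (not a class shipped with gatenlp): anything providing append(doc) and close() can act as a destination, and implementing the context manager methods enables the with pattern.

from gatenlp import Document

class PrintingDestination:
    # hypothetical destination: just prints the text of every appended document
    def __init__(self):
        self.n = 0
    def append(self, doc):
        self.n += 1
        print(f"Document {self.n}: {doc.text}")
    def close(self):
        print(f"Closed after {self.n} documents")
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

with PrintingDestination() as dest:
    dest.append(Document("first text"))
    dest.append(Document("second text"))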

Source, Destination examples¶

  • BdocjsLinesFileSource/Destination: one line of bdocjs serialization per document
  • TsvFileSource: one column in a TSV file contains the text, other columns can be stored in features
  • PandasDfSource: similar to TSV source, but for a Pandas data frame
  • Still work in progress: improvements/more to come!

TsvFileSource¶

In [34]:
from gatenlp.corpora import TsvFileSource
tsvsrc1 = TsvFileSource("data/mytsvfile.tsv", text_col="text", feature_cols=dict(src="source",year="year"))
for doc in tsvsrc1:
    print(doc)
Document(This is the text of the first row. It has several sentences.,features=Features({'src': 'source1', 'year': '2005'}),anns=[])
Document(Text of the second row.,features=Features({'src': 'source1', 'year': '2006'}),anns=[])
Document(Another text, this time of the third row. ,features=Features({'src': 'source2', 'year': '2001'}),anns=[])
Document(And here another, from the fourth row.,features=Features({'src': 'source3', 'year': '2013'}),anns=[])

PandasDfSource¶

In [35]:
from gatenlp.corpora import PandasDfSource
try:  # this requires Pandas!
    import pandas as pd, csv
    df = pd.read_csv("data/mytsvfile.tsv", sep="\t", quotechar=None, index_col=None, quoting=csv.QUOTE_NONE)
    pdsrc1 = PandasDfSource(df, text_col="text", data_cols=["source", "year"])
    for doc in pdsrc1:
        print(doc)
except:
    print("Pandas not installed")
Document(This is the text of the first row. It has several sentences.,features=Features({'__data': {'source': 'source1', 'year': 2005}}),anns=[])
Document(Text of the second row.,features=Features({'__data': {'source': 'source1', 'year': 2006}}),anns=[])
Document(Another text, this time of the third row. ,features=Features({'__data': {'source': 'source2', 'year': 2001}}),anns=[])
Document(And here another, from the fourth row.,features=Features({'__data': {'source': 'source3', 'year': 2013}}),anns=[])

Conll-U Source¶

  • Read in one of the many multilingual corpora from https://universaldependencies.org/
  • create one GateNLP document from every k sentences, paragraphs, or CoNLL documents (group_by, group_by_n)
  • use original text hints or space hints, if available
  • Example: first few lines of ar-ud-train.conllu
In [36]:
from gatenlp.corpora.conll import ConllUFileSource
src = ConllUFileSource("data/ar-tiny.conllu", group_by="doc", group_by_n=1)
corp = list(src)
print(len(corp))
3

Conll-U Source¶

In [37]:
corp[0].show(doc_style="direction: rtl; font-size: 1.5em; line-height: 1.5;")

Annotators, Executors¶

  • Annotator: a callable that accepts a document to process and either:
    • returns the same or a different document (most common situation)
    • returns None: something went wrong or the document should get filtered
    • returns a list of zero to n documents: filter, error, split documents
    • may be just a function, but usually a subclass of Annotator
    • standard methods for handling over-a-corpus results
  • Pipeline: a special annotator that recursively runs other annotators in sequence
  • Executor: a class that runs an annotator
    • on a corpus
    • on a source and optional destination
    • takes care of handling None, lists of returned documents

Example 1/3¶

In [38]:
from gatenlp.corpora import ListCorpus
from gatenlp.processing.pipeline import Pipeline 
from gatenlp.processing.annotator import AnnotatorFunction
from gatenlp.processing.executor import SerialCorpusExecutor

texts = ["Some text.", "Another text.", "Also some text here.", "And this is also some text."]
docs = [Document(t) for t in texts]
corp = ListCorpus(docs)

def annfunc1(doc):
    doc.annset().add(0,3,"Ann1")
    return doc
def annfunc2(doc):
    doc.annset("set1").add(1,4,"Type1")
    return doc
ann1 = AnnotatorFunction(annfunc1)
ann2 = AnnotatorFunction(annfunc2)
pipeline = Pipeline()
pipeline.add(ann1, name="FirstAnnotator")
pipeline.add(ann2, name="SecondAnnotator")

Example 2/3¶

In [39]:
exe = SerialCorpusExecutor(pipeline, corpus=corp)
exe()
corp[2]
Out[39]:

Example 3/3¶

In [40]:
# use corp as source and create another ListCorpus as destination
corpnew = ListCorpus([])
exe2 = SerialCorpusExecutor(pipeline, source=corp, destination=corpnew)
exe2()
print("Length of corpnew:", len(corpnew))
print(f"in={exe2.n_in}, out={exe2.n_out}, none={exe2.n_none}, ok={exe2.n_ok}, err={exe2.n_err}")
corpnew[2]
Length of corpnew: 4
in=4, out=4, none=0, ok=4, err=0
Out[40]:

Spacy Annotator¶

  • Use a SpaCy pipeline to annotate a document
  • convert spacy tokens, entities etc into Annotations, convert token attributes into annotation features
  • makes it much easier to add own annotations and features, no need to keep vocab files around
  • but possibly not as optimized/fast as Spacy

Preparation:

  • make sure spacy dependency is installed for your gatenlp environment:
    pip install -U spacy (not necessary if gatenlp[all] was used)
  • make sure the model for the language is installed:
    English: python -m spacy download en_core_web_sm
  • To use in notebook, need to restart kernel after installation!

Spacy Annotator¶

In [41]:
import spacy
print("Spacy version:", spacy.__version__)
from gatenlp.lib_spacy import AnnSpacy

nlp = spacy.load("en_core_web_sm")
annotator = AnnSpacy(pipeline=nlp, outsetname="Spacy")
doc2.annset("Spacy").clear()   # avoid annotation duplication when running several times
doc2 = annotator(doc2)
/home/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages/torch/cuda/__init__.py:80: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 9010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  ../c10/cuda/CUDAFunctions.cpp:112.)
  return torch._C._cuda_getDeviceCount() > 0
Spacy version: 3.3.1
/home/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages/spacy/util.py:837: UserWarning: [W095] Model 'en_core_web_sm' (3.2.0) was trained with spaCy v3.2 and may not be 100% compatible with the current version (3.3.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate
  warnings.warn(warn_msg)

Spacy Annotator¶

In [42]:
# Adapt size of viewer
from IPython.core.display import display, HTML
display(HTML("<style>#view2-wrapper { font-size: 80% !important; } #view2-row1 {height: 15em; min-height: 5em;}</style>"))


doc2.show(htmlid="view2")

Stanza Annotator¶

  • Use a Stanza pipeline to annotate a document
  • convert stanza tokens, entities etc into Annotations, convert token attributes into annotation features

Preparation:

  • make sure stanza dependency is installed for your gatenlp environment:
    pip install -U stanza (not necessary if gatenlp[all] was used)
  • make sure the model for the language is installed:
    English: python -c "import stanza; stanza.download('en')"

Stanza Annotator¶

In [43]:
import stanza
print("Stanza version:",stanza.__version__)
from gatenlp.lib_stanza import AnnStanza

nlpstanza = stanza.Pipeline(logging_level="ERROR")
annotatorstanza = AnnStanza(pipeline=nlpstanza, outsetname="Stanza")
doc2.annset("Stanza").clear()   # avoid annotation duplication when running several times
doc2 = annotatorstanza(doc2)
Stanza version: 1.3.0

Stanza Annotator¶

In [44]:
# change size of document viewer
from IPython.core.display import display, HTML
display(HTML("<style>#view3-wrapper { font-size: 100% !important; } #view3-row1 {height: 10em; min-height: 5em;}</style>"))

doc2.show(htmlid="view3")

Gazetteers¶

  • Look up tokens/words/strings/phrases in a long list ("gazetteer list")
  • StringGazetteer: match document text against list of known strings
  • TokenGazetteer:
    • Match sequences of tokens
    • Tokens in documents: based on sequences of annotations
    • Use underlying document text or some feature value per annotation ("Token")
    • gazetteer list: each entry should be a list of tokens as well!

StringGazetteer: gazetteer list¶

In [45]:
from gatenlp.processing.gazetteer import StringGazetteer

# Strings we want to find
sgazlist1 = [
    ("Barack Obama", dict(url="https://en.wikipedia.org/wiki/Barack_Obama")),
    ("Obama", dict(url="https://en.wikipedia.org/wiki/Barack_Obama")),
    ("Donald Trump", dict(url="https://en.wikipedia.org/wiki/Donald_Trump")),
    ("Trump", dict(url="https://en.wikipedia.org/wiki/Donald_Trump")),
    ("George W. Bush", dict(url="https://en.wikipedia.org/wiki/George_W._Bush")),
    ("George Bush", dict(url="https://en.wikipedia.org/wiki/George_W._Bush")),
    ("Bush", dict(url="https://en.wikipedia.org/wiki/George_W._Bush")),
    ("    Bill        Clinton   ", dict(url="https://en.wikipedia.org/wiki/Bill_Clinton")),
    ("Clinton", dict(url="https://en.wikipedia.org/wiki/Bill_Clinton")),
]

StringGazetteer: document¶

In [46]:
# Document with some text mentioning some of the names in the gazetteer for testing
text = """Barack Obama was the 44th president of the US and he followed George W. Bush and
  was followed by Donald Trump. Before Bush, Bill Clinton was president.
  Also, lets include a sentence about South Korea which is called 대한민국 in Korean.
  And a sentence with the full name of Iran in Farsi: جمهوری اسلامی ایران and also with 
  just the word "Iran" in Farsi: ایران 
  Also barack obama in all lower case and SOUTH KOREA in all upper case
  """
doc0 = Document(text)
doc0
Out[46]:

StringGazetteer: match¶

In [48]:
sgaz1 = StringGazetteer(source=sgazlist1, source_fmt="gazlist")
doc0 = sgaz1(doc0)
doc0
Out[48]:

Document Tokenization¶

  • Stanza and Spacy both produce Tokens
  • Also possible to use NLTKTokenizer: wrap any of the NLTK tokenizers
In [49]:
from gatenlp.processing.tokenizer import NLTKTokenizer
from nltk.tokenize.destructive import NLTKWordTokenizer  # get some tokenizer to use
nltk_tokenizer = NLTKTokenizer(nltk_tokenizer=NLTKWordTokenizer(), token_type="Token")
doc2.annset().clear()  
doc2 = nltk_tokenizer(doc2)

Document Tokenization¶

In [50]:
doc2
Out[50]:

TokenGazetteer: list¶

1) Use a prepared list with already tokenized entries

In [51]:
from gatenlp.processing.gazetteer import TokenGazetteer
gazlist1 = [
    (["Donald", "Trump"], dict(what="person", country="US")),
    (["Boris", "Johnson"], dict(what="person", country="UK")),
    (["Google"], dict(what="company", country="Everywhere, really!"))    
]
tgaz1 = TokenGazetteer(source=gazlist1, source_fmt="gazlist", 
                       annset_name="", outset_name="TGaz1", ann_type="Lookup")

doc2 = tgaz1(doc2)

TokenGazetteer: match¶

In [53]:
doc2 = tgaz1(doc2)
doc2
Out[53]:

TokenGazetteer: listfile¶

2) Load a list from a file, using the Java GATE "def" format (https://gate.ac.uk/userguide/sec:annie:gazetteer):

  • A something.def file contains one line for each list file to use
  • Each somename.lst file contains one line per entry to match, plus arbitrary features

E.g. data/gaz1.def:

persons.lst
companies.lst

data/persons.lst:

Donald Trump    what=person     country=US
Boris Johnson   what=person     country=UK

data/companies.lst:

Google  where=Everywhere, really!

Gazetteer List (GATE def)¶

In [54]:
tgaz2 = TokenGazetteer(
    source="data/gaz1.def", source_fmt="gate-def", annset_name="", outset_name="TGaz2", 
    ann_type="Lookup", source_tokenizer=nltk_tokenizer)
doc2.annset("TGaz2").clear()
doc2 = tgaz2(doc2)

Gazetteer List¶

Result when using the loaded GATE-style gazetteer files:

In [55]:
doc2
Out[55]:

TokenGazetteer¶

Other features:

  • Match each gazetteer token against a feature of a document token (e.g. the lemma) instead of the underlying document text
  • only match within another annotation type (e.g. only in noun phrases)
  • More to come: still WIP!
    • handle space tokens
    • handle split tokens (do not cross them!)

Regexp Annotator¶

  • match multiple regular expressions
  • use macros to build complex REs from simpler parts
  • different strategies for how to handle multiple matches
  • assign annotation features from RE groups
In [56]:
from gatenlp.processing.gazetteer import StringRegexAnnotator

RegexpAnnotator: rules¶

In [57]:
rules = """
year=(19[0-9]{2}|20[0-9]{2})
month=(0[0-9]|10|11|12)
day=([012][0-9]|3[01])

// The ISO date:
|{{year}}-{{month}}-{{day}}
0 => Date  type="iso", year=G1, month=G2, day=G3

# The traditional way of writing a date:
|{{day}}/({{month}})/{{year}}
0 => Date  type="traditional", year=G3, month=G2, day=G1
"""

re_anntr = StringRegexAnnotator(source=rules, source_fmt="string")

RegexpAnnotator: match¶

In [58]:
redoc = Document("""
A document that contains a date here: 2013-01-12 and also here: 14/02/1991. This should not 
get matched: 1833-12-21 and nor should this 45/03/2012 but this should 13/12/2012 and also
this, despite not being a valid data: 31/02/2000
""")
re_anntr(redoc)
redoc
Out[58]:

PAMPAC¶

  • PAttern Matching through PArser Combinators
  • A pattern language for matching annotations and text
  • DSL (domain specific language) implemented as Python classes
  • Purpose: similar to Java GATE JAPE / JAPE Plus, but:
    • does not need a separate language
    • more powerful (e.g. directly match text AND annotations)

PAMPAC - How to use¶

  • define Rules
  • each Rule consists of:
    • a Pattern to match in the document
    • an Action to perform if the pattern matches
  • create a Pampac instance from the rules
  • run the Pampac instance on the document, using a set of annotations and specify the output annotation set

PAMPAC - Example 1¶

Let's create a rule that annotates any Token which is within a PERSON or ORG annotation:

In [59]:
from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq, Or
from gatenlp.pam.matcher import FeatureMatcher, IfNot

r1 = Rule(
    # first the pattern
    Or ( Ann("Token", name="tok").within("ORG"),
         Ann("Token", name="tok").within("PERSON")
       ),
    # then the action for the pattern
    AddAnn(name="tok", type="PersOrOrg")
)
# get the annotations we want to use for matching
anns2match = doc2.annset("Stanza").with_type(["Token", "PERSON", "ORG"])

outset = doc2.annset("Pampac1")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc2, anns2match, outset=outset)
len(outset)
Out[59]:
15

PAMPAC - Example 1¶

In [60]:
doc2
Out[60]:

PAMPAC - Example 2¶

Create a rule that annotates any sequence of two or more Token annotations which have an "upos" feature of "PROPN", separated by at most one other arbitrary token:

In [61]:
from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq
from gatenlp.pam.matcher import FeatureMatcher, IfNot

feat = FeatureMatcher(upos="PROPN")
r1 = Rule(
    # first the pattern
    Seq( Ann("Token", features=feat),
         N( Seq( N(Ann("Token", features=IfNot(feat)), min=0, max=1),
                 Ann("Token", features=feat)),
           min=1, max=99),
         name="seq1"
       ),
    # then the action for the pattern
    AddAnn(name="seq1", type="PROPNSEQ")
)
# get the annotations we want to use for matching
anns2match = doc2.annset("Stanza").with_type("Token")

outset = doc2.annset("Pampac2")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc2, anns2match, outset=outset)
len(outset)
Out[61]:
7

PAMPAC - Example 2¶

Result: found 7 matches and added annotations for them:

In [62]:
doc2
Out[62]:

Processing¶

  • Instance of Annotator: process a document and return it (usually the same document)
    • optionally: return None or a list of documents
  • Pipeline:
    • run several Annotators in Sequence
    • handle None / list of returned documents
    • also handle per-corpus initialization, finishing, return values
  • Executor (work in progress!):
    • Run a pipeline on a corpus or a document source / destination pair
  • Runner:
    • run a pipeline on a corpus or a source/destination pair from the command line
    • Multiprocessing capable
      • run several workers locally or on a cluster (using Ray infrastructure)
      • corpus / source / destination must be multiprocessing capable, duplicated over nodes
      • e.g. DirFilesCorpus, DirFilesSource init: nparts=1, partnr=0

Machine Learning¶

  • Currently Work in Progress
  • Not part of the gatenlp package
  • Planned: for each ML/DL backend a separate package
    • upcoming: gatenlp-ml-huggingface
    • basic functionality working: gatenlp-ml-tner for chunking / NER
      • https://github.com/GateNLP/python-gatenlp-ml-tner/
      • Example: https://github.com/GateNLP/python-gatenlp-ml-tner/tree/main/examples
  • Install ML package into gatenlp environment or install gatenlp into ML package environment
  • Most modern ML systems require GPU / powerful computers

What about Ontologies?¶

  • Java GATE has Ontology-related plugins
    • but very outdated and limited (no support for OWL2)
  • (Currently) no direct support in GateNLP
  • But: Python packages rdflib, Owlready2 and maybe others
  • Finding terms:
    • extract the names from the ontology, process them as documents, build a gazetteer, use the gazetteer to match and annotate with the URI (see the sketch below)
  • match classes, subclasses, etc.: directly use e.g. the Owlready2 API from PAMPAC or in your own annotators
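
A possible sketch of the gazetteer approach (assuming rdflib is installed; labels.ttl is a hypothetical ontology file containing rdfs:label triples): extract the labels, build one StringGazetteer entry per label with the class/instance URI as a feature, then annotate documents with it.

from rdflib import Graph
from rdflib.namespace import RDFS
from gatenlp import Document
from gatenlp.processing.gazetteer import StringGazetteer

g = Graph()
g.parse("labels.ttl", format="turtle")   # hypothetical ontology file with rdfs:label triples
gazlist = [(str(label), dict(uri=str(subj))) for subj, label in g.subject_objects(RDFS.label)]

gaz = StringGazetteer(source=gazlist, source_fmt="gazlist")
doc = Document("Some text mentioning terms from the ontology.")
doc = gaz(doc)   # Lookup annotations now carry a "uri" feature pointing into the ontology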

GATE Worker¶

  • Allows running the Java GATE process from Python
  • API for exchanging documents and performing common Java GATE tasks from Python
  • based on Py4J: ANY Java code can be run from Python
  • Python connects to a Java process, communicates over sockets
    • Option 1: Start GATE GUI, load PythonWorkerLr, then connect a GateNLP GateWorker to it
    • Option 2: Start Java GATE worker using the gatenlp-gate-worker command
    • Option 3: directly start the Java GATE worker when creating the GateNLP GateWorker instance

Let's try Option 3 first: the GATE_HOME environment variable must be set, or you must know the GATE installation directory

GATE Worker¶

In [63]:
from gatenlp.gateworker import GateWorker

gs = GateWorker()
# if GATE_HOME not set use gs = GateWorker(gatehome="/where/Gate/is/Installed")
# if java is not on the PATH use gs = GateWorker(java="/path/to/the/java/binary")
# If port(s) in use e.g.: `ss -ltp  -at dport=:25333` 
Trying to start GATE Worker on port=25333 host=127.0.0.1 log=false keep=false
Process id is 1303916
CREOLE plugin loaded: creole 
CREOLE plugin loaded: Format: Bdoc 1.10
Plugin Python version: 3.0.7 commit: 9adf5ed dirty: false
Lib interaction version: 4.1 commit: 7819f1c dirty: false
Python gatenlp version: 1.0.7 commit: 8c15d82 dirty: false
CREOLE plugin loaded: Python 3.0.7
PythonWorkerRunner.java: starting server with 25333/127.0.0.1/LdhVaxhZGVsS_rbwE78Og6AFs7o/false
In [64]:
# Create a GATE document on the JAVA GATE side and return a handle
gdoc1 = gs.createDocument("An example document mentioning Barack Obama and New York")
# Can call Java API methods on that handle and get/convert the result
print(gdoc1.getClass())
print(gdoc1.getName())
print(gdoc1.getAnnotationSetNames())
class gate.corpora.DocumentImpl
GATE Document_00015
set()

GATE Worker¶

In [65]:
# lets load the prepared ANNIE pipeline on the Java side and process the GATE document with it
gs.loadMavenPlugin("uk.ac.gate.plugins", "annie", "9.0")
gpipe = gs.loadPipelineFromPlugin("uk.ac.gate.plugins", "annie", "/resources/ANNIE_with_defaults.gapp")
gcorp = gs.newCorpus()
gcorp.add(gdoc1)
gpipe.setCorpus(gcorp)
gpipe.execute()
CREOLE plugin loaded: ANNIE 9.0

GATE Worker¶

So far, everything happened on the Java side; now use a GateWorker API method to convert the document into a Python GateNLP document:

In [66]:
pdoc1 = gs.gdoc2pdoc(gdoc1)
pdoc1
Out[66]:

GateWorker¶

  • Stopping: the GateWorker (Java process) can get stopped using gs.close()
  • Will also automatically stop when the Python process ends
In [67]:
gs.close()
Java GatenlpWorker ENDING: 1303916

GateWorker Annotator¶

An annotator to process Python GateNLP documents with a Java GATE pipeline

In [68]:
from gatenlp.gateworker import GateWorkerAnnotator
# Specify a prepared GATE pipeline file to get loaded into Java GATE
# Specify a GateWorker
gw = GateWorker(port=31313)
gs_app = GateWorkerAnnotator(pipeline="data/annie.xgapp", gateworker=gw)
Trying to start GATE Worker on port=31313 host=127.0.0.1 log=false keep=false
Process id is 1303940
CREOLE plugin loaded: creole 
CREOLE plugin loaded: Format: Bdoc 1.10
Plugin Python version: 3.0.7 commit: 9adf5ed dirty: false
Lib interaction version: 4.1 commit: 7819f1c dirty: false
Python gatenlp version: 1.0.7 commit: 8c15d82 dirty: false
CREOLE plugin loaded: Python 3.0.7
PythonWorkerRunner.java: starting server with 31313/127.0.0.1/hLZUII_z566eMAyEUH5pVCZdDig/false
CREOLE plugin loaded: ANNIE 9.0

GateWorkerAnnotator¶

Example, running on a directory corpus:

In [69]:
from gatenlp.processing.executor import SerialCorpusExecutor
dircorpus = DirFilesCorpus("data/dir1", sort=True)
exe = SerialCorpusExecutor(annotator=gs_app, corpus=dircorpus)
exe()
gw.close()

tmpdoc = dircorpus[2]
print(tmpdoc.features)
tmpdoc
Features({'gate.SourceURL': 'created from String', '__idx_139952273354960': 2})
Java GatenlpWorker ENDING: 1303940
Out[69]:

Java GATE Python Plugin¶

  • Java GATE Plugin
  • Documentation: http://gatenlp.github.io/gateplugin-Python/
  • provides a Processing Resource (PR) PythonPr
  • PythonPr: makes it possible to use a Python program to annotate GATE documents
  • Plugin comes with its own copy of the gatenlp package!
    • but can use gatenlp installed separately as well

Requirements¶

  • Python 3.6 or higher installed
  • possibly own Python environment
    • conda create -n gatenlp python=3.9
  • the sortedcontainers package
    • pip install sortedcontainers
  • Optionally other dependencies for using Spacy etc.

Using the Plugin in GATE¶

  • Requires GATE 8.6.1 or later
  • Load from Plugin Manager
    • make sure to use latest version
    • make sure "Load always" is not checked and the plugin is NOT already loaded!
    • make sure "Save session" is not enabled, if necessary restart!
  • Or: will get loaded with a pipeline that uses the plugin automatically

Create new Pipeline¶

  1. Load the Python plugin: check "Load Now" for "Python", then click "Apply All"
  2. Create PR PythonPr: right click "Processing Resource", choose "PythonPr"
  3. Click file dialog button for the pythonProgram init parameter
  4. In the file selection dialog, choose a directory and enter a non-existing name such as test1.py, click Open, then OK
  5. If the specified file does not exist, it is created with template content
  6. Double-click the PR to show it in GATE
  7. Save often!

Edit PythonPr Program¶

  • from the GATE GUI:
    • Save: write the content to the file (no warning if the file was changed with another program!)
    • Save & Use: also check for syntax errors
  • with external editor:
    • changes NOT automatically reflected in GATE Editor!
    • to use a changed file: right click PythonPr and select "Reinitialise"

PythonPr Program Template¶

In [71]:
from gatenlp import Document, AnnotationSet, GateNlpPr, interact

@GateNlpPr
class MyAnnotator:
    # the following method is run on every document, this method must exist:
    def __call__(self, doc, **kwargs):
        pass

    # the start and finish methods are optional, if they exist the start
    # method is called before the first document of a corpus and the finish 
    # method is called after the last document.
    # def start(self, **kwargs):
    #     pass
    # def finish(self, **kwargs):
    #     pass

# THE FOLLOWING MUST BE PRESENT SO THAT GATE CAN COMMUNICATE WITH THE PYTHON PROCESS!
if __name__ == "___main__": # NOTE: changed from __main__ to ___main__ to prevent running in Notebook!
    interact()

PythonPr Program Example¶

In [72]:
from gatenlp import Document, AnnotationSet, GateNlpPr, interact

@GateNlpPr
class MyAnnotator:
    def __init__(self):
        self.n_docs = 0
    def __call__(self, doc, **kwargs):
        self.n_docs += 1
        doc.annset().add(0,3,"SomeType")
        doc.features["docnr"] = self.n_docs
    def start(self, **kwargs):
        print("Processing starting, we got kwargs:", kwargs)
        self.n_docs = 0
    def finish(self, **kwargs):
        print("Processing finished, documents processed: ", self.n_docs)

if __name__ == "___main__":   # NOTE: changed from __main__ to ___main__ to prevent running in Notebook!
    interact()

Create PythonPr Pipeline¶

  • new Application (Pipeline)
  • add PythonPr to the pipeline
  • Review Runtime Parameters
  • Create/load a document
  • Create a corpus for the document
  • Double-click pipeline, choose corpus
  • Run Application
  • See example pipeline pythonpr-example1.xgapp

PythonPr Runtime Parameters¶

  • pythonBinary/pythonBinaryUrl: if python is not on the path or you want to use a specific binary
    • use a specific environment by using the binary from that environment!
  • usePluginGatenlpPackage: the Python plugin contains its own version of Python gatenlp, if false use whatever is installed into the environment instead
  • setsToUse: replace * with a list of annotation set names to avoid transferring lots of existing annotations
  • programParams: send arbitrary parameters to the Python program (as kwargs)
    • can get pre-set if a JSON file pythonscript.py.parms exists
  • configFile: select any file/dir to pass as _config_file parameter to the Python program (as kwarg)
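
Inside the Python program, programParams simply arrive as keyword arguments. A small sketch based on the template above ("outType" is a hypothetical parameter name; the interact() boilerplate from the template is omitted here):

from gatenlp import GateNlpPr

@GateNlpPr
class MyAnnotator:
    def __call__(self, doc, **kwargs):
        # "outType" is a hypothetical key set via the programParams runtime parameter
        out_type = kwargs.get("outType", "Lookup")
        doc.annset().add(0, len(doc.text), out_type)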

How it works¶

  • when the pipeline/controller runs, a separate Python process is started
  • the Python code is loaded
  • communication is done via pipes over stdin/stdout between Java and Python
  • interact() communicates with the Java PythonPr
  • when the pipeline starts, the start method is called, programParams passed on
  • for each document:
    • the document is converted to bdoc json, transferred, converted to GateNLP document and passed to __call__
    • the changes to the document done via the gatenlp API are recorded and sent back to Java
    • PythonPr applies the changes to the document
  • when the pipeline finishes, the finish method is called and any results returned to Java

Multiprocessing¶

  • In GATE, multiprocessing is done via duplication of the pipeline and running pipelines in parallel
  • Done via gcp, easily using gcp-direct.sh
  • PythonPr supports duplication and multiprocessing
  • One parallel Python process for each duplicate
  • number of duplicates and duplicate id passed to each process
  • if there is more than one duplicate, the reduce(resultslist) method is invoked to combine the per-process results (see the sketch below)
  • see example pipeline pythonpr-count-words.xgapp
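
A sketch of how the per-process results could be combined (hypothetical word-count logic, modeled on the template above; finish returns the result of one process, reduce combines the results of all duplicates):

from gatenlp import GateNlpPr, interact

@GateNlpPr
class WordCounter:
    def __init__(self):
        self.n_words = 0
    def __call__(self, doc, **kwargs):
        # hypothetical per-document logic: count whitespace-separated words
        self.n_words += len(doc.text.split())
    def start(self, **kwargs):
        self.n_words = 0
    def finish(self, **kwargs):
        return self.n_words          # per-process result returned to Java
    def reduce(self, resultslist):
        return sum(resultslist)      # combine the results of all duplicates

if __name__ == "___main__":   # as above: ___main__ instead of __main__ to prevent running in the notebook
    interact()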

PythonPr Prepared Pipelines¶

  • Two prepared pipelines: for running Spacy and Stanza
  • Right click Applications - Ready Made Applications - Python
    • python-spacy
    • python-stanford-stanza

What to use when?¶

  • Java GATE "Classic":
    • need existing plugins/pipelines, need GUI editing/annotation
    • within Java app; GCP / Mimir involved
  • Gate Worker from Python:
    • Need Java GATE plugins/pipelines
    • but Python context / application
    • consider running in Java GATE first, process result files with Python GateNLP
  • Python GateNLP:
    • Python context, ML/DNN (Pytorch, Tensorflow), numpy, ...
    • Want to use Stanza/Spacy; GateNLP-only functions

More documentation:

  • Python GateNLP https://gatenlp.github.io/python-gatenlp/
  • Java GATE Python Plugin: http://gatenlp.github.io/gateplugin-Python/
  • Java GATE Format_Bdoc plugin: https://gatenlp.github.io/gateplugin-Format_Bdoc/

Thank You & Have Fun with GATE and GateNLP!¶
