# Gazetteers

import os
from gatenlp import Document
from gatenlp.processing.gazetteer import TokenGazetteer
from gatenlp.processing.tokenizer import NLTKTokenizer

# All the example files will be created in "./tmp".
# makedirs(..., exist_ok=True) creates the directory only when it is missing, in a
# single call, so there is no window between "check" and "create" in which another
# process could create it and make the mkdir fail.
os.makedirs("tmp", exist_ok=True)

# 1) Create a gazetteer from a Python list
#
# Every entry is a (name, features) pair. Several spellings of the same person
# carry the same "url" feature value, but each entry gets its own dict.

gazlist = [
    ("Barack Obama", {"url": "https://en.wikipedia.org/wiki/Barack_Obama"}),
    ("Obama", {"url": "https://en.wikipedia.org/wiki/Barack_Obama"}),
    ("Donald Trump", {"url": "https://en.wikipedia.org/wiki/Donald_Trump"}),
    ("Trump", {"url": "https://en.wikipedia.org/wiki/Donald_Trump"}),
    ("George W. Bush", {"url": "https://en.wikipedia.org/wiki/George_W._Bush"}),
    ("George Bush", {"url": "https://en.wikipedia.org/wiki/George_W._Bush"}),
    ("Bush", {"url": "https://en.wikipedia.org/wiki/George_W._Bush"}),
    ("Bill Clinton", {"url": "https://en.wikipedia.org/wiki/Bill_Clinton"}),
    ("Clinton", {"url": "https://en.wikipedia.org/wiki/Bill_Clinton"}),
]

# Document with some text mentioning some of the names
text = """Barack Obama was the 44th president of the US and he followed George W. Bush and
  was followed by Donald Trump. Before Bush, Bill Clinton was president."""
doc = Document(text)
# Bare expression: presumably a leftover from a notebook cell, where it displays the document.
doc
# Tokenize the document, let's use an NLTK tokenizer
from nltk.tokenize.destructive import NLTKWordTokenizer

# Wrap the NLTK word tokenizer so it can be called on a Document; it is configured
# with token_type="Token" and out_set="".
# NOTE(review): out_set="" is presumably the default annotation set - confirm against the gatenlp docs.
tokenizer = NLTKTokenizer(nltk_tokenizer=NLTKWordTokenizer(), out_set="", token_type="Token")
# Calling the tokenizer on the document returns a document, which is rebound to doc.
doc = tokenizer(doc)
doc

# Tokenize the strings from our gazetteer list as well

def text2tokenstrings(text):
    """Tokenize ``text`` and return its tokens as a list of strings.

    A scratch Document is created from ``text`` and passed through the
    module-level ``tokenizer``. The result contains one ``doc[annotation]``
    lookup for every annotation of type "Token" in the set returned by
    ``annset()``, in that set's iteration order.
    """
    scratch = Document(text)
    # Return value is deliberately ignored; the annotations are read from scratch below.
    tokenizer(scratch)
    token_anns = scratch.annset().with_type("Token")
    return [scratch[ann] for ann in token_anns]

# Replace the string of every entry by its list of token strings; the features
# dict of each entry is kept as is.
# Note: gazlist is rebound here, so the original (string, features) pairs are no
# longer reachable under this name afterwards.
gazlist = [(text2tokenstrings(txt), feats) for txt, feats in gazlist]
# Bare expression: presumably a notebook leftover that displays the converted list.
gazlist
    
[(['Barack', 'Obama'], {'url': 'https://en.wikipedia.org/wiki/Barack_Obama'}),
 (['Obama'], {'url': 'https://en.wikipedia.org/wiki/Barack_Obama'}),
 (['Donald', 'Trump'], {'url': 'https://en.wikipedia.org/wiki/Donald_Trump'}),
 (['Trump'], {'url': 'https://en.wikipedia.org/wiki/Donald_Trump'}),
 (['George', 'W.', 'Bush'],
  {'url': 'https://en.wikipedia.org/wiki/George_W._Bush'}),
 (['George', 'Bush'], {'url': 'https://en.wikipedia.org/wiki/George_W._Bush'}),
 (['Bush'], {'url': 'https://en.wikipedia.org/wiki/George_W._Bush'}),
 (['Bill', 'Clinton'], {'url': 'https://en.wikipedia.org/wiki/Bill_Clinton'}),
 (['Clinton'], {'url': 'https://en.wikipedia.org/wiki/Bill_Clinton'})]
# Create the gazetteer and apply it to the document
#
# The gazetteer is built from the tokenized entries in gazlist (fmt="gazlist").
# NOTE(review): annset=""/tokentype="Token" presumably name the annotations created by
# the tokenizer above, and outset=""/outtype="Lookup" where matches are written -
# confirm these and the meaning of all/skip against the TokenGazetteer documentation.

gazetteer = TokenGazetteer(gazlist, fmt="gazlist", all=True, skip=False, outset="", outtype="Lookup",
                          annset="", tokentype="Token")

# Calling the gazetteer on the document returns a document, which is rebound to doc.
doc = gazetteer(doc)
doc