import os
from gatenlp import Document
from gatenlp.processing.gazetteer import TokenGazetteer
from gatenlp.processing.tokenizer import NLTKTokenizer

# all the example files will be created in "./tmp"
if not os.path.exists("tmp"):
    # BUG FIX: the original `if` had no body (a syntax error). The comment above
    # says all example files go into "./tmp", so create that directory here.
    os.makedirs("tmp")
# 1) Create a gazetteer from a Python list 

# Gazetteer entries: (string to match, feature dict attached to each match).
# BUG FIX: the original list literal was never closed, which made everything
# after it (the `text = ...` assignment) a syntax error.
gazlist = [
    ("Barack Obama", dict(url="")),
    ("Obama", dict(url="")),
    ("Donald Trump", dict(url="")),
    ("Trump", dict(url="")),
    ("George W. Bush", dict(url="")),
    ("George Bush", dict(url="")),
    ("Bush", dict(url="")),
    ("Bill Clinton", dict(url="")),
    ("Clinton", dict(url="")),
]

# A document whose text mentions several of the gazetteer names
text = """Barack Obama was the 44th president of the US and he followed George W. Bush and
  was followed by Donald Trump. Before Bush, Bill Clinton was president."""

# Tokenize the document: wrap NLTK's word tokenizer in a gatenlp annotator that
# writes "Token" annotations into the default annotation set.
from nltk.tokenize.destructive import NLTKWordTokenizer

tokenizer = NLTKTokenizer(nltk_tokenizer=NLTKWordTokenizer(), out_set="", token_type="Token")
doc = tokenizer(Document(text))

# Tokenize the strings from our gazetteer list as well

def text2tokenstrings(text):
    """Tokenize *text* with the module-level ``tokenizer`` and return the
    list of covered strings of its "Token" annotations, in document order.
    """
    # BUG FIX: the original never ran the tokenizer on tmpdoc, so the
    # annotation set contained no "Token" annotations and this function
    # always returned [] — making every gazetteer entry an empty token list.
    tmpdoc = tokenizer(Document(text))
    tokens = list(tmpdoc.annset().with_type("Token"))
    # doc[annotation] yields the text span covered by the annotation
    return [tmpdoc[tok] for tok in tokens]

# Replace each entry's match string with its list of token strings, so the
# gazetteer can match on token sequences.
gazlist = [(text2tokenstrings(txt), feats) for txt, feats in gazlist]
# Expected result — this was pasted notebook OUTPUT left in the file as a
# dead expression (evaluated and discarded); kept here as a comment instead:
# [(['Barack', 'Obama'], {'url': ''}),
#  (['Obama'], {'url': ''}),
#  (['Donald', 'Trump'], {'url': ''}),
#  (['Trump'], {'url': ''}),
#  (['George', 'W.', 'Bush'], {'url': ''}),
#  (['George', 'Bush'], {'url': ''}),
#  (['Bush'], {'url': ''}),
#  (['Bill', 'Clinton'], {'url': ''}),
#  (['Clinton'], {'url': ''})]
# Create the gazetteer from the tokenized list and apply it to the document;
# matches become "Lookup" annotations in the default annotation set.
gazetteer = TokenGazetteer(
    gazlist,
    fmt="gazlist",
    all=True,
    skip=False,
    outset="",
    outtype="Lookup",
    annset="",
    tokentype="Token",
)
doc = gazetteer(doc)