Getting per token NER scores

Per-token NER scores cannot be obtained through the default annotators for Stanza and spaCy. Stanza does not provide any probabilities, logits, or scores at all, but for spaCy it is possible to get them by using some of the spaCy components directly.

# defaultdict is used to accumulate the per-token scores further down.
from collections import defaultdict
import gatenlp
# Print the library versions so the recorded outputs can be matched to a release.
print("Using gatenlp version", gatenlp.__version__)
from gatenlp import Document
import spacy
print("Using Spacy version", spacy.__version__)
# gatenlp's spaCy integration: the annotator wrapper and the document converter.
from gatenlp.lib_spacy import AnnSpacy, spacy2gatenlp

Using gatenlp version 1.0.7-dev0


/home/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages/torch/cuda/__init__.py:80: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 9010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  ../c10/cuda/CUDAFunctions.cpp:112.)
  return torch._C._cuda_getDeviceCount() > 0


Using Spacy version 3.2.0
# The English pipeline has to be downloaded before spaCy can load it.
from spacy.cli import download as spacy_download
spacy_download("en_core_web_sm")

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
Requirement already satisfied: spacy<3.3.0,>=3.2.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from en-core-web-sm==3.2.0) (3.2.0)
Requirement already satisfied: jinja2 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.3)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.8)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)
Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.8.2)
Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.13)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.62.3)
Requirement already satisfied: pathy>=0.3.5 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)
Requirement already satisfied: typer<0.5.0,>=0.3.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)
Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)
Requirement already satisfied: packaging>=20.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.3)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.26.0)
Requirement already satisfied: setuptools in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (50.3.0.post20201006)
Requirement already satisfied: numpy>=1.15.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.21.4)
Requirement already satisfied: blis<0.8.0,>=0.4.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.5)
Requirement already satisfied: typing-extensions<4.0.0.0,>=3.7.4 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.10.0.2)
Requirement already satisfied: zipp>=0.5 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from catalogue<2.1.0,>=2.0.6->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.6.0)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)
Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)
Requirement already satisfied: charset-normalizer~=2.0.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2020.6.20)
Requirement already satisfied: idna<4,>=2.5 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.7)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.3)
Requirement already satisfied: MarkupSafe>=2.0 in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.1)
Requirement already satisfied: importlib-metadata in /data/johann/software/anaconda/envs/gatenlp-37/lib/python3.7/site-packages (from click<9.0.0,>=7.1.1->typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.8.2)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
# Fetch an example document from the given URL.
doc = Document.load("https://gatenlp.github.io/python-gatenlp/testdocument2.txt")
# Bare expression as the last line of the cell: presumably here so the
# notebook displays the loaded document.
doc

Test

# Load the English pipeline by its package name.
nlp = spacy.load("en_core_web_sm")
# gatenlp annotator built around the same pipeline.
# NOTE(review): not referenced by any of the other cells shown here -- confirm
# whether it is still needed.
spacy_annotator = AnnSpacy(pipeline=nlp)
# Run the pipeline directly on the document text so that the resulting spaCy
# doc stays available; it is passed to get_scores4token() below.
sdoc = nlp(doc.text)
# Presumably converts the spaCy doc into a gatenlp document -- confirm against
# the gatenlp.lib_spacy documentation.
gdoc1 = spacy2gatenlp(sdoc)
# Bare expression as the last line of the cell: presumably here so the
# notebook displays the converted document.
gdoc1
# based on https://stackoverflow.com/questions/59877735/how-to-get-probability-of-prediction-per-entity-from-spacy-ner-model
def get_scores4token(nlp, spacydoc, beam_width=16, beam_density=0.0001):
    """Accumulate beam-parse scores for every (token index, entity label) pair.

    The "ner" component of the pipeline is asked for a beam parse of the
    document. Every parse returned for the first beam contributes its score
    to each token index covered by one of its entity spans, keyed together
    with the label of that span.

    Args:
        nlp: pipeline object whose ``get_pipe("ner")`` returns the NER
            component.
        spacydoc: the document to parse; it is handed to ``beam_parse`` as a
            single-element batch.
        beam_width: forwarded unchanged to ``beam_parse``.
        beam_density: forwarded unchanged to ``beam_parse``.

    Returns:
        A ``defaultdict(float)`` mapping ``(token_index, label)`` to the sum
        of the scores of all parses in which that token lies inside an
        entity span carrying that label. Keys that were never seen read
        as 0.0.
    """
    ner_pipe = nlp.get_pipe("ner")
    beams = ner_pipe.beam_parse(
        [spacydoc], beam_width=beam_width, beam_density=beam_density
    )
    token_scores = defaultdict(float)
    # Only one document was submitted, so only the first beam is relevant.
    for parse_score, entities in ner_pipe.moves.get_beam_parses(beams[0]):
        for ent_start, ent_end, ent_label in entities:
            # Spans are treated as half-open: ent_end itself is not included.
            for token_idx in range(ent_start, ent_end):
                token_scores[token_idx, ent_label] += parse_score
    return token_scores

def add_scores2token(gdoc, scores, defscore=0.0,
                     classes=("PERSON", "GPE", "LOC", "QUANTITY", "ORG", "TIME", "CARDINAL", "DATE")):
    """Store per-class scores as features on the Token annotations of a document.

    For every annotation of type "Token" in ``gdoc.annset()`` and every class
    label in ``classes``, the feature ``"score-<label>"`` is set to the value
    found in ``scores`` under the key ``(token index, label)``, or to
    ``defscore`` when there is no such key. The token index is read from the
    annotation's ``"_i"`` feature (presumably the spaCy token index written
    by the spaCy-to-gatenlp conversion -- confirm).

    The annotations are modified in place.

    Args:
        gdoc: document whose ``annset().with_type("Token")`` yields the token
            annotations to update.
        scores: mapping from ``(token_index, label)`` to a score; only its
            ``get`` method is used, so a defaultdict is not grown by lookups.
        defscore: value stored when ``scores`` has no entry for a pair.
        classes: iterable of class labels to create features for. It may be
            any iterable, including a one-shot iterator.

    Returns:
        None.

    Raises:
        KeyError: if a Token annotation has no ``"_i"`` feature.
    """
    # Materialise the labels and their feature names once: this keeps a
    # one-shot iterable usable for every token and avoids rebuilding the
    # feature name for each token.
    score_features = [(label, "score-" + label) for label in classes]
    for token in gdoc.annset().with_type("Token"):
        token_index = token.features["_i"]
        for label, feature_name in score_features:
            token.features[feature_name] = scores.get((token_index, label), defscore)
            

# Collect the accumulated beam-parse scores from the spaCy doc, keyed by
# (token index, label) ...
scores = get_scores4token(nlp, sdoc)
# ... and write them onto the Token annotations of the gatenlp document as
# "score-<label>" features.
add_scores2token(gdoc1, scores)
# Bare expression as the last line of the cell: presumably here so the
# notebook displays the document again, now with the score features.
gdoc1