Module gatenlp.pam.pampac.data
Module for PAMPAC data structures.
This defines classes for representing the parser location (Location), a parsing result (Result), and successful and unsuccessful parses (Success, Failure).
Expand source code
"""
Module for PAMPAC data structures.
This defines classes for representing the parser location (Location), a parsing result (Result),
and successful and unsuccessful parses (Success, Failure).
"""
from typing import Union
from collections.abc import Iterable, Sized
from gatenlp import AnnotationSet, Annotation
class Location:
    """
    Position of a parser within the text and within the annotation list.

    A location identifies where a parser will try to match next, i.e. the position right after
    everything the parser has consumed so far. A text offset equal to the length of the text
    represents the EndOfText condition and an annotation index equal to the length of the
    annotation list represents the EndOfAnns condition.
    """

    def __init__(self, text_location=0, ann_location=0):
        """
        Create a parser location.

        Args:
            text_location: the next text offset from which on to parse.
            ann_location: the next annotation index from which on to parse.
        """
        self.ann_location = ann_location
        self.text_location = text_location

    def __repr__(self):
        return "Location({},{})".format(self.text_location, self.ann_location)

    def __str__(self):
        return "Location({},{})".format(self.text_location, self.ann_location)

    def __eq__(self, other):
        # Only another Location can be equal; both coordinates have to agree.
        if isinstance(other, Location):
            same_text = self.text_location == other.text_location
            return same_text and self.ann_location == other.ann_location
        return False
class Result(Iterable, Sized):
    """
    A single parser result.

    A successful parse can have any number of results, which are alternate ways of how the parser
    can match the document. Each result carries an arbitrary number of "matches" (named spans
    where some part of the pattern fits the document).

    Iterating over a result iterates over its matches.
    """

    def __init__(self, matches=None, location=None, span=None):
        """
        Create a parser result.

        Args:
            matches: the matching info associated with the result, this can be a single item or
                a list of items. A single dict is treated as one item.
            location: the location where the result was matched, i.e. the location *before*
                matching was done.
            span: the span representing the start and end text offset for the match
        """
        assert location is not None
        assert span is not None
        if matches is None:
            collected = []
        elif isinstance(matches, Iterable) and not isinstance(matches, dict):
            # any non-dict iterable is taken as a sequence of match infos
            collected = list(matches)
        else:
            # a single match info (a dict or any other non-iterable object)
            collected = [matches]
        self.matches = collected
        self.location = location
        self.span = span

    def anns4matches(self):
        """
        Yields all the annotations, if any, in the results matches.
        """
        candidates = (match.get("ann") for match in self.matches)
        yield from (ann for ann in candidates if ann)

    def matches4name(self, name):
        """
        Return a list of match info dictionaries with the given name.
        """
        return list(filter(lambda match: match.get("name") == name, self.matches))

    def __str__(self):
        span_part = f"Span({self.span.start},{self.span.end})"
        return f"Result(loc={self.location},span={span_part},nmatches={len(self.matches)})"

    def __repr__(self):
        span_part = f"Span({self.span.start},{self.span.end})"
        return f"Result(loc={self.location},span={span_part},matches={self.matches})"

    def __iter__(self):
        return iter([] if self.matches is None else self.matches)

    def __len__(self):
        return 0 if self.matches is None else len(self.matches)
class Failure:
    """
    Represents a parse failure.
    """

    def __init__(
        self,
        message=None,
        parser=None,
        location=None,
        causes=None,
        context=None,
    ):
        """
        Create a Failure instance.

        Args:
            message: the message to describe the parse failure. Defaults to "Parser Error".
            parser (str): the class name of the parser
            location: the location at which the parser failed. If not given, the text offset and
                annotation index are shown as "?".
            causes: another failure instance or a list of other failure instances which
                can be used to describe the failure of nested parsers in more detail
            context: the context at the point of failure. This is stored as a reference so
                the context should not get modified after the failure is constructed.
        """
        self.context = context
        self._parser = parser
        if not message:
            message = "Parser Error"
        self.message = message
        if location:
            self._cur_text = location.text_location
            self._cur_ann = location.ann_location
        else:
            self._cur_text = "?"
            self._cur_ann = "?"
        # a single failure is wrapped in a list so that causes can always be iterated
        if isinstance(causes, Failure):
            self._causes = [causes]
        else:
            self._causes = causes

    def issuccess(self):
        """
        Method for success and failure results which indicates if we have a success or failure.

        Returns:
            False
        """
        return False

    def _get_causes(self):
        """
        Yield the root causes of this failure, i.e. the nested failures which have no causes
        of their own. Yields nothing if this failure has no causes.
        """
        # self._causes is None when the failure was created without causes
        for cause in self._causes or ():
            if cause._causes:
                yield from cause._get_causes()
            else:
                # The root cause since there's no further failures.
                yield cause

    def describe(self, indent=4, level=0):
        """
        Return a string with information about the failure.

        Args:
            indent: number of characters to indent for each recursive failure.
            level: recursive level of failure

        Returns:
            String with information about the failure
        """
        lead = " " * indent * level
        desc = (
            f"{lead}{self._parser} at {self._cur_text}/{self._cur_ann}: "
            f"{self.message}"
        )
        tail = ""
        if self._causes:
            # only the root causes are listed, each indented one level deeper
            tail = f"\n{lead}Caused by:\n" + "\n".join(
                x.describe(indent, level + 1) for x in self._get_causes()
            )
        return desc + tail

    def __str__(self):
        return self.describe()

    def __repr__(self):
        return (
            f"{self.__class__.__name__}({self.message!r}, "
            f"{self._cur_text!r}/{self._cur_ann}, {self._causes!r})"
        )
class Success(Iterable, Sized):
    """
    Represents a parse success as a (possibly empty) list of result elements.

    Each success is a list of result elements, and each result element contains a list
    of matching info for named patterns that match.
    A result represents a fitting pattern at the top/outermost level of a parser.
    A parser that is made of sub parsers and sub-sub-parsers returns one or more matches
    over all the different ways how those sub-parsers can match at a specific location,
    and each result contains a result element for all the named sub- and sub-sub-parsers
    the main parser is made of.
    """

    def __init__(self, results, context):
        """
        Create a Success instance.

        Args:
            results: a result or a list of results which may be empty
            context: the context used when parsing that result. A reference to the context is stored
                so the context may change after the result has been produced if it is used for more
                parsing.
        """
        if results is None:
            self._results = []
        elif isinstance(results, Result):
            # a Result is itself iterable, so it has to be checked before the Iterable case
            self._results = [results]
        elif isinstance(results, Iterable):
            self._results = list(results)
        else:
            self._results = [results]
        self.context = context

    def issuccess(self):
        """
        Method for success and failure results which indicates if we have a success or failure.

        Returns:
            True
        """
        return True

    def pprint(self, file=None):  # pragma: no cover
        """
        Pretty print the success instance to the file or stdout if no file is specified.

        Args:
            file: open file handle for use with print.
        """
        # print() writes to sys.stdout when file is None, so no branching is needed;
        # file must be passed by keyword, otherwise it is printed as an additional value
        for idx, res in enumerate(self._results):
            print(f"Result {idx}, location={res.location}:", file=file)
            for jdx, mtch_ in enumerate(res.matches):
                print(f" {jdx}: {mtch_}", file=file)

    @staticmethod
    def select_result(results, matchtype="first"):
        """
        Return the result described by parameter matchtype.
        If "all" returns the whole list of matches.

        Args:
            results: list of results to select from
            matchtype: one of "first", "shortest", "longest", "all" (None is treated as "first").
                If there is more than one longest or shortest
                result, the first one of those in the list is returned.

        Returns:
            the filtered result or all results

        Raises:
            ValueError: if matchtype is not one of the supported values.
        """
        if matchtype is None:
            matchtype = "first"
        if matchtype == "all":
            return results
        if matchtype == "first":
            return results[0]
        if matchtype == "longest":
            result = results[0]
            for res in results:
                # strict comparison keeps the first of several equally long results
                if res.location.text_location > result.location.text_location:
                    result = res
            return result
        if matchtype == "shortest":
            result = results[0]
            for res in results:
                # strict comparison keeps the first of several equally short results
                if res.location.text_location < result.location.text_location:
                    result = res
            return result
        raise ValueError(f"Not a valid value for matchtype: {matchtype}")

    def result(self, matchtype="first"):
        """
        Return the result described by parameter matchtype. If "all" returns the whole list of matches.

        Args:
            matchtype: one of "first", "shortest", "longest", "all".
                If there is more than one longest or shortest
                result, the first one of those in the list is returned.

        Returns:
            the filtered result or all results
        """
        return Success.select_result(self._results, matchtype)

    def __iter__(self):
        return iter(self._results)

    def __len__(self):
        return len(self._results)

    def __eq__(self, other):
        if not isinstance(other, Success):
            return False
        return self._results == other._results

    def __str__(self):
        return str(self._results)

    def __getitem__(self, item):
        return self._results[item]
class Context:
    """
    Context contains information and refers to information for carrying out the parse.

    A context contains a reference to the document being parsed, the list of annotations to use,
    the start and end text offsets the parsing should be restricted to and the output annotation
    set to use. (A maximum recursion depth and memoization are not implemented yet.)

    The references stored in these fields are not meant to be re-assigned during parsing or
    when Pampac executes rules on a document. However, all the referenced data apart from start
    and end may change.
    """

    def __init__(
        self,
        doc,
        anns,
        start=None,
        end=None,
        outset=None,
    ):
        """
        Initialize a parse context.

        Args:
            doc: the document which should get parsed
            anns: an iterable of annotations to use for the parsing. The annotations are used in the order they
                occur in the iterator (for a set, this is the default order by start offset and annotation id).
                If the order is different from the default order, the result may be unexpected or matching may not
                work depending on the exact patterns used. Annotations which are not fully inside
                the start/end range are dropped.
            start: the starting text offset for the parse, default is 0
            end: the ending text offset for the parse, default is the length of the document text
            outset: an annotation set for where to add any new annotations in an action

        Raises:
            ValueError: if start or end is not a plausible offset for the document.
        """
        self.doc = doc
        self.outset = outset
        # cache for the annotations as a detached immutable set, if needed
        self._annset = None
        doc_length = len(doc.text)
        # make sure the start and end offsets are plausible or set the default to start/end of document
        if start is None:
            self.start = 0
        else:
            if start >= doc_length or start < 0:
                raise ValueError(
                    f"Invalid start offset: {start}, document length is {doc_length}"
                )
            self.start = start
        if end is None:
            self.end = doc_length  # offset after the last text character!
        else:
            # compare against self.start: the start parameter may be None here
            if end <= self.start or end > doc_length:
                raise ValueError(f"Invalid end offset: {end}, start is {self.start}")
            self.end = end
        # make sure all the anns are within the given offset range
        self.anns = [a for a in anns if a.start >= self.start and a.end <= self.end]

    @property
    def annset(self):
        """
        Return the annotations as a set.

        Returns:
            annotations as a detached immutable AnnotationSet
        """
        if self._annset is None:
            self._annset = AnnotationSet.from_anns(self.anns)
        return self._annset

    # the return annotation is quoted so that it is not evaluated at definition time
    def get_ann(self, location) -> "Union[Annotation, None]":
        """
        Return the ann at the given location, or None if there is none (mainly for the end-of-anns index).

        Args:
            location: the location whose annotation index is used

        Returns:
            annotation or None
        """
        if location.ann_location >= len(self.anns):
            return None
        return self.anns[location.ann_location]

    def nextidx4offset(self, location, offset, next_ann=False):
        """
        Return the index of the next annotation that starts at or after the given text offset.
        If no such annotation exists the end of annotations index (equal to length of annotations) is returned.

        Args:
            location: current location, the search starts at the annotation index of this location
                (or at the following index if next_ann is True)
            offset: offset to look for
            next_ann: if True, always finds the NEXT annotation after the one pointed at with the current location.
                If false keeps the current one if it is still the next one.

        Returns:
            annotation index
        """
        first = location.ann_location + 1 if next_ann else location.ann_location
        n_anns = len(self.anns)
        for idx in range(first, n_anns):
            if self.anns[idx].start >= offset:
                return idx
        return n_anns

    def inc_location(self, location, by_offset=None, by_index=None, to_offset=None):
        """
        Return a new location which represents the given location incremented by either the given number of index
        count (usually 1), or by the given offset length. Only one of the by parameters should be specified.

        If the update occurs by offset, then the annotation index is updated to that of the next index with
        a start offset equal or larger than the updated text offset. This may be the end of annotations index.
        If the text offset hits the end of text offset, the annotation index is set to the end of annotations index.

        If the update occurs by index, then the text offset is updated to the end offset of the last
        annotation consumed and the annotation index never exceeds the end of annotations index.
        If the location already is at the end of annotations, it is returned unchanged.

        Args:
            location: the location to increment, it is not modified
            by_offset: the number of text characters to increment the text offset by
            by_index: the number of annotations to increment the index by
            to_offset: if given, the by_ arguments are ignored and instead the offset is set to the given
                offset with the annotation index set to the next annotation at or after that offset

        Returns:
            new location

        Raises:
            TypeError: if none of by_offset, by_index and to_offset is specified.
        """
        newloc = Location(
            text_location=location.text_location, ann_location=location.ann_location
        )
        if to_offset is not None:
            assert to_offset < self.end
            assert to_offset >= self.start
            newloc.text_location = to_offset
            newloc.ann_location = self.nextidx4offset(location, to_offset)
        elif by_index is not None:
            # if we already are at the end of the annotations, just leave everything as it is
            if self.get_ann(location) is None:
                return location
            n_anns = len(self.anns)
            target = location.ann_location + by_index
            # the text offset is the end offset of the last annotation consumed: this is the
            # annotation before the target index, but never before the current annotation and
            # never beyond the last annotation
            last_consumed = min(max(target - 1, location.ann_location), n_anns - 1)
            newloc.text_location = self.anns[last_consumed].end
            # this is now the index of the next ann or the end of anns index
            newloc.ann_location = min(target, n_anns)
        else:
            if by_offset is None:
                raise TypeError(
                    "inc_location requires one of by_offset, by_index or to_offset"
                )
            # update by text offset
            if newloc.text_location + by_offset >= self.end:
                # if we reach the end of the text, update the annotation index to end of annotations as well
                newloc.text_location = self.end
                newloc.ann_location = len(self.anns)
            else:
                # otherwise try to find the next matching annotation
                newloc.text_location += by_offset
                newloc.ann_location = self.nextidx4offset(
                    location, newloc.text_location
                )
                # if we got end of annotations index, we do NOT update the text to end of text!
                # we could still want to match something in the text after the last annotation.
        return newloc

    def update_location_byoffset(self, location):
        """
        Update the passed location so that the annotation index is updated by the text offset: all annotations are
        skipped until the start offset of the annotation is at or past the text offset.

        Args:
            location: the location to update

        Returns:
            a new location with the annotation index updated
        """
        return Location(
            location.text_location,
            self.nextidx4offset(location, location.text_location),
        )

    def update_location_byindex(self, location):
        """
        Update the passed location from the annotation index and make sure it points to the end of the current
        annotation or the end of the document.

        Args:
            location: the location to update

        Returns:
            a new location with the text offset updated
        """
        # >= is used like in get_ann and at_endofanns so that an index past the end of the
        # annotation list is treated as the end of annotations instead of raising an IndexError
        if location.ann_location >= len(self.anns):
            # we already are beyond the last annotation so we set the text offset to beyond the text
            return Location(len(self.doc.text), location.ann_location)
        # set the text location to the end of the current annotation
        return Location(self.anns[location.ann_location].end, location.ann_location)

    def at_endoftext(self, location):
        """
        Returns true if the location represents the end of text location

        Args:
            location: location

        Returns:
            True if we are at end of text
        """
        return location.text_location >= self.end

    def at_endofanns(self, location):
        """
        Returns true if the location represents the end of anns location

        Args:
            location: location

        Returns:
            True if we are at end of anns
        """
        return location.ann_location >= len(self.anns)
Classes
class Context (doc, anns, start=None, end=None, outset=None)
-
Context contains information and refers to information for carrying out the parse.
A context contains a reference to the document being parsed, the list of annotations to use, the start and end text offsets the parsing should be restricted to, and the output annotation set to use. (A maximum recursion depth and a structure for memoization are planned but not yet implemented.)
All these fields are immutable, i.e. the references stored do not usually change during parsing or when Pampac executes rules on a document. However, all the referenced data apart from start and end may change.
Initialize a parse context.
Args
doc
- the document which should get parsed
anns
- an iterable of annotations to use for the parsing. The annotations are used in the order they occur in the iterator (for a set, this is the default order by start offset and annotation id). If the order is different from the default order, the result may be unexpected or matching may not work depending on the exact patterns used.
start
- the starting text offset for the parse
end
- the ending text offset for the parse
outset
- an annotation set for where to add any new annotations in an action
Expand source code
class Context:
    """
    Context contains information and refers to information for carrying out the parse.

    A context contains a reference to the document being parsed, the list of annotations to use,
    the start and end text offsets the parsing should be restricted to and the output annotation
    set to use. (A maximum recursion depth and memoization are not implemented yet.)

    The references stored in these fields are not meant to be re-assigned during parsing or
    when Pampac executes rules on a document. However, all the referenced data apart from start
    and end may change.
    """

    def __init__(
        self,
        doc,
        anns,
        start=None,
        end=None,
        outset=None,
    ):
        """
        Initialize a parse context.

        Args:
            doc: the document which should get parsed
            anns: an iterable of annotations to use for the parsing. The annotations are used in the order they
                occur in the iterator (for a set, this is the default order by start offset and annotation id).
                If the order is different from the default order, the result may be unexpected or matching may not
                work depending on the exact patterns used. Annotations which are not fully inside
                the start/end range are dropped.
            start: the starting text offset for the parse, default is 0
            end: the ending text offset for the parse, default is the length of the document text
            outset: an annotation set for where to add any new annotations in an action

        Raises:
            ValueError: if start or end is not a plausible offset for the document.
        """
        self.doc = doc
        self.outset = outset
        # cache for the annotations as a detached immutable set, if needed
        self._annset = None
        doc_length = len(doc.text)
        # make sure the start and end offsets are plausible or set the default to start/end of document
        if start is None:
            self.start = 0
        else:
            if start >= doc_length or start < 0:
                raise ValueError(
                    f"Invalid start offset: {start}, document length is {doc_length}"
                )
            self.start = start
        if end is None:
            self.end = doc_length  # offset after the last text character!
        else:
            # compare against self.start: the start parameter may be None here
            if end <= self.start or end > doc_length:
                raise ValueError(f"Invalid end offset: {end}, start is {self.start}")
            self.end = end
        # make sure all the anns are within the given offset range
        self.anns = [a for a in anns if a.start >= self.start and a.end <= self.end]

    @property
    def annset(self):
        """
        Return the annotations as a set.

        Returns:
            annotations as a detached immutable AnnotationSet
        """
        if self._annset is None:
            self._annset = AnnotationSet.from_anns(self.anns)
        return self._annset

    # the return annotation is quoted so that it is not evaluated at definition time
    def get_ann(self, location) -> "Union[Annotation, None]":
        """
        Return the ann at the given location, or None if there is none (mainly for the end-of-anns index).

        Args:
            location: the location whose annotation index is used

        Returns:
            annotation or None
        """
        if location.ann_location >= len(self.anns):
            return None
        return self.anns[location.ann_location]

    def nextidx4offset(self, location, offset, next_ann=False):
        """
        Return the index of the next annotation that starts at or after the given text offset.
        If no such annotation exists the end of annotations index (equal to length of annotations) is returned.

        Args:
            location: current location, the search starts at the annotation index of this location
                (or at the following index if next_ann is True)
            offset: offset to look for
            next_ann: if True, always finds the NEXT annotation after the one pointed at with the current location.
                If false keeps the current one if it is still the next one.

        Returns:
            annotation index
        """
        first = location.ann_location + 1 if next_ann else location.ann_location
        n_anns = len(self.anns)
        for idx in range(first, n_anns):
            if self.anns[idx].start >= offset:
                return idx
        return n_anns

    def inc_location(self, location, by_offset=None, by_index=None, to_offset=None):
        """
        Return a new location which represents the given location incremented by either the given number of index
        count (usually 1), or by the given offset length. Only one of the by parameters should be specified.

        If the update occurs by offset, then the annotation index is updated to that of the next index with
        a start offset equal or larger than the updated text offset. This may be the end of annotations index.
        If the text offset hits the end of text offset, the annotation index is set to the end of annotations index.

        If the update occurs by index, then the text offset is updated to the end offset of the last
        annotation consumed and the annotation index never exceeds the end of annotations index.
        If the location already is at the end of annotations, it is returned unchanged.

        Args:
            location: the location to increment, it is not modified
            by_offset: the number of text characters to increment the text offset by
            by_index: the number of annotations to increment the index by
            to_offset: if given, the by_ arguments are ignored and instead the offset is set to the given
                offset with the annotation index set to the next annotation at or after that offset

        Returns:
            new location

        Raises:
            TypeError: if none of by_offset, by_index and to_offset is specified.
        """
        newloc = Location(
            text_location=location.text_location, ann_location=location.ann_location
        )
        if to_offset is not None:
            assert to_offset < self.end
            assert to_offset >= self.start
            newloc.text_location = to_offset
            newloc.ann_location = self.nextidx4offset(location, to_offset)
        elif by_index is not None:
            # if we already are at the end of the annotations, just leave everything as it is
            if self.get_ann(location) is None:
                return location
            n_anns = len(self.anns)
            target = location.ann_location + by_index
            # the text offset is the end offset of the last annotation consumed: this is the
            # annotation before the target index, but never before the current annotation and
            # never beyond the last annotation
            last_consumed = min(max(target - 1, location.ann_location), n_anns - 1)
            newloc.text_location = self.anns[last_consumed].end
            # this is now the index of the next ann or the end of anns index
            newloc.ann_location = min(target, n_anns)
        else:
            if by_offset is None:
                raise TypeError(
                    "inc_location requires one of by_offset, by_index or to_offset"
                )
            # update by text offset
            if newloc.text_location + by_offset >= self.end:
                # if we reach the end of the text, update the annotation index to end of annotations as well
                newloc.text_location = self.end
                newloc.ann_location = len(self.anns)
            else:
                # otherwise try to find the next matching annotation
                newloc.text_location += by_offset
                newloc.ann_location = self.nextidx4offset(
                    location, newloc.text_location
                )
                # if we got end of annotations index, we do NOT update the text to end of text!
                # we could still want to match something in the text after the last annotation.
        return newloc

    def update_location_byoffset(self, location):
        """
        Update the passed location so that the annotation index is updated by the text offset: all annotations are
        skipped until the start offset of the annotation is at or past the text offset.

        Args:
            location: the location to update

        Returns:
            a new location with the annotation index updated
        """
        return Location(
            location.text_location,
            self.nextidx4offset(location, location.text_location),
        )

    def update_location_byindex(self, location):
        """
        Update the passed location from the annotation index and make sure it points to the end of the current
        annotation or the end of the document.

        Args:
            location: the location to update

        Returns:
            a new location with the text offset updated
        """
        # >= is used like in get_ann and at_endofanns so that an index past the end of the
        # annotation list is treated as the end of annotations instead of raising an IndexError
        if location.ann_location >= len(self.anns):
            # we already are beyond the last annotation so we set the text offset to beyond the text
            return Location(len(self.doc.text), location.ann_location)
        # set the text location to the end of the current annotation
        return Location(self.anns[location.ann_location].end, location.ann_location)

    def at_endoftext(self, location):
        """
        Returns true if the location represents the end of text location

        Args:
            location: location

        Returns:
            True if we are at end of text
        """
        return location.text_location >= self.end

    def at_endofanns(self, location):
        """
        Returns true if the location represents the end of anns location

        Args:
            location: location

        Returns:
            True if we are at end of anns
        """
        return location.ann_location >= len(self.anns)
Instance variables
var annset
-
Return the annotations as a set.
Returns
annotations as a detached immutable AnnotationSet
Expand source code
@property
def annset(self):
    """
    Return the annotations as a set.

    The set is created from the annotations on first access and cached afterwards.

    Returns:
        annotations as a detached immutable AnnotationSet
    """
    cached = self._annset
    if cached is None:
        cached = AnnotationSet.from_anns(self.anns)
        self._annset = cached
    return cached
Methods
def at_endofanns(self, location)
-
Returns true if the location represents the end of anns location
Args
location
- location
Returns
True if we are at end of anns
Expand source code
def at_endofanns(self, location):
    """
    Check whether the location is at the end-of-annotations position.

    Args:
        location: location

    Returns:
        True if the annotation index of the location is at or beyond the number of annotations
    """
    ann_count = len(self.anns)
    return location.ann_location >= ann_count
def at_endoftext(self, location)
-
Returns true if the location represents the end of text location
Args
location
- location
Returns
True if we are at end of text
Expand source code
def at_endoftext(self, location):
    """
    Check whether the location is at the end-of-text position.

    Args:
        location: location

    Returns:
        True if the text offset of the location is at or beyond the end offset of the context
    """
    end_offset = self.end
    return location.text_location >= end_offset
def get_ann(self, location) ‑> Optional[Annotation]
-
Return the ann at the given location, or None if there is none (mainly for the end-of-anns index).
Returns
annotation or None
Expand source code
def get_ann(self, location) -> Union[Annotation, None]:
    """
    Look up the annotation the location points at.

    Args:
        location: the location whose annotation index is used

    Returns:
        the annotation at the annotation index of the location, or None if the index is at or
        beyond the end of the annotation list (mainly for the end-of-anns index)
    """
    idx = location.ann_location
    return self.anns[idx] if idx < len(self.anns) else None
def inc_location(self, location, by_offset=None, by_index=None, to_offset=None)
-
Return a new location which represents the given location incremented by either the given number of index count (usually 1), or by the given offset length. Only one of the by parameters should be specified.
If the update occurs by offset, then the annotation index is updated to that of the next index with a start offset equal or larger than the updated text offset. This may be the end of annotations index. If the text offset hits the end of text offset, the annotation index is set to the end of annotations index.
If the update occurs by index, then the text offset is updated to the offset corresponding to the end offset of the annotation, if there is one.
Args
- location:
by_offset
- the number of text characters to increment the text offset by
by_index
- the number of annotations to increment the index by
to_offset
- if given, the by_ arguments are ignored and instead the offset is set to the given offset with the annotation index set to the next annotation at or after that offset
Returns
new location
Expand source code
def inc_location(self, location, by_offset=None, by_index=None, to_offset=None):
    """
    Return a new location which represents the given location incremented by either the given number of index
    count (usually 1), or by the given offset length. Only one of the by parameters should be specified.

    If the update occurs by offset, then the annotation index is updated to that of the next index with
    a start offset equal or larger than the updated text offset. This may be the end of annotations index.
    If the text offset hits the end of text offset, the annotation index is set to the end of annotations index.

    If the update occurs by index, then the text offset is updated to the end offset of the last
    annotation consumed and the annotation index never exceeds the end of annotations index.
    If the location already is at the end of annotations, it is returned unchanged.

    Args:
        location: the location to increment, it is not modified
        by_offset: the number of text characters to increment the text offset by
        by_index: the number of annotations to increment the index by
        to_offset: if given, the by_ arguments are ignored and instead the offset is set to the given
            offset with the annotation index set to the next annotation at or after that offset

    Returns:
        new location

    Raises:
        TypeError: if none of by_offset, by_index and to_offset is specified.
    """
    newloc = Location(
        text_location=location.text_location, ann_location=location.ann_location
    )
    if to_offset is not None:
        assert to_offset < self.end
        assert to_offset >= self.start
        newloc.text_location = to_offset
        newloc.ann_location = self.nextidx4offset(location, to_offset)
    elif by_index is not None:
        # if we already are at the end of the annotations, just leave everything as it is
        if self.get_ann(location) is None:
            return location
        n_anns = len(self.anns)
        target = location.ann_location + by_index
        # the text offset is the end offset of the last annotation consumed: this is the
        # annotation before the target index, but never before the current annotation and
        # never beyond the last annotation
        last_consumed = min(max(target - 1, location.ann_location), n_anns - 1)
        newloc.text_location = self.anns[last_consumed].end
        # this is now the index of the next ann or the end of anns index
        newloc.ann_location = min(target, n_anns)
    else:
        if by_offset is None:
            raise TypeError(
                "inc_location requires one of by_offset, by_index or to_offset"
            )
        # update by text offset
        if newloc.text_location + by_offset >= self.end:
            # if we reach the end of the text, update the annotation index to end of annotations as well
            newloc.text_location = self.end
            newloc.ann_location = len(self.anns)
        else:
            # otherwise try to find the next matching annotation
            newloc.text_location += by_offset
            newloc.ann_location = self.nextidx4offset(
                location, newloc.text_location
            )
            # if we got end of annotations index, we do NOT update the text to end of text!
            # we could still want to match something in the text after the last annotation.
    return newloc
def nextidx4offset(self, location, offset, next_ann=False)
-
Return the index of the next annotation that starts at or after the given text offset. If no such annotation exists the end of annotations index (equal to length of annotations) is returned.
Args
location
- current location; the search starts at the annotation index of this location, or at the following index if next_ann is True
offset
- offset to look for
next_ann
- if True, always finds the NEXT annotation after the one pointed at with the current location. If false keeps the current one if it is still the next one.
Returns
annotation index
Expand source code
def nextidx4offset(self, location, offset, next_ann=False):
    """
    Find the index of the first annotation that starts at or after the given text offset.

    Args:
        location: current location, the search starts at the annotation index of this location
            (or at the following index if next_ann is True)
        offset: offset to look for
        next_ann: if True, always finds the NEXT annotation after the one pointed at with the
            current location. If false keeps the current one if it is still the next one.

    Returns:
        annotation index, or the end of annotations index (equal to the number of annotations)
        if no such annotation exists
    """
    first = location.ann_location + 1 if next_ann else location.ann_location
    n_anns = len(self.anns)
    for candidate in range(first, n_anns):
        if self.anns[candidate].start >= offset:
            return candidate
    return n_anns
def update_location_byindex(self, location)
-
Update the passed location from the annotation index and make sure it points to the end of the current annotation or the end of the document.
Args
location
- the location to update
Returns
a new location with the text offset updated
Expand source code
def update_location_byindex(self, location):
    """
    Derive the text offset of a location from its annotation index.

    Args:
        location: the location whose annotation index is used

    Returns:
        a new location with the same annotation index and a text offset that is either the end
        offset of the annotation at that index or, if the index equals the number of
        annotations, the length of the document text
    """
    ann_idx = location.ann_location
    if ann_idx != len(self.anns):
        # the index points at an annotation: continue right after that annotation
        return Location(self.anns[ann_idx].end, ann_idx)
    # the index is the end of annotations index: move the text offset to the end of the text
    return Location(len(self.doc.text), ann_idx)
def update_location_byoffset(self, location)
-
Update the passed location so that the annotation index is updated by the text offset: all annotations are skipped until the start offset of the annotation is at or past the text offset.
Args
location
- the location to update
Returns
a new location with the annotation index updated
Expand source code
def update_location_byoffset(self, location):
    """
    Derive the annotation index of a location from its text offset.

    Starting at the annotation index of the location, annotations are skipped until one is
    found whose start offset is at or past the text offset of the location.

    Args:
        location: the location whose text offset is used

    Returns:
        a new location with the same text offset and the index of the first such annotation,
        or the number of annotations if there is none
    """
    offset = location.text_location
    n_anns = len(self.anns)
    # first index at or after the current one whose annotation does not start before the offset
    found = next(
        (
            idx
            for idx in range(location.ann_location, n_anns)
            if self.anns[idx].start >= offset
        ),
        n_anns,
    )
    return Location(offset, found)
class Failure (message=None, parser=None, location=None, causes=None, context=None)
-
Represents a parse failure.
Create a Failure instance.
Args
message
- the message to describe the parse failure.
parser
:str
- the class name of the parser
location
- the location at which the parser failed.
causes
- another failure instance or a list of other failure instances which can be used to describe the failure of nested parsers in more detail
context
- the context at the point of failure. This is stored as a reference so the context should not get modified after the failure is constructed.
Expand source code
class Failure:
    """
    Represents a parse failure, optionally linked to the failures that caused it.
    """

    def __init__(self, message=None, parser=None, location=None, causes=None, context=None):
        """
        Create a Failure instance.

        Args:
            message: the message to describe the parse failure, "Parser Error" is used if
                no (or an empty) message is given.
            parser (str): the class name of the parser
            location: the location at which the parser failed. If missing, text offset and
                annotation index are shown as "?".
            causes: another failure instance or a list of other failure instances which
                can be used to describe the failure of nested parsers in more detail.
                A single failure instance is wrapped into a list.
            context: the context at the point of failure. This is stored as a reference
                so the context should not get modified after the failure is constructed.
        """
        self.context = context
        self._parser = parser
        self.message = message if message else "Parser Error"
        if location:
            self._cur_text, self._cur_ann = (
                location.text_location,
                location.ann_location,
            )
        else:
            self._cur_text = self._cur_ann = "?"
        self._causes = [causes] if isinstance(causes, Failure) else causes

    def issuccess(self):
        """
        Tell success and failure objects apart.

        Returns:
            False
        """
        return False

    def _get_causes(self):
        # Walk the tree of causes and yield only the leaves, i.e. the failures which
        # do not have any causes of their own.
        for failure in self._causes:
            if failure._causes:
                yield from failure._get_causes()
            else:
                yield failure

    def describe(self, indent=4, level=0):
        """
        Return a string with information about the failure. If the failure has causes, the
        root causes (failures without further causes) are listed one level deeper.

        Args:
            indent: number of characters to indent for each recursive failure.
            level: recursive level of failure

        Returns:
            String with information about the failure
        """
        pad = " " * indent * level
        headline = (
            f"{pad}{self._parser} at {self._cur_text}/{self._cur_ann}: "
            f"{self.message}"
        )
        if not self._causes:
            return headline
        nested = "\n".join(
            root.describe(indent, level + 1) for root in self._get_causes()
        )
        return f"{headline}\n{pad}Caused by:\n{nested}"

    def __str__(self):
        return self.describe()

    def __repr__(self):
        return (
            f"{self.__class__.__name__}({self.message!r}, "
            f"{self._cur_text!r}/{self._cur_ann}, {self._causes!r})"
        )
Methods
def describe(self, indent=4, level=0)
-
Return a string with information about the failure.
Args
indent
- number of characters to indent for each recursive failure.
level
- recursive level of failure
Returns
String with information about the failure
Expand source code
def describe(self, indent=4, level=0):
    """
    Return a string with information about the failure. If the failure has causes, the
    failures yielded by _get_causes() are described one level deeper below a
    "Caused by:" line.

    Args:
        indent: number of characters to indent for each recursive failure.
        level: recursive level of failure

    Returns:
        String with information about the failure
    """
    pad = " " * indent * level
    headline = (
        f"{pad}{self._parser} at {self._cur_text}/{self._cur_ann}: "
        f"{self.message}"
    )
    if not self._causes:
        return headline
    nested = "\n".join(
        root.describe(indent, level + 1) for root in self._get_causes()
    )
    return f"{headline}\n{pad}Caused by:\n{nested}"
def issuccess(self)
-
Method for success and failure results which indicates if we have a success or failure.
Returns
False
Expand source code
def issuccess(self):
    """
    Tell success and failure objects apart: this is the failure variant.

    Returns:
        False
    """
    return False
class Location (text_location=0, ann_location=0)
-
A Location represents the next location in the text and annotation list where a parser will try to match, i.e. the location after everything that has been consumed by the parser so far.
The text offset equal to the length of the text represents the EndOfText condition and the annotation index equal to the length of the annotation list represents the EndOfAnns condition.
Create a parser location.
Args
text_location
- the next text offset from which on to parse.
ann_location
- the next annotation index from which on to parse.
Expand source code
class Location:
    """
    A Location represents the next location in the text and annotation list where a parser
    will try to match, i.e. the location after everything that has been consumed by the parser
    so far.

    The text offset equal to the length of the text represents the EndOfText condition and the
    annotation index equal to the length of the annotation list represents the EndOfAnns
    condition.
    """

    def __init__(self, text_location=0, ann_location=0):
        """
        Create a parser location.

        Args:
            text_location: the next text offset from which on to parse.
            ann_location: the next annotation index from which on to parse.
        """
        self.text_location = text_location
        self.ann_location = ann_location

    def __repr__(self):
        return f"Location({self.text_location},{self.ann_location})"

    def __str__(self):
        return f"Location({self.text_location},{self.ann_location})"

    def __eq__(self, other):
        # Two locations are equal if both the text offset and the annotation index agree;
        # anything that is not a Location never compares equal.
        if isinstance(other, Location):
            same_text = self.text_location == other.text_location
            return same_text and self.ann_location == other.ann_location
        return False
class Result (matches=None, location=None, span=None)
-
Represents an individual parser result. A successful parse can have any number of parser results which are alternate ways of how the parser can match the document. Each result can have an arbitrary number of "matches" (named spans where some part of the pattern fits the document). A result is an iterable of matches.
Create a parser result.
Args
matches
- the matching info associated with the result, this can be a single item or a list of items.
location
- the location where the result was matched, i.e. the location before matching was done.
span
- the span representing the start and end text offset for the match
Expand source code
class Result(Iterable, Sized):
    """
    Represents an individual parser result: a list of "matches" (match info items for the
    parts of the pattern that fit the document) together with a location and a span.

    A result is an iterable of its matches and its length is the number of matches.
    """

    def __init__(self, matches=None, location=None, span=None):
        """
        Create a parser result.

        Args:
            matches: the matching info associated with the result. None gives an empty list,
                a dict or a non-iterable item is wrapped into a one-element list, any other
                iterable is copied into a list.
            location: the location where the result was matched, i.e. the
                location *before* matching was done. Must not be None.
            span: the span representing the start and end text offset for the match.
                Must not be None.
        """
        assert location is not None
        assert span is not None
        if matches is None:
            collected = []
        elif isinstance(matches, dict) or not isinstance(matches, Iterable):
            # a single match info item (a dict is iterable but counts as one item)
            collected = [matches]
        else:
            collected = list(matches)
        self.matches = collected
        self.location = location
        self.span = span

    def anns4matches(self):
        """
        Yields the value stored under "ann" for every match that has a truthy one.
        """
        for info in self.matches:
            ann = info.get("ann")
            if not ann:
                continue
            yield ann

    def matches4name(self, name):
        """
        Return a list of the match info dictionaries whose "name" entry equals the given name.
        """
        selected = [info for info in self.matches if info.get("name") == name]
        return selected

    def __str__(self):
        return (
            f"Result(loc={self.location},"
            f"span=Span({self.span.start},{self.span.end}),"
            f"nmatches={len(self.matches)})"
        )

    def __repr__(self):
        return (
            f"Result(loc={self.location},"
            f"span=Span({self.span.start},{self.span.end}),"
            f"matches={self.matches})"
        )

    def __iter__(self):
        # tolerate the matches attribute having been set to None from outside
        return iter(self.matches if self.matches is not None else [])

    def __len__(self):
        return len(self.matches) if self.matches is not None else 0
Ancestors
- collections.abc.Iterable
- collections.abc.Sized
Methods
def anns4matches(self)
-
Yields all the annotations, if any, in the results matches.
Expand source code
def anns4matches(self):
    """
    Yields the value stored under "ann" for every match that has a truthy one.
    """
    for info in self.matches:
        ann = info.get("ann")
        if not ann:
            continue
        yield ann
def matches4name(self, name)
-
Return a list of match info dictionaries with the given name.
Expand source code
def matches4name(self, name):
    """
    Return a list of the match info dictionaries whose "name" entry equals the given name.
    """
    selected = [info for info in self.matches if info.get("name") == name]
    return selected
class Success (results, context)
-
Represents a parse success as a (possibly empty) list of result elements.
Each success is a list of result elements, and each result element contains a list of matching info for named patterns that match. A result represents a fitting pattern at the top/outermost level of a parser. A parser that is made of sub-parsers and sub-sub-parsers returns one or more matches over all the different ways in which those sub-parsers can match at a specific location, and each result contains a result element for all the named sub- and sub-sub-parsers the main parser is made of.
Create a Success instance.
Args
results
- a result or a list of results which may be empty
context
- the context used when parsing that result. A reference to the context is stored so the context may change after the result has been produced if it is used for more parsing.
Expand source code
class Success(Iterable, Sized):
    """
    Represents a parse success as a (possibly empty) list of result elements.

    Each success is a list of result elements, and each result element contains a list of
    matching info for named patterns that match. Iterating, indexing and len() operate on
    that list of results.
    """

    def __init__(self, results, context):
        """
        Create a Success instance.

        Args:
            results: a result or a list of results which may be empty. None gives an empty
                list, a single Result or a non-iterable item is wrapped into a list, any
                other iterable is copied into a list.
            context: the context used when parsing that result. A reference to the context
                is stored so the context may change after the result has been produced if
                it is used for more parsing.
        """
        if results is None:
            self._results = []
        elif isinstance(results, Result):
            # a Result is itself iterable, so it has to be checked before Iterable
            self._results = [results]
        elif isinstance(results, Iterable):
            self._results = list(results)
        else:
            self._results = [results]
        self.context = context

    def issuccess(self):
        """
        Method for success and failure results which indicates if we have a success or failure.

        Returns:
            True
        """
        return True

    def pprint(self, file=None):  # pragma: no cover
        """
        Pretty print the success instance to the file or stdout if no file is specified.

        One header line is written per result, followed by one line per match of that result.

        Args:
            file: open file handle for use with print.
        """
        # print() falls back to sys.stdout when file is None, so no branching is needed.
        # The file must be passed by keyword: passed positionally it would be printed
        # as a second value to stdout instead of being used as the output stream.
        for idx, res in enumerate(self._results):
            print(f"Result {idx}, location={res.location}:", file=file)
            for jdx, mtch_ in enumerate(res.matches):
                print(f" {jdx}: {mtch_}", file=file)

    @staticmethod
    def select_result(results, matchtype="first"):
        """
        Return the result described by parameter matchtype. If "all" returns the whole list
        of matches.

        Args:
            results: list of results to select from. For any matchtype other than "all" the
                list must not be empty, since its first element is accessed.
            matchtype: one of "first", "shortest", "longest", "all" (None is treated as
                "first"). Longest and shortest compare the text offset of the result
                locations. If there is more than one longest or shortest result, the first
                one of those in the list is returned.

        Returns:
            the filtered result or all results

        Raises:
            Exception: if matchtype is not one of the supported values
        """
        if matchtype is None:
            matchtype = "first"
        if matchtype == "all":
            return results
        elif matchtype == "first":
            return results[0]
        elif matchtype == "longest":
            result = results[0]
            loc = result.location
            for res in results:
                # strict comparison keeps the first of several equally long results
                if res.location.text_location > loc.text_location:
                    loc = res.location
                    result = res
            return result
        elif matchtype == "shortest":
            result = results[0]
            loc = result.location
            for res in results:
                # strict comparison keeps the first of several equally short results
                if res.location.text_location < loc.text_location:
                    loc = res.location
                    result = res
            return result
        else:
            raise Exception(f"Not a valid value for matchtype: {matchtype}")

    def result(self, matchtype="first"):
        """
        Return the result described by parameter matchtype. If "all" returns the whole list
        of matches.

        Args:
            matchtype: one of "first", "shortest", "longest", "all". If there is more than
                one longest or shortest result, the first one of those in the list is returned.

        Returns:
            the filtered result or all results
        """
        return Success.select_result(self._results, matchtype)

    def __iter__(self):
        return iter(self._results)

    def __len__(self):
        return len(self._results)

    def __eq__(self, other):
        if not isinstance(other, Success):
            return False
        return self._results == other._results

    def __str__(self):
        return str(self._results)

    def __getitem__(self, item):
        return self._results[item]
Ancestors
- collections.abc.Iterable
- collections.abc.Sized
Static methods
def select_result(results, matchtype='first')
-
Return the result described by parameter matchtype.
If "all" returns the whole list of matches.
Args
results
- list of results to select from
matchtype
- one of "first", "shortest", "longest", "all". If there is more than one longest or shortest result, the first one of those in the list is returned.
Returns
the filtered result or all results
Expand source code
@staticmethod
def select_result(results, matchtype="first"):
    """
    Pick one result (or all of them) from a list of results.

    Args:
        results: list of results to select from. For any valid matchtype other than "all"
            the first element of the list is accessed.
        matchtype: one of "first", "shortest", "longest", "all"; None is treated as "first".
            Longest and shortest compare the text offset of the result locations. If there
            is more than one longest or shortest result, the first one of those in the list
            is returned.

    Returns:
        the selected result, or the unchanged list of results for "all"
    """
    mode = "first" if matchtype is None else matchtype
    if mode == "all":
        return results
    if mode == "first":
        return results[0]
    if mode not in ("longest", "shortest"):
        raise Exception(f"Not a valid value for matchtype: {matchtype}")
    want_longest = mode == "longest"
    best = results[0]
    for candidate in results:
        cur = candidate.location.text_location
        ref = best.location.text_location
        # strict comparison: on ties the earliest result in the list is kept
        if (cur > ref) if want_longest else (cur < ref):
            best = candidate
    return best
Methods
def issuccess(self)
-
Method for success and failure results which indicates if we have a success or failure.
Returns
True
Expand source code
def issuccess(self):
    """
    Tell success and failure objects apart: this is the success variant.

    Returns:
        True
    """
    return True
def pprint(self, file=None)
-
Pretty print the success instance to the file or stdout if no file is specified.
Args
file
- open file handle for use with print.
Expand source code
def pprint(self, file=None):  # pragma: no cover
    """
    Pretty print the success instance to the file or stdout if no file is specified.

    One header line is written per result, followed by one line per match of that result.

    Args:
        file: open file handle for use with print.
    """
    # print() falls back to sys.stdout when file is None, so no branching is needed.
    # The file must be passed by keyword: passed positionally it would be printed
    # as a second value to stdout instead of being used as the output stream.
    for idx, res in enumerate(self._results):
        print(f"Result {idx}, location={res.location}:", file=file)
        for jdx, mtch_ in enumerate(res.matches):
            print(f" {jdx}: {mtch_}", file=file)
def result(self, matchtype='first')
-
Return the result described by parameter matchtype. If "all" returns the whole list of matches.
Args
matchtype
- one of "first", "shortest", "longest", "all". If there is more than one longest or shortest result, the first one of those in the list is returned.
Returns
the filtered result or all results
Expand source code
def result(self, matchtype="first"):
    """
    Select from the results of this success by delegating to Success.select_result.

    Args:
        matchtype: one of "first", "shortest", "longest", "all". If there is more than one
            longest or shortest result, the first one of those in the list is returned.

    Returns:
        the selected result, or the whole list of results for "all"
    """
    candidates = self._results
    return Success.select_result(candidates, matchtype)