Module gatenlp.offsetmapper
Module that implements the OffsetMapper class for mapping between Java-style and Python-style string offsets. Java strings are represented as UTF16 while Python strings are represented as Unicode code points, so offsets differ if a Unicode character needs more than one UTF16 code unit.
Expand source code
"""
Module that implements the OffsetMapper class for mapping between Java-style and Python-style string offsets.
Java strings are represented as UTF16 while Python strings are represented as Unicode code points, so offsets
differ if a Unicode character needs more than one UTF16 code unit.
"""
import numbers
OFFSET_TYPE_JAVA = "j"
OFFSET_TYPE_PYTHON = "p"
def _convert_from_table(offsets, from_table=None):
"""
Args:
offsets:
from_table: (Default value = None)
Returns:
"""
if from_table is None:
return offsets
if isinstance(offsets, numbers.Integral):
return int(from_table[offsets])
ret = []
for offset in offsets:
ret.append(int(from_table[offset]))
return ret
class OffsetMapper:
    """
    Map between Java-style offsets (UTF-16 code units) and Python-style offsets
    (Unicode code points) for one fixed text.
    """

    def __init__(self, text: str):
        """
        Calculate the tables for mapping unicode code points to utf16 code units.

        NOTE: currently this optimizes for conversion speed at the cost of memory, with one special case:
        if all offsets turn out to be identical (no character needs two UTF-16 code units), we discard
        the tables and set both table attributes to None.

        Characters that are lone surrogate code points are counted as one UTF-16 code unit.

        Args:
            text: the text as a python string
        """
        # for now, remove dependency on numpy and use simple python lists of integers
        # python2java_list[p] is the java offset at which the character at python offset p starts
        python2java_list = []
        # java2python_list[j] is the python offset of the character that java code unit j belongs to
        java2python_list = []
        cur_java_off = 0
        for i, c in enumerate(text):
            python2java_list.append(cur_java_off)
            # get the java size of the current character: code points above U+FFFF need
            # a surrogate pair, i.e. two UTF-16 code units, everything else needs one
            width = 2 if ord(c) > 0xFFFF else 1
            # i is the current python offset, so we append it as many times as we have width
            java2python_list.extend([i] * width)
            # the next java offset we get by incrementing the java offset by the width of the current char
            cur_java_off += width
        if cur_java_off == len(text):
            # every character has width 1 (or the text is empty): offsets are identical
            self.python2java = None
            self.java2python = None
        else:
            # Add the entries for the end-of-text offsets. The java end offset is the total
            # number of code units, which is also correct when the last character has width 2.
            python2java_list.append(cur_java_off)
            java2python_list.append(len(text))
            self.python2java = python2java_list
            self.java2python = java2python_list

    def convert_to_python(self, offsets):
        """
        Convert one java offset or an iterable of java offsets to python offset/s

        Args:
            offsets: a single offset or an iterable of offsets

        Returns:
            the converted offset or offsets
        """
        return _convert_from_table(offsets, from_table=self.java2python)

    def convert_to_java(self, offsets):
        """
        Convert one python offset or an iterable of python offsets to java offset/s

        Args:
            offsets: a single offset or an iterable of offsets

        Returns:
            the converted offset or offsets
        """
        return _convert_from_table(offsets, from_table=self.python2java)
Classes
class OffsetMapper (text: str)
-
Calculate the tables for mapping unicode code points to utf16 code units.
NOTE: currently this optimizes for conversion speed at the cost of memory, with one special case: if after creating the java2python table we find that all offsets are identical, we discard the tables and just set a flag for that.
Args
text
- the text as a python string
Expand source code
class OffsetMapper:
    """
    Map between Java-style offsets (UTF-16 code units) and Python-style offsets
    (Unicode code points) for one fixed text.
    """

    def __init__(self, text: str):
        """
        Calculate the tables for mapping unicode code points to utf16 code units.

        NOTE: currently this optimizes for conversion speed at the cost of memory, with one special case:
        if all offsets turn out to be identical (no character needs two UTF-16 code units), we discard
        the tables and set both table attributes to None.

        Characters that are lone surrogate code points are counted as one UTF-16 code unit.

        Args:
            text: the text as a python string
        """
        # for now, remove dependency on numpy and use simple python lists of integers
        # python2java_list[p] is the java offset at which the character at python offset p starts
        python2java_list = []
        # java2python_list[j] is the python offset of the character that java code unit j belongs to
        java2python_list = []
        cur_java_off = 0
        for i, c in enumerate(text):
            python2java_list.append(cur_java_off)
            # get the java size of the current character: code points above U+FFFF need
            # a surrogate pair, i.e. two UTF-16 code units, everything else needs one
            width = 2 if ord(c) > 0xFFFF else 1
            # i is the current python offset, so we append it as many times as we have width
            java2python_list.extend([i] * width)
            # the next java offset we get by incrementing the java offset by the width of the current char
            cur_java_off += width
        if cur_java_off == len(text):
            # every character has width 1 (or the text is empty): offsets are identical
            self.python2java = None
            self.java2python = None
        else:
            # Add the entries for the end-of-text offsets. The java end offset is the total
            # number of code units, which is also correct when the last character has width 2.
            python2java_list.append(cur_java_off)
            java2python_list.append(len(text))
            self.python2java = python2java_list
            self.java2python = java2python_list

    def convert_to_python(self, offsets):
        """
        Convert one java offset or an iterable of java offsets to python offset/s

        Args:
            offsets: a single offset or an iterable of offsets

        Returns:
            the converted offset or offsets
        """
        return _convert_from_table(offsets, from_table=self.java2python)

    def convert_to_java(self, offsets):
        """
        Convert one python offset or an iterable of python offsets to java offset/s

        Args:
            offsets: a single offset or an iterable of offsets

        Returns:
            the converted offset or offsets
        """
        return _convert_from_table(offsets, from_table=self.python2java)
Methods
def convert_to_java(self, offsets)
-
Convert one python offset or an iterable of python offsets to java offset/s
Args
offsets
- a single offset or an iterable of offsets
Returns
the converted offset or offsets
Expand source code
def convert_to_java(self, offsets):
    """
    Map python offset/s to the corresponding java offset/s.

    Args:
        offsets: a single offset or an iterable of offsets

    Returns:
        the converted offset or offsets
    """
    # The lookup itself is delegated to the shared table helper.
    table = self.python2java
    return _convert_from_table(offsets, from_table=table)
def convert_to_python(self, offsets)
-
Convert one java offset or an iterable of java offsets to python offset/s
Args
offsets
- a single offset or an iterable of offsets
Returns
the converted offset or offsets
Expand source code
def convert_to_python(self, offsets):
    """
    Map java offset/s to the corresponding python offset/s.

    Args:
        offsets: a single offset or an iterable of offsets

    Returns:
        the converted offset or offsets
    """
    # The lookup itself is delegated to the shared table helper.
    table = self.java2python
    return _convert_from_table(offsets, from_table=table)