Module gatenlp.serialization.default_htmlloader
Module that implements the default HTML loader
Expand source code
"""
Module that implements the default HTML loader
"""
from gatenlp.document import Document
from gatenlp.urlfileutils import is_url, get_str_from_url
class HtmlLoader:
    """Loader that converts HTML content into a Document with markup annotations."""

    @staticmethod
    def load_rendered(
        clazz,
        from_ext=None,
        from_mem=None,
        parser=None,
        markup_set_name="Original markups",
        process_soup=None,
        offset_mapper=None,
        **kwargs,
    ):
        """Placeholder for loading rendered HTML; not implemented yet.

        All parameters are accepted but none of them is used.

        Args:
            clazz: not used.
            from_ext: not used. (Default value = None)
            from_mem: not used. (Default value = None)
            parser: not used. (Default value = None)
            markup_set_name: not used. (Default value = "Original markups")
            process_soup: not used. (Default value = None)
            offset_mapper: not used. (Default value = None)
            **kwargs: not used.

        Raises:
            NotImplementedError: always. This is a subclass of Exception, so
                callers that catch Exception keep working.
        """
        raise NotImplementedError("Rendered html parser not yet implemented")

    @staticmethod
    def load(
        clazz,
        from_ext=None,
        from_mem=None,
        parser="html.parser",
        markup_set_name="Original markups",
        encoding=None,
        **kwargs,
    ):
        """Load an HTML document from a file, a URL or a string.

        The text of the HTML is concatenated into the document text and every
        tag (including the BeautifulSoup root object) becomes an annotation
        whose type is the tag name and whose features are the tag attributes.

        Args:
            clazz: not used by this method.
            from_ext: file or URL source
            from_mem: string source
            parser: one of "html.parser", "lxml", "lxml-xml", "html5lib"
                (default is "html.parser")
            markup_set_name: the annotation set name for the set to contain
                the HTML annotations (Default value = "Original markups")
            encoding: the encoding to use for reading the file or URL
            **kwargs: accepted but not used by this method.

        Returns:
            the Document created from the HTML content.

        Raises:
            ValueError: if neither from_ext nor from_mem is given.
        """
        # Validate before importing bs4 so the error does not depend on bs4.
        if from_ext is None and from_mem is None:
            raise ValueError("One of from_ext or from_mem must be specified")

        # bs4 is imported inside the method, so it is only needed when called.
        import bs4
        from bs4 import BeautifulSoup

        # NOTE: for now we have a simple heuristic for adding newlines to the
        # text: before and after a block element, a newline is added unless
        # there is already one.
        # NOTE: for now we use multi_valued_attributes=None which prevents
        # attributes of the form "class='val1 val2'" to get converted into
        # features with a list of values.
        isurl, extstr = False, None
        if from_ext is not None:
            isurl, extstr = is_url(from_ext)
            if isurl:
                from_mem = get_str_from_url(extstr, encoding=encoding)

        # Non-empty in-memory content always wins. Empty content is parsed as
        # an empty document when there is no local file to fall back to,
        # instead of trying to open a missing path or a URL.
        parse_mem = from_mem is not None and (
            bool(from_mem) or from_ext is None or isurl
        )
        if parse_mem:
            bs = BeautifulSoup(from_mem, features=parser, multi_valued_attributes=None)
        else:
            with open(extstr, encoding=encoding) as infp:
                bs = BeautifulSoup(infp, features=parser, multi_valued_attributes=None)

        # tags that get a newline before and after them
        newline_tags = {
            "pre",
            "br",
            "p",
            "div",
            "tr",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "li",
            "address",
            "article",
            "aside",
            "blockquote",
            "del",
            "figure",
            "figcaption",
            "footer",
            "header",
            "hr",
            "ins",
            "main",
            "nav",
            "section",
            "summary",
            "input",
            "legend",
            "option",
            "textarea",
            "bdi",
            "bdo",
            "center",
            "code",
            "dfn",
            "menu",
            "dir",
            "caption",
        }
        # tags that are skipped together with everything inside them
        ignored_tags = {"script", "style"}
        # These are NavigableString subclasses, so they must be tested before
        # the generic NavigableString branch.
        skipped_types = (
            bs4.element.Doctype,
            bs4.element.Comment,
            bs4.element.Script,
        )

        parts = []  # text fragments, joined once at the end
        anns = []  # annotation infos in the order the tags were opened
        offset = 0  # length of the text collected so far
        ends_with_newline = False  # whether the collected text ends with "\n"

        def add_text(text):
            """Append text to the document text and update the offset state."""
            nonlocal offset, ends_with_newline
            if text:
                parts.append(text)
                offset += len(text)
                ends_with_newline = text.endswith("\n")

        def walktree(el):
            """Recursively visit el depth first, collecting text and annotations."""
            if isinstance(el, skipped_types):
                return
            if isinstance(el, bs4.element.Tag):
                if el.name in ignored_tags:
                    return
                breaks = el.name in newline_tags
                # newline before the element, outside of its annotation span
                if breaks and not ends_with_newline:
                    add_text("\n")
                ann = {"type": el.name, "features": el.attrs, "start": offset}
                anns.append(ann)
                for child in el.children:
                    walktree(child)
                # newline after the element, inside of its annotation span
                if breaks and not ends_with_newline:
                    add_text("\n")
                ann["end"] = offset
            elif isinstance(el, bs4.element.NavigableString):
                text = str(el)
                # do not pile up consecutive newline-only strings
                if text == "\n" and ends_with_newline:
                    return
                add_text(text)
            else:
                print("WARNING: odd element type", type(el))

        walktree(bs)

        doc = Document("".join(parts))
        annset = doc.annset(markup_set_name)
        for ann in anns:
            annset.add(
                ann["start"],
                ann["end"],
                anntype=ann["type"],
                features=ann["features"],
            )
        return doc
Classes
class HtmlLoader
-
Expand source code
class HtmlLoader:
    """Loader that converts HTML content into a Document with markup annotations."""

    @staticmethod
    def load_rendered(
        clazz,
        from_ext=None,
        from_mem=None,
        parser=None,
        markup_set_name="Original markups",
        process_soup=None,
        offset_mapper=None,
        **kwargs,
    ):
        """Placeholder for loading rendered HTML; not implemented yet.

        All parameters are accepted but none of them is used.

        Args:
            clazz: not used.
            from_ext: not used. (Default value = None)
            from_mem: not used. (Default value = None)
            parser: not used. (Default value = None)
            markup_set_name: not used. (Default value = "Original markups")
            process_soup: not used. (Default value = None)
            offset_mapper: not used. (Default value = None)
            **kwargs: not used.

        Raises:
            NotImplementedError: always. This is a subclass of Exception, so
                callers that catch Exception keep working.
        """
        raise NotImplementedError("Rendered html parser not yet implemented")

    @staticmethod
    def load(
        clazz,
        from_ext=None,
        from_mem=None,
        parser="html.parser",
        markup_set_name="Original markups",
        encoding=None,
        **kwargs,
    ):
        """Load an HTML document from a file, a URL or a string.

        The text of the HTML is concatenated into the document text and every
        tag (including the BeautifulSoup root object) becomes an annotation
        whose type is the tag name and whose features are the tag attributes.

        Args:
            clazz: not used by this method.
            from_ext: file or URL source
            from_mem: string source
            parser: one of "html.parser", "lxml", "lxml-xml", "html5lib"
                (default is "html.parser")
            markup_set_name: the annotation set name for the set to contain
                the HTML annotations (Default value = "Original markups")
            encoding: the encoding to use for reading the file or URL
            **kwargs: accepted but not used by this method.

        Returns:
            the Document created from the HTML content.

        Raises:
            ValueError: if neither from_ext nor from_mem is given.
        """
        # Validate before importing bs4 so the error does not depend on bs4.
        if from_ext is None and from_mem is None:
            raise ValueError("One of from_ext or from_mem must be specified")

        # bs4 is imported inside the method, so it is only needed when called.
        import bs4
        from bs4 import BeautifulSoup

        # NOTE: for now we have a simple heuristic for adding newlines to the
        # text: before and after a block element, a newline is added unless
        # there is already one.
        # NOTE: for now we use multi_valued_attributes=None which prevents
        # attributes of the form "class='val1 val2'" to get converted into
        # features with a list of values.
        isurl, extstr = False, None
        if from_ext is not None:
            isurl, extstr = is_url(from_ext)
            if isurl:
                from_mem = get_str_from_url(extstr, encoding=encoding)

        # Non-empty in-memory content always wins. Empty content is parsed as
        # an empty document when there is no local file to fall back to,
        # instead of trying to open a missing path or a URL.
        parse_mem = from_mem is not None and (
            bool(from_mem) or from_ext is None or isurl
        )
        if parse_mem:
            bs = BeautifulSoup(from_mem, features=parser, multi_valued_attributes=None)
        else:
            with open(extstr, encoding=encoding) as infp:
                bs = BeautifulSoup(infp, features=parser, multi_valued_attributes=None)

        # tags that get a newline before and after them
        newline_tags = {
            "pre",
            "br",
            "p",
            "div",
            "tr",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "li",
            "address",
            "article",
            "aside",
            "blockquote",
            "del",
            "figure",
            "figcaption",
            "footer",
            "header",
            "hr",
            "ins",
            "main",
            "nav",
            "section",
            "summary",
            "input",
            "legend",
            "option",
            "textarea",
            "bdi",
            "bdo",
            "center",
            "code",
            "dfn",
            "menu",
            "dir",
            "caption",
        }
        # tags that are skipped together with everything inside them
        ignored_tags = {"script", "style"}
        # These are NavigableString subclasses, so they must be tested before
        # the generic NavigableString branch.
        skipped_types = (
            bs4.element.Doctype,
            bs4.element.Comment,
            bs4.element.Script,
        )

        parts = []  # text fragments, joined once at the end
        anns = []  # annotation infos in the order the tags were opened
        offset = 0  # length of the text collected so far
        ends_with_newline = False  # whether the collected text ends with "\n"

        def add_text(text):
            """Append text to the document text and update the offset state."""
            nonlocal offset, ends_with_newline
            if text:
                parts.append(text)
                offset += len(text)
                ends_with_newline = text.endswith("\n")

        def walktree(el):
            """Recursively visit el depth first, collecting text and annotations."""
            if isinstance(el, skipped_types):
                return
            if isinstance(el, bs4.element.Tag):
                if el.name in ignored_tags:
                    return
                breaks = el.name in newline_tags
                # newline before the element, outside of its annotation span
                if breaks and not ends_with_newline:
                    add_text("\n")
                ann = {"type": el.name, "features": el.attrs, "start": offset}
                anns.append(ann)
                for child in el.children:
                    walktree(child)
                # newline after the element, inside of its annotation span
                if breaks and not ends_with_newline:
                    add_text("\n")
                ann["end"] = offset
            elif isinstance(el, bs4.element.NavigableString):
                text = str(el)
                # do not pile up consecutive newline-only strings
                if text == "\n" and ends_with_newline:
                    return
                add_text(text)
            else:
                print("WARNING: odd element type", type(el))

        walktree(bs)

        doc = Document("".join(parts))
        annset = doc.annset(markup_set_name)
        for ann in anns:
            annset.add(
                ann["start"],
                ann["end"],
                anntype=ann["type"],
                features=ann["features"],
            )
        return doc
Static methods
def load(clazz, from_ext=None, from_mem=None, parser='html.parser', markup_set_name='Original markups', encoding=None, **kwargs)
-
Load an HTML file.
Args
clazz
- not used by this method
from_ext
- file or URL source
from_mem
- string source
parser
- one of "html.parser", "lxml", "lxml-xml", "html5lib" (default is "html.parser")
markup_set_name
- the annotation set name for the set to contain the HTML annotations (Default value = "Original markups")
encoding
- the encoding to use for reading the file
Expand source code
@staticmethod
def load(
    clazz,
    from_ext=None,
    from_mem=None,
    parser="html.parser",
    markup_set_name="Original markups",
    encoding=None,
    **kwargs,
):
    """Load an HTML document from a file, a URL or a string.

    The text of the HTML is concatenated into the document text and every
    tag (including the BeautifulSoup root object) becomes an annotation
    whose type is the tag name and whose features are the tag attributes.

    Args:
        clazz: not used by this method.
        from_ext: file or URL source
        from_mem: string source
        parser: one of "html.parser", "lxml", "lxml-xml", "html5lib"
            (default is "html.parser")
        markup_set_name: the annotation set name for the set to contain
            the HTML annotations (Default value = "Original markups")
        encoding: the encoding to use for reading the file or URL
        **kwargs: accepted but not used by this method.

    Returns:
        the Document created from the HTML content.

    Raises:
        ValueError: if neither from_ext nor from_mem is given.
    """
    # Validate before importing bs4 so the error does not depend on bs4.
    if from_ext is None and from_mem is None:
        raise ValueError("One of from_ext or from_mem must be specified")

    # bs4 is imported inside the method, so it is only needed when called.
    import bs4
    from bs4 import BeautifulSoup

    # NOTE: for now we have a simple heuristic for adding newlines to the
    # text: before and after a block element, a newline is added unless
    # there is already one.
    # NOTE: for now we use multi_valued_attributes=None which prevents
    # attributes of the form "class='val1 val2'" to get converted into
    # features with a list of values.
    isurl, extstr = False, None
    if from_ext is not None:
        isurl, extstr = is_url(from_ext)
        if isurl:
            from_mem = get_str_from_url(extstr, encoding=encoding)

    # Non-empty in-memory content always wins. Empty content is parsed as
    # an empty document when there is no local file to fall back to,
    # instead of trying to open a missing path or a URL.
    parse_mem = from_mem is not None and (
        bool(from_mem) or from_ext is None or isurl
    )
    if parse_mem:
        bs = BeautifulSoup(from_mem, features=parser, multi_valued_attributes=None)
    else:
        with open(extstr, encoding=encoding) as infp:
            bs = BeautifulSoup(infp, features=parser, multi_valued_attributes=None)

    # tags that get a newline before and after them
    newline_tags = {
        "pre",
        "br",
        "p",
        "div",
        "tr",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "address",
        "article",
        "aside",
        "blockquote",
        "del",
        "figure",
        "figcaption",
        "footer",
        "header",
        "hr",
        "ins",
        "main",
        "nav",
        "section",
        "summary",
        "input",
        "legend",
        "option",
        "textarea",
        "bdi",
        "bdo",
        "center",
        "code",
        "dfn",
        "menu",
        "dir",
        "caption",
    }
    # tags that are skipped together with everything inside them
    ignored_tags = {"script", "style"}
    # These are NavigableString subclasses, so they must be tested before
    # the generic NavigableString branch.
    skipped_types = (
        bs4.element.Doctype,
        bs4.element.Comment,
        bs4.element.Script,
    )

    parts = []  # text fragments, joined once at the end
    anns = []  # annotation infos in the order the tags were opened
    offset = 0  # length of the text collected so far
    ends_with_newline = False  # whether the collected text ends with "\n"

    def add_text(text):
        """Append text to the document text and update the offset state."""
        nonlocal offset, ends_with_newline
        if text:
            parts.append(text)
            offset += len(text)
            ends_with_newline = text.endswith("\n")

    def walktree(el):
        """Recursively visit el depth first, collecting text and annotations."""
        if isinstance(el, skipped_types):
            return
        if isinstance(el, bs4.element.Tag):
            if el.name in ignored_tags:
                return
            breaks = el.name in newline_tags
            # newline before the element, outside of its annotation span
            if breaks and not ends_with_newline:
                add_text("\n")
            ann = {"type": el.name, "features": el.attrs, "start": offset}
            anns.append(ann)
            for child in el.children:
                walktree(child)
            # newline after the element, inside of its annotation span
            if breaks and not ends_with_newline:
                add_text("\n")
            ann["end"] = offset
        elif isinstance(el, bs4.element.NavigableString):
            text = str(el)
            # do not pile up consecutive newline-only strings
            if text == "\n" and ends_with_newline:
                return
            add_text(text)
        else:
            print("WARNING: odd element type", type(el))

    walktree(bs)

    doc = Document("".join(parts))
    annset = doc.annset(markup_set_name)
    for ann in anns:
        annset.add(
            ann["start"],
            ann["end"],
            anntype=ann["type"],
            features=ann["features"],
        )
    return doc
def load_rendered(clazz, from_ext=None, from_mem=None, parser=None, markup_set_name='Original markups', process_soup=None, offset_mapper=None, **kwargs)
-
Args
- clazz:
from_ext
- (Default value = None)
from_mem
- (Default value = None)
parser
- (Default value = None)
markup_set_name
- (Default value = "Original markups")
process_soup
- (Default value = None)
offset_mapper
- (Default value = None)
**kwargs: Returns:
Expand source code
@staticmethod
def load_rendered(
    clazz,
    from_ext=None,
    from_mem=None,
    parser=None,
    markup_set_name="Original markups",
    process_soup=None,
    offset_mapper=None,
    **kwargs,
):
    """Placeholder for loading rendered HTML; not implemented yet.

    All parameters are accepted but none of them is used.

    Args:
        clazz: not used.
        from_ext: not used. (Default value = None)
        from_mem: not used. (Default value = None)
        parser: not used. (Default value = None)
        markup_set_name: not used. (Default value = "Original markups")
        process_soup: not used. (Default value = None)
        offset_mapper: not used. (Default value = None)
        **kwargs: not used.

    Raises:
        NotImplementedError: always. This is a subclass of Exception, so
            callers that catch Exception keep working.
    """
    raise NotImplementedError("Rendered html parser not yet implemented")